Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMHIP.hpp
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_HIP_HPP
2#define PROTEUS_CORE_LLVM_HIP_HPP
3
4#include <llvm/Bitcode/BitcodeWriter.h>
5#include <llvm/CodeGen/MachineModuleInfo.h>
6#include <llvm/CodeGen/ParallelCG.h>
7#include <llvm/IR/DiagnosticPrinter.h>
8#include <llvm/IR/Function.h>
9#include <llvm/IR/LegacyPassManager.h>
10#include <llvm/IR/Module.h>
11#include <llvm/IR/Verifier.h>
12#include <llvm/LTO/LTO.h>
13#include <llvm/Support/CodeGen.h>
14#include <llvm/Support/FileSystem.h>
15#include <llvm/Support/MemoryBuffer.h>
16#include <llvm/Support/Path.h>
17#include <llvm/Support/Signals.h>
18#include <llvm/Support/TargetSelect.h>
19#include <llvm/Support/WithColor.h>
20#include <llvm/Target/TargetMachine.h>
21#include <llvm/Transforms/IPO/ThinLTOBitcodeWriter.h>
22#include <llvm/Transforms/Utils/SplitModule.h>
23
24#if LLVM_VERSION_MAJOR >= 18
25#include <lld/Common/Driver.h>
26LLD_HAS_DRIVER(elf)
27#endif
28
29#include "proteus/Debug.h"
30#include "proteus/Error.h"
31#include "proteus/Logger.hpp"
33#include "proteus/Utils.h"
34#include "proteus/UtilsHIP.h"
35
36namespace proteus {
37
38using namespace llvm;
39
40namespace detail {
41
42inline const SmallVector<StringRef> &gridDimXFnName() {
43 static SmallVector<StringRef> Names = {
44 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
45 "llvm.amdgcn.num.workgroups.x", "_ZL20__hip_get_grid_dim_xv"};
46 return Names;
47}
48
49inline const SmallVector<StringRef> &gridDimYFnName() {
50 static SmallVector<StringRef> Names = {
51 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
52 "llvm.amdgcn.num.workgroups.y", "_ZL20__hip_get_grid_dim_yv"};
53 return Names;
54}
55
56inline const SmallVector<StringRef> &gridDimZFnName() {
57 static SmallVector<StringRef> Names = {
58 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
59 "llvm.amdgcn.num.workgroups.z", "_ZL20__hip_get_grid_dim_zv"};
60 return Names;
61}
62
63inline const SmallVector<StringRef> &blockDimXFnName() {
64 static SmallVector<StringRef> Names = {
65 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
66 "llvm.amdgcn.workgroup.size.x", "_ZL21__hip_get_block_dim_xv"};
67 return Names;
68}
69
70inline const SmallVector<StringRef> &blockDimYFnName() {
71 static SmallVector<StringRef> Names = {
72 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
73 "llvm.amdgcn.workgroup.size.y", "_ZL21__hip_get_block_dim_yv"};
74 return Names;
75}
76
77inline const SmallVector<StringRef> &blockDimZFnName() {
78 static SmallVector<StringRef> Names = {
79 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
80 "llvm.amdgcn.workgroup.size.z", "_ZL21__hip_get_block_dim_zv"};
81 return Names;
82}
83
84inline const SmallVector<StringRef> &blockIdxXFnName() {
85 static SmallVector<StringRef> Names = {
86 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
87 "llvm.amdgcn.workgroup.id.x"};
88 return Names;
89};
90
91inline const SmallVector<StringRef> &blockIdxYFnName() {
92 static SmallVector<StringRef> Names = {
93 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
94 "llvm.amdgcn.workgroup.id.y"};
95 return Names;
96};
97
98inline const SmallVector<StringRef> &blockIdxZFnName() {
99 static SmallVector<StringRef> Names = {
100 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
101 "llvm.amdgcn.workgroup.id.z"};
102 return Names;
103}
104
105inline const SmallVector<StringRef> &threadIdxXFnName() {
106 static SmallVector<StringRef> Names = {
107 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
108 "llvm.amdgcn.workitem.id.x"};
109 return Names;
110};
111
112inline const SmallVector<StringRef> &threadIdxYFnName() {
113 static SmallVector<StringRef> Names = {
114 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
115 "llvm.amdgcn.workitem.id.y"};
116 return Names;
117};
118
119inline const SmallVector<StringRef> &threadIdxZFnName() {
120 static SmallVector<StringRef> Names = {
121 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
122 "llvm.amdgcn.workitem.id.z"};
123 return Names;
124};
125
126#if LLVM_VERSION_MAJOR >= 18
127inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
128codegenSerial(Module &M, StringRef DeviceArch,
129 [[maybe_unused]] char OptLevel = '3', int CodegenOptLevel = 3) {
130 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
131
132 auto ExpectedTM =
133 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
134 if (!ExpectedTM)
135 PROTEUS_FATAL_ERROR(toString(ExpectedTM.takeError()));
136
137 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
138 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
139
140 legacy::PassManager PM;
141 PM.add(new TargetLibraryInfoWrapperPass(TLII));
142 MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(
143 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
144
145 SmallVector<char, 4096> ObjectCode;
146 raw_svector_ostream OS(ObjectCode);
147 auto ExpectedF = sys::fs::TempFile::create("object-%%%%%%.o");
148 if (auto E = ExpectedF.takeError())
149 PROTEUS_FATAL_ERROR("Error creating object tmp file " +
150 toString(std::move(E)));
151 auto ObjectFile = std::move(*ExpectedF);
152 auto FileStream = std::make_unique<CachedFileStream>(
153 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD, false));
154 TM->addPassesToEmitFile(PM, *FileStream->OS, nullptr,
155 CodeGenFileType::ObjectFile,
156 /* DisableVerify */ true, MMIWP);
157
158 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
159 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
160 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
161
162 PM.run(M);
163
164 return ObjectFiles;
165}
166
167inline void runPreLinkPipeline(Module &M, StringRef DeviceArch,
168 unsigned OptLevel, unsigned CodegenOptLevel) {
169 Timer T;
170 auto ExpectedTM =
171 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
172 if (!ExpectedTM)
173 PROTEUS_FATAL_ERROR(toString(ExpectedTM.takeError()));
174 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
175
176 PassBuilder PB(TM.get());
177 LoopAnalysisManager LAM;
178 FunctionAnalysisManager FAM;
179 CGSCCAnalysisManager CGAM;
180 ModuleAnalysisManager MAM;
181
182 PB.registerModuleAnalyses(MAM);
183 PB.registerCGSCCAnalyses(CGAM);
184 PB.registerFunctionAnalyses(FAM);
185 PB.registerLoopAnalyses(LAM);
186 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
187
188 ModulePassManager MPM;
189 std::optional<OptimizationLevel> OL = std::nullopt;
190 switch (OptLevel) {
191 case 0:
192 OL = OptimizationLevel::O0;
193 break;
194 case 1:
195 OL = OptimizationLevel::O1;
196 break;
197 case 2:
198 OL = OptimizationLevel::O2;
199 break;
200 case 3:
201 OL = OptimizationLevel::O3;
202 break;
203 default:
204 OL = OptimizationLevel();
205 Logger::outs("proteus")
206 << "Unknown optlevel " << OptLevel << " fallback to default "
207 << OL.value().getSpeedupLevel() << "\n";
208 }
209 MPM = PB.buildThinLTOPreLinkDefaultPipeline(OL.value());
210 MPM.run(M, MAM);
212 << __FUNCTION__ << " " << T.elapsed() << " ms\n");
213}
214
215inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
216codegenParallel(Module &M, StringRef DeviceArch,
217 [[maybe_unused]] char OptLevel = '3', int CodegenOptLevel = 3) {
218 auto TMFactory = [&]() {
219 auto TMExpected =
220 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
221 if (!TMExpected)
222 PROTEUS_FATAL_ERROR(toString(TMExpected.takeError()));
223
224 return std::move(*TMExpected);
225 };
226
227 const size_t NumShards = std::min(
228 M.size(),
229 static_cast<size_t>(
230 llvm::heavyweight_hardware_concurrency().compute_thread_count()));
231
232 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
233
234 SmallVector<SmallString<0>> Objects{NumShards};
235 SmallVector<std::unique_ptr<raw_svector_ostream>> OwnedObjectsOS;
236 SmallVector<raw_pwrite_stream *> ObjectsOS;
237
238 for (size_t I = 0; I < NumShards; ++I) {
239 OwnedObjectsOS.push_back(std::make_unique<raw_svector_ostream>(Objects[I]));
240 ObjectsOS.push_back(OwnedObjectsOS.back().get());
241 }
242
243 splitCodeGen(M, ObjectsOS, {}, TMFactory);
244
245 for (unsigned I = 0; I < NumShards; ++I) {
246 PROTEUS_DBG(Logger::logs("proteus") << "Shard #" << I << " object size = "
247 << Objects[I].size() << "\n");
248 auto ExpectedF =
249 sys::fs::TempFile::create("shard." + std::to_string(I) + "-%%%%%%%.o");
250 if (auto E = ExpectedF.takeError())
251 PROTEUS_FATAL_ERROR("Error creating tmp file " + toString(std::move(E)));
252 saveToFile(ExpectedF->TmpName, Objects[I]);
253 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
254 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
255 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
256 }
257
258 return ObjectFiles;
259}
260
261inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
262codegenParallelThinLTO(Module &M, StringRef DeviceArch,
263 unsigned int OptLevel = 3, int CodegenOptLevel = 3) {
264 const size_t NumShards = std::min(
265 M.size(),
266 static_cast<size_t>(
267 llvm::heavyweight_hardware_concurrency().compute_thread_count()));
268
269 SmallVector<SmallString<0>> Bitcodes{NumShards};
270 SmallVector<std::unique_ptr<raw_svector_ostream>> OwnedBitcodesOS;
271 SmallVector<raw_pwrite_stream *> BitcodesOS;
272 for (unsigned int I = 0; I < NumShards; ++I) {
273 OwnedBitcodesOS.push_back(
274 std::make_unique<raw_svector_ostream>(Bitcodes[I]));
275 BitcodesOS.push_back(OwnedBitcodesOS.back().get());
276 }
277
278 // Running the prelink pipeline is needed for AMDGPU lowering.
279 runPreLinkPipeline(M, DeviceArch, OptLevel, CodegenOptLevel);
280
281 // Split the kernel module to separate bitcodes for thinlto code generation.
282 Timer T;
283 size_t PartIdx = 0;
284 SplitModule(
285 M, BitcodesOS.size(),
286 [&PartIdx, &BitcodesOS](std::unique_ptr<Module> MPart) {
287#if PROTEUS_ENABLE_DEBUG
288 if (verifyModule(*MPart, &errs()))
289 PROTEUS_FATAL_ERROR("Broken module found, JIT compilation aborted!");
290#endif
291 PassBuilder PB;
292
293 LoopAnalysisManager LAM;
294 FunctionAnalysisManager FAM;
295 CGSCCAnalysisManager CGAM;
296 ModuleAnalysisManager MAM;
297
298 PB.registerModuleAnalyses(MAM);
299 PB.registerCGSCCAnalyses(CGAM);
300 PB.registerFunctionAnalyses(FAM);
301 PB.registerLoopAnalyses(LAM);
302 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
303
304 ModulePassManager MPM;
305 MPM.addPass(ThinLTOBitcodeWriterPass(*BitcodesOS[PartIdx], nullptr));
306 MPM.run(*MPart, MAM);
307 PartIdx++;
308 },
309 false);
311 << "SplitModule " << T.elapsed() << " ms\n");
312
313 std::atomic<bool> LTOError = false;
314 auto DiagnosticHandler = [&](const DiagnosticInfo &DI) {
315 std::string ErrStorage;
316 raw_string_ostream OS(ErrStorage);
317 DiagnosticPrinterRawOStream DP(OS);
318 DI.print(DP);
319
320 switch (DI.getSeverity()) {
321 case DS_Error:
322 WithColor::error(errs(), "[proteus codegen]") << ErrStorage << "\n";
323 LTOError = true;
324 break;
325 case DS_Warning:
326 WithColor::warning(errs(), "[proteus codegen]") << ErrStorage << "\n";
327 break;
328 case DS_Note:
329 WithColor::note(errs(), "[proteus codegen]") << ErrStorage << "\n";
330 break;
331 case DS_Remark:
332 WithColor::remark(errs()) << ErrStorage << "\n";
333 break;
334 }
335 };
336
337 lto::Config Conf;
338 Conf.CPU = DeviceArch;
339 // Use default machine attributes.
340 Conf.MAttrs = {};
341 Conf.UseDefaultPipeline = false;
342 Conf.DisableVerify = true;
343 Conf.TimeTraceEnabled = false;
344 Conf.DebugPassManager = false;
345 Conf.VerifyEach = false;
346 Conf.DiagHandler = DiagnosticHandler;
347 Conf.OptLevel = OptLevel;
348 Conf.CGOptLevel = static_cast<CodeGenOptLevel>(CodegenOptLevel);
349
350 // Create the backend for multi-threaded, parallel processing/
351 lto::ThinBackend Backend = lto::createInProcessThinBackend(
352 llvm::heavyweight_hardware_concurrency(NumShards));
353 auto LTOBackend = lto::LTO(std::move(Conf), Backend);
354 size_t Idx = 0;
355 BumpPtrAllocator Alloc;
356 StringSaver Identifiers(Alloc);
357 std::set<std::string> PrevailingSymbols;
358 for (auto &BitcodeInput : Bitcodes) {
359 StringRef Identifier =
360 Identifiers.save((std::to_string(Idx) + ".shard.bc"));
361 Idx++;
362 Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
363 llvm::lto::InputFile::create(MemoryBufferRef{
364 StringRef{BitcodeInput.data(), BitcodeInput.size()}, Identifier});
365 if (auto E = BitcodeFileOrErr.takeError())
366 PROTEUS_FATAL_ERROR("Error");
367
368 // Save the input file and the buffer associated with its memory.
369 const auto Symbols = (*BitcodeFileOrErr)->symbols();
370 SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
371 size_t SymbolIdx = 0;
372 for (auto &Sym : Symbols) {
373 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
374 SymbolIdx++;
375
376 // All defined symbols are prevailing.
377 Res.Prevailing = !Sym.isUndefined() &&
378 PrevailingSymbols.insert(Sym.getName().str()).second;
379
380 Res.VisibleToRegularObj =
381 Res.Prevailing &&
382 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
383 !Sym.canBeOmittedFromSymbolTable();
384
385 Res.ExportDynamic =
386 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
387 (!Sym.canBeOmittedFromSymbolTable());
388
389 Res.FinalDefinitionInLinkageUnit =
390 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
391 (!Sym.isUndefined() && !Sym.isCommon());
392
393 // Device linking does not support linker redefined symbols (e.g. --wrap).
394 Res.LinkerRedefined = false;
395
396#if PROTEUS_ENABLE_DEBUG
397 auto PrintSymbol = [](const lto::InputFile::Symbol &Sym,
398 lto::SymbolResolution &Res) {
399 auto &OutStream = Logger::logs("proteus");
400 OutStream << "Vis: ";
401 switch (Sym.getVisibility()) {
402 case GlobalValue::HiddenVisibility:
403 OutStream << 'H';
404 break;
405 case GlobalValue::ProtectedVisibility:
406 OutStream << 'P';
407 break;
408 case GlobalValue::DefaultVisibility:
409 OutStream << 'D';
410 break;
411 }
412
413 OutStream << " Sym: ";
414 auto PrintBool = [&](char C, bool B) { OutStream << (B ? C : '-'); };
415 PrintBool('U', Sym.isUndefined());
416 PrintBool('C', Sym.isCommon());
417 PrintBool('W', Sym.isWeak());
418 PrintBool('I', Sym.isIndirect());
419 PrintBool('O', Sym.canBeOmittedFromSymbolTable());
420 PrintBool('T', Sym.isTLS());
421 PrintBool('X', Sym.isExecutable());
422 OutStream << ' ' << Sym.getName();
423 OutStream << "| P " << Res.Prevailing;
424 OutStream << " V " << Res.VisibleToRegularObj;
425 OutStream << " E " << Res.ExportDynamic;
426 OutStream << " F " << Res.FinalDefinitionInLinkageUnit;
427 OutStream << "\n";
428 };
429 PrintSymbol(Sym, Res);
430#endif
431 }
432
433 // Add the bitcode file with its resolved symbols to the LTO job.
434 if (Error Err = LTOBackend.add(std::move(*BitcodeFileOrErr), Resolutions))
435 PROTEUS_FATAL_ERROR("Error adding file to backend");
436 }
437
438 // Run the LTO job to compile the bitcode.
439 size_t MaxTasks = LTOBackend.getMaxTasks();
440 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles{MaxTasks};
441 auto AddStream =
442 [&](size_t Task,
443 const Twine & /*ModuleName*/) -> std::unique_ptr<CachedFileStream> {
444 std::string TaskStr = Task ? "." + std::to_string(Task) : "";
445 auto ExpectedF =
446 sys::fs::TempFile::create("lto.shard" + TaskStr + "-%%%%%%%.o");
447 if (auto E = ExpectedF.takeError())
448 PROTEUS_FATAL_ERROR("Error creating tmp file " + toString(std::move(E)));
449 ObjectFiles[Task] =
450 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
451 auto Ret = std::make_unique<CachedFileStream>(
452 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD, false));
453 if (!Ret)
454 PROTEUS_FATAL_ERROR("Error creating CachedFileStream");
455 return Ret;
456 };
457
458 if (Error E = LTOBackend.run(AddStream))
459 PROTEUS_FATAL_ERROR("Error: " + toString(std::move(E)));
460
461 if (LTOError)
463 createStringError(inconvertibleErrorCode(),
464 "Errors encountered inside the LTO pipeline.")));
465
466 return ObjectFiles;
467}
468#endif
469
470inline std::unique_ptr<MemoryBuffer> codegenRTC(Module &M,
471 StringRef DeviceArch) {
472 char *BinOut;
473 size_t BinSize;
474
475 SmallString<4096> ModuleBuf;
476 raw_svector_ostream ModuleBufOS(ModuleBuf);
477 WriteBitcodeToFile(M, ModuleBufOS);
478
479 hiprtcLinkState HipLinkStatePtr;
480
481 // NOTE: This code is an example of passing custom, AMD-specific
482 // options to the compiler/linker.
483 // NOTE: Unrolling can have a dramatic (time-consuming) effect on JIT
484 // compilation time and on the resulting optimization, better or worse
485 // depending on code specifics.
486 std::string MArchOpt = ("-march=" + DeviceArch).str();
487 const char *OptArgs[] = {"-mllvm", "-unroll-threshold=1000",
488 MArchOpt.c_str()};
489 std::vector<hiprtcJIT_option> JITOptions = {
490 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
491 size_t OptArgsSize = 3;
492 const void *JITOptionsValues[] = {(void *)OptArgs, (void *)(OptArgsSize)};
493 proteusHiprtcErrCheck(hiprtcLinkCreate(JITOptions.size(), JITOptions.data(),
494 (void **)JITOptionsValues,
495 &HipLinkStatePtr));
496 // NOTE: the following version of te code does not set options.
497 // proteusHiprtcErrCheck(hiprtcLinkCreate(0, nullptr, nullptr,
498 // &hip_link_state_ptr));
499
500 proteusHiprtcErrCheck(hiprtcLinkAddData(
501 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE, (void *)ModuleBuf.data(),
502 ModuleBuf.size(), "", 0, nullptr, nullptr));
504 hiprtcLinkComplete(HipLinkStatePtr, (void **)&BinOut, &BinSize));
505
506 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
507}
508
509} // namespace detail
510
511inline void setLaunchBoundsForKernel(Module & /*M*/, Function &F,
512 [[maybe_unused]] size_t GridSize,
513 int BlockSize) {
514 // TODO: fix calculation of launch bounds.
515 // TODO: find maximum (hardcoded 1024) from device info.
516 // TODO: Setting as 1, BlockSize to replicate launch bounds settings
517 // Does setting it as BlockSize, BlockSize help?
518 // Setting the attribute override any previous setting.
519 F.addFnAttr("amdgpu-flat-work-group-size",
520 "1," + std::to_string(std::min(1024, BlockSize)));
521 // TODO: find warp size (hardcoded 64) from device info.
522 // int WavesPerEU = (GridSize * BlockSize) / 64 / 110 / 4 / 2;
523 [[maybe_unused]] int WavesPerEU = 0;
524 // F->addFnAttr("amdgpu-waves-per-eu", std::to_string(WavesPerEU));
525 PROTEUS_DBG(Logger::logs("proteus")
526 << "BlockSize " << BlockSize << " GridSize " << GridSize
527 << " => Set Wokgroup size " << BlockSize
528 << " WavesPerEU (unused) " << WavesPerEU << "\n");
529}
530
531inline std::unique_ptr<MemoryBuffer>
532codegenObject(Module &M, StringRef DeviceArch,
533 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
535 assert(GlobalLinkedBinaries.empty() &&
536 "Expected empty linked binaries for HIP");
537 Timer T;
538 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
539 switch (CGOption) {
540 case CodegenOption::RTC: {
541 auto Ret = detail::codegenRTC(M, DeviceArch);
543 << "Codegen RTC " << T.elapsed() << " ms\n");
544 return Ret;
545 }
546#if LLVM_VERSION_MAJOR >= 18
548 ObjectFiles = detail::codegenSerial(M, DeviceArch);
549 break;
551 ObjectFiles = detail::codegenParallel(M, DeviceArch);
552 break;
554 ObjectFiles = detail::codegenParallelThinLTO(M, DeviceArch);
555 break;
556#endif
557 default:
558 PROTEUS_FATAL_ERROR("Unknown Codegen Option");
559 }
560
561 if (ObjectFiles.empty())
562 PROTEUS_FATAL_ERROR("Expected non-empty vector of object files");
563
564 // TODO: make it work for LLVM < 18 or drop claimed support.
565#if LLVM_VERSION_MAJOR >= 18
566 auto ExpectedF = sys::fs::TempFile::create("proteus-jit-%%%%%%%.o");
567 if (auto E = ExpectedF.takeError())
568 PROTEUS_FATAL_ERROR("Error creating shared object file " +
569 toString(std::move(E)));
570
571 auto SharedObject = std::move(*ExpectedF);
572
573 std::vector<const char *> Args{"ld.lld", "--no-undefined", "-shared", "-o",
574 SharedObject.TmpName.c_str()};
575 for (auto &File : ObjectFiles) {
576 if (!File)
577 continue;
578 Args.push_back(File->TmpName.c_str());
579 }
580
581#if PROTEUS_ENABLE_DEBUG
582 for (auto &Arg : Args) {
583 Logger::logs("proteus") << Arg << " ";
584 }
585 Logger::logs("proteus") << "\n";
586#endif
587
589 << "Codegen object " << toString(CGOption) << "["
590 << ObjectFiles.size() << "] " << T.elapsed() << " ms\n");
591
592 T.reset();
593 // The LLD linker interface is not thread-safe, so we use a mutex.
594 static std::mutex Mutex;
595 {
596 std::lock_guard LockGuard{Mutex};
597 lld::Result S = lld::lldMain(Args, llvm::outs(), llvm::errs(),
598 {{lld::Gnu, &lld::elf::link}});
599 if (S.retCode)
600 PROTEUS_FATAL_ERROR("Error: lld failed");
601 }
602
603 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
604 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
605 if (!Buffer)
606 PROTEUS_FATAL_ERROR("Error reading file: " + Buffer.getError().message());
607
608 // Remove temporary files.
609 for (auto &File : ObjectFiles) {
610 if (!File)
611 continue;
612 if (auto E = File->discard())
613 PROTEUS_FATAL_ERROR("Error removing object tmp file " +
614 toString(std::move(E)));
615 }
616 if (auto E = SharedObject.discard())
617 PROTEUS_FATAL_ERROR("Error removing shared object tmp file " +
618 toString(std::move(E)));
619
621 << "Codegen linking " << T.elapsed() << " ms\n");
622
623 return std::move(*Buffer);
624#else
625 PROTEUS_FATAL_ERROR("Expected LLVM18 for non-RTC codegen");
626#endif
627}
628
629} // namespace proteus
630
631#endif
char int void ** Args
Definition CompilerInterfaceHost.cpp:20
#define PROTEUS_DBG(x)
Definition Debug.h:10
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:4
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.hpp:57
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
void saveToFile(llvm::StringRef Filepath, T &&Data)
Definition Utils.h:23
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.hpp:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:19
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.hpp:470
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:40
Definition Dispatcher.cpp:14
CodegenOption
Definition Config.hpp:10
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.hpp:155
void setLaunchBoundsForKernel(Module &M, Function &F, size_t, int BlockSize)
Definition CoreLLVMCUDA.hpp:87
std::string toString(CodegenOption Option)
Definition Config.hpp:23