Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMHIP.h
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_HIP_H
2#define PROTEUS_CORE_LLVM_HIP_H
3
4#include "proteus/Error.h"
11
12#include <llvm/Bitcode/BitcodeWriter.h>
13#include <llvm/CodeGen/MachineModuleInfo.h>
14#include <llvm/IR/DiagnosticPrinter.h>
15#include <llvm/IR/Function.h>
16#include <llvm/IR/LegacyPassManager.h>
17#include <llvm/IR/Module.h>
18#include <llvm/IR/Verifier.h>
19#include <llvm/LTO/LTO.h>
20#include <llvm/MC/MCSubtargetInfo.h>
21#include <llvm/Support/CodeGen.h>
22#include <llvm/Support/FileSystem.h>
23#include <llvm/Support/MemoryBuffer.h>
24#include <llvm/Support/Path.h>
25#include <llvm/Support/Signals.h>
26#include <llvm/Support/TargetSelect.h>
27#include <llvm/Support/WithColor.h>
28#include <llvm/Target/TargetMachine.h>
29
30#include <optional>
31
32#if LLVM_VERSION_MAJOR >= 18
33#include <lld/Common/Driver.h>
34LLD_HAS_DRIVER(elf)
35#endif
36
37namespace proteus {
38
39using namespace llvm;
40
41namespace detail {
42
43inline const SmallVector<StringRef> &gridDimXFnName() {
44 static SmallVector<StringRef> Names = {
45 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
46 "llvm.amdgcn.num.workgroups.x", "_ZL20__hip_get_grid_dim_xv"};
47 return Names;
48}
49
50inline const SmallVector<StringRef> &gridDimYFnName() {
51 static SmallVector<StringRef> Names = {
52 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
53 "llvm.amdgcn.num.workgroups.y", "_ZL20__hip_get_grid_dim_yv"};
54 return Names;
55}
56
57inline const SmallVector<StringRef> &gridDimZFnName() {
58 static SmallVector<StringRef> Names = {
59 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
60 "llvm.amdgcn.num.workgroups.z", "_ZL20__hip_get_grid_dim_zv"};
61 return Names;
62}
63
64inline const SmallVector<StringRef> &blockDimXFnName() {
65 static SmallVector<StringRef> Names = {
66 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
67 "llvm.amdgcn.workgroup.size.x", "_ZL21__hip_get_block_dim_xv"};
68 return Names;
69}
70
71inline const SmallVector<StringRef> &blockDimYFnName() {
72 static SmallVector<StringRef> Names = {
73 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
74 "llvm.amdgcn.workgroup.size.y", "_ZL21__hip_get_block_dim_yv"};
75 return Names;
76}
77
78inline const SmallVector<StringRef> &blockDimZFnName() {
79 static SmallVector<StringRef> Names = {
80 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
81 "llvm.amdgcn.workgroup.size.z", "_ZL21__hip_get_block_dim_zv"};
82 return Names;
83}
84
85inline const SmallVector<StringRef> &blockIdxXFnName() {
86 static SmallVector<StringRef> Names = {
87 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
88 "llvm.amdgcn.workgroup.id.x"};
89 return Names;
90};
91
92inline const SmallVector<StringRef> &blockIdxYFnName() {
93 static SmallVector<StringRef> Names = {
94 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
95 "llvm.amdgcn.workgroup.id.y"};
96 return Names;
97};
98
99inline const SmallVector<StringRef> &blockIdxZFnName() {
100 static SmallVector<StringRef> Names = {
101 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
102 "llvm.amdgcn.workgroup.id.z"};
103 return Names;
104}
105
106inline const SmallVector<StringRef> &threadIdxXFnName() {
107 static SmallVector<StringRef> Names = {
108 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
109 "llvm.amdgcn.workitem.id.x"};
110 return Names;
111};
112
113inline const SmallVector<StringRef> &threadIdxYFnName() {
114 static SmallVector<StringRef> Names = {
115 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
116 "llvm.amdgcn.workitem.id.y"};
117 return Names;
118};
119
120inline const SmallVector<StringRef> &threadIdxZFnName() {
121 static SmallVector<StringRef> Names = {
122 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
123 "llvm.amdgcn.workitem.id.z"};
124 return Names;
125};
126
127inline Expected<sys::fs::TempFile> createTempFile(StringRef Prefix,
128 StringRef Suffix) {
129 SmallString<128> TmpDir;
130 sys::path::system_temp_directory(true, TmpDir);
131
132 SmallString<64> FileName;
133 FileName.append(Prefix);
134 FileName.append(Suffix.empty() ? "-%%%%%%%" : "-%%%%%%%.");
135 FileName.append(Suffix);
136 sys::path::append(TmpDir, FileName);
137 return sys::fs::TempFile::create(TmpDir);
138}
139
140#if LLVM_VERSION_MAJOR >= 18
141inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
142codegenSerial(Module &M, StringRef DeviceArch,
143 [[maybe_unused]] char OptLevel = '3', int CodegenOptLevel = 3) {
144 TIMESCOPE("proteus::codegenSerial");
145 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
146
147 auto ExpectedTM =
148 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
149 if (!ExpectedTM)
150 reportFatalError(toString(ExpectedTM.takeError()));
151
152 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
153 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
154 M.setDataLayout(TM->createDataLayout());
155
156 legacy::PassManager PM;
157 PM.add(new TargetLibraryInfoWrapperPass(TLII));
158 MachineModuleInfoWrapperPass *MMIWP =
159#if LLVM_VERSION_MAJOR >= 20
160 new MachineModuleInfoWrapperPass(TM.get());
161#else
162 new MachineModuleInfoWrapperPass(
163 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
164#endif
165
166 SmallVector<char, 4096> ObjectCode;
167 raw_svector_ostream OS(ObjectCode);
168 auto ExpectedF = createTempFile("object", "o");
169 if (auto E = ExpectedF.takeError())
170 reportFatalError("Error creating object tmp file " +
171 toString(std::move(E)));
172 auto ObjectFile = std::move(*ExpectedF);
173 auto FileStream = std::make_unique<CachedFileStream>(
174 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD, false));
175 TM->addPassesToEmitFile(PM, *FileStream->OS, nullptr,
176 CodeGenFileType::ObjectFile,
177 /* DisableVerify */ true, MMIWP);
178
179 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
180 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
181 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
182
183 PM.run(M);
184 if (Error E = FileStream->commit())
185 reportFatalError("Error committing object tmp file stream " +
186 toString(std::move(E)));
187
188 return ObjectFiles;
189}
190
// Compiles module M to object files through an lto::LTO job configured with
// a parallelism level of max(1, std::thread::hardware_concurrency()).
//
// Parameters:
//   M          - module to compile; its data layout is overwritten with the
//                one created by the target machine before bitcode is written.
//   DeviceArch - architecture forwarded to createTargetMachine and used as the
//                LTO CPU.
//   OptConfig  - supplies OptLevel, CodegenOptLevel and an optional textual
//                pass pipeline that is forwarded to the LTO configuration.
//
// Returns a vector sized by L.getMaxTasks(); slot `Task` is filled when the
// LTO job requests an output stream for that task, other slots stay null.
// Failures are reported through reportFatalError.
191inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
192codegenParallel(Module &M, StringRef DeviceArch,
193 const OptimizationPipelineConfig &OptConfig =
194 OptimizationPipelineConfig(std::nullopt, '3', 3)) {
195 TIMESCOPE("proteus::codegenParallel");
196 // Use regular LTO with parallelism enabled to parallelize codegen.
    // Set by the diagnostic handler below when an error-severity diagnostic
    // is seen; checked after L.run(). Atomic presumably because the handler
    // may run on LTO worker threads -- TODO confirm.
197 std::atomic<bool> LTOError = false;
198
    // Prints every diagnostic to errs() with a severity-specific prefix and
    // records errors in LTOError.
199 auto DiagnosticHandler = [&](const DiagnosticInfo &DI) {
200 std::string ErrStorage;
201 raw_string_ostream OS(ErrStorage);
202 DiagnosticPrinterRawOStream DP(OS);
203 DI.print(DP);
204
205 switch (DI.getSeverity()) {
206 case DS_Error:
207 WithColor::error(errs(), "[proteus codegen]") << ErrStorage << "\n";
208 LTOError = true;
209 break;
210 case DS_Warning:
211 WithColor::warning(errs(), "[proteus codegen]") << ErrStorage << "\n";
212 break;
213 case DS_Note:
214 WithColor::note(errs(), "[proteus codegen]") << ErrStorage << "\n";
215 break;
216 case DS_Remark:
217 WithColor::remark(errs()) << ErrStorage << "\n";
218 break;
219 }
220 };
221
222 // Create TargetMachine and extract options/features.
223 auto ExpectedTM = proteus::detail::createTargetMachine(
224 M, DeviceArch, OptConfig.CodegenOptLevel);
225 if (!ExpectedTM)
226 reportFatalError(toString(ExpectedTM.takeError()));
227 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
228
229 lto::Config Conf;
230 Conf.CPU = DeviceArch;
231
232 // Propagate attributes from TargetMachine to LTO Config.
    // The feature string is split on ',' and each piece becomes one MAttrs
    // entry; an empty feature string leaves MAttrs empty.
233 std::string FeatureStr = TM->getMCSubtargetInfo()->getFeatureString().str();
234 if (!FeatureStr.empty()) {
235 SmallVector<StringRef> Features;
236 StringRef(FeatureStr).split(Features, ',');
237 for (auto &F : Features)
238 Conf.MAttrs.push_back(F.str());
239 } else {
240 Conf.MAttrs = {};
241 }
242
243 // FIX: Propagate TargetOptions (e.g. UnsafeFPMath, etc.)
244 Conf.Options = TM->Options;
245
246 Conf.DisableVerify = true;
247 Conf.TimeTraceEnabled = false;
248 Conf.DebugPassManager = false;
249 Conf.VerifyEach = false;
250 Conf.DiagHandler = DiagnosticHandler;
251 Conf.OptLevel = OptConfig.OptLevel;
252 // Parallel codegen lets LTO own optimization, so custom textual pipelines
253 // must be forwarded to the LTO configuration instead of run beforehand.
254 if (OptConfig.PassPipeline)
255 Conf.OptPipeline = OptConfig.PassPipeline.value();
256 Conf.CGOptLevel = static_cast<CodeGenOptLevel>(OptConfig.CodegenOptLevel);
257
    // hardware_concurrency() may return 0, hence the lower bound of 1.
258 unsigned ParallelCodeGenParallelismLevel =
259 std::max(1u, std::thread::hardware_concurrency());
260 lto::LTO L(std::move(Conf), {}, ParallelCodeGenParallelismLevel);
261
262 // Ensure module has the correct DataLayout prior to emitting bitcode.
263 M.setDataLayout(TM->createDataLayout());
264
    // Serialize the module to an in-memory bitcode buffer; the LTO input file
    // below references this buffer, so BitcodeBuf must stay alive while the
    // LTO job uses it.
265 SmallString<0> BitcodeBuf;
266 raw_svector_ostream BitcodeOS(BitcodeBuf);
267 WriteBitcodeToFile(M, BitcodeOS);
268
269 // TODO: Module identifier can be empty because you always have one module to
270 // link. However, in the general case, with multiple modules, each one must
271 // have a unique identifier for LTO to work correctly.
272 auto IF = cantFail(lto::InputFile::create(
273 MemoryBufferRef{BitcodeBuf, M.getModuleIdentifier()}));
274
    // Names of symbols already marked prevailing; used to mark only the first
    // defined occurrence of each name as prevailing.
275 std::set<std::string> PrevailingSymbols;
    // Computes one SymbolResolution per symbol of the input file and adds the
    // file, together with those resolutions, to the LTO job.
276 auto BuildResolutions = [&]() {
277 // Save the input file and the buffer associated with its memory.
278 const auto Symbols = IF->symbols();
279 SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
280 size_t SymbolIdx = 0;
281 for (auto &Sym : Symbols) {
282 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
283 SymbolIdx++;
284
285 // All defined symbols are prevailing.
286 Res.Prevailing = !Sym.isUndefined() &&
287 PrevailingSymbols.insert(Sym.getName().str()).second;
288
289 Res.VisibleToRegularObj =
290 Res.Prevailing &&
291 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
292 !Sym.canBeOmittedFromSymbolTable();
293
294 Res.ExportDynamic =
295 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
296 (!Sym.canBeOmittedFromSymbolTable());
297
298 Res.FinalDefinitionInLinkageUnit =
299 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
300 (!Sym.isUndefined() && !Sym.isCommon());
301
302 // Device linking does not support linker redefined symbols (e.g.
303 // --wrap).
304 Res.LinkerRedefined = false;
305
    // NOTE(review): the listing appears to skip a line here; the extra closing
    // brace after PrintSymbol(Sym, Res) suggests a guard (possibly a debug
    // output check) opens at this point -- confirm against the original file.
    // PrintSymbol logs one line per symbol: visibility (H/P/D), a flag string
    // (U C W I O T X, '-' when unset), the name, and the computed resolution.
307 auto PrintSymbol = [](const lto::InputFile::Symbol &Sym,
308 lto::SymbolResolution &Res) {
309 auto &OutStream = Logger::logs("proteus");
310 OutStream << "Vis: ";
311 switch (Sym.getVisibility()) {
312 case GlobalValue::HiddenVisibility:
313 OutStream << 'H';
314 break;
315 case GlobalValue::ProtectedVisibility:
316 OutStream << 'P';
317 break;
318 case GlobalValue::DefaultVisibility:
319 OutStream << 'D';
320 break;
321 }
322
323 OutStream << " Sym: ";
324 auto PrintBool = [&](char C, bool B) { OutStream << (B ? C : '-'); };
325 PrintBool('U', Sym.isUndefined());
326 PrintBool('C', Sym.isCommon());
327 PrintBool('W', Sym.isWeak());
328 PrintBool('I', Sym.isIndirect());
329 PrintBool('O', Sym.canBeOmittedFromSymbolTable());
330 PrintBool('T', Sym.isTLS());
331 PrintBool('X', Sym.isExecutable());
332 OutStream << ' ' << Sym.getName();
333 OutStream << "| P " << Res.Prevailing;
334 OutStream << " V " << Res.VisibleToRegularObj;
335 OutStream << " E " << Res.ExportDynamic;
336 OutStream << " F " << Res.FinalDefinitionInLinkageUnit;
337 OutStream << "\n";
338 };
339
340 PrintSymbol(Sym, Res);
341 }
342 }
343
344 // Add the bitcode file with its resolved symbols to the LTO job.
345 cantFail(L.add(std::move(IF), Resolutions));
346 };
347
348 BuildResolutions();
349
350 // Run the LTO job to compile the bitcode.
351 size_t MaxTasks = L.getMaxTasks();
352 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles{MaxTasks};
353
    // Stream factory handed to L.run(): creates a temp file named
    // "lto-shard[.<Task>]-%%%%%%%.o" for the task, stores it in
    // ObjectFiles[Task], and returns a stream writing to its descriptor
    // (the `false` argument leaves closing the descriptor to the TempFile).
354 auto AddStream =
355 [&](size_t Task,
356 const Twine & /*ModuleName*/) -> std::unique_ptr<CachedFileStream> {
357 std::string TaskStr = Task ? "." + std::to_string(Task) : "";
358 auto ExpectedF = createTempFile("lto-shard" + TaskStr, "o");
359 if (auto E = ExpectedF.takeError())
360 reportFatalError("Error creating tmp file " + toString(std::move(E)));
361 ObjectFiles[Task] =
362 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
363 auto Ret = std::make_unique<CachedFileStream>(
364 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD, false));
    // std::make_unique never yields a null pointer, so this check is purely
    // defensive.
365 if (!Ret)
366 reportFatalError("Error creating CachedFileStream");
367 return Ret;
368 };
369
370 if (Error E = L.run(AddStream))
371 reportFatalError("Error: " + toString(std::move(E)));
372
    // Diagnostics of error severity do not necessarily make L.run() return an
    // Error, so the flag set by the diagnostic handler is checked separately.
    // NOTE(review): the listing appears to skip the line that opens the call
    // wrapping createStringError below -- confirm against the original file.
373 if (LTOError)
375 createStringError(inconvertibleErrorCode(),
376 "Errors encountered inside the LTO pipeline.")));
377
378 return ObjectFiles;
379}
380#endif
381
// Compiles module M to a device binary through the hiprtc link API.
//
// The module is serialized to bitcode, a hiprtc link state is created with
// the single option "-march=<DeviceArch>", the bitcode is added as
// HIPRTC_JIT_INPUT_LLVM_BITCODE input, and the link is completed.
//
// Returns a MemoryBuffer created with getMemBuffer over the (BinOut, BinSize)
// range reported by hiprtcLinkComplete.
// NOTE(review): no hiprtcLinkDestroy call is visible in this function, and the
// returned buffer presumably references memory owned by the link state --
// confirm the intended lifetime.
382inline std::unique_ptr<MemoryBuffer> codegenRTC(Module &M,
383 StringRef DeviceArch) {
384 TIMESCOPE("proteus::codegenRTC");
    // Output parameters filled by hiprtcLinkComplete.
385 char *BinOut;
386 size_t BinSize;
387
    // Serialize the module to an in-memory bitcode buffer.
388 SmallString<4096> ModuleBuf;
389 raw_svector_ostream ModuleBufOS(ModuleBuf);
390 WriteBitcodeToFile(M, ModuleBufOS);
391
392 hiprtcLinkState HipLinkStatePtr;
393
394 // NOTE: This code is an example of passing custom, AMD-specific
395 // options to the compiler/linker.
396 // NOTE: Unrolling can have a dramatic (time-consuming) effect on JIT
397 // compilation time and on the resulting optimization, better or worse
398 // depending on code specifics.
399 std::string MArchOpt = ("-march=" + DeviceArch).str();
400
401 // NOTE: We used to pass these options as well. "-mllvm",
402 // "-unroll-threshold=1000",
403 // We removed them because we saw on bezier they cause slowdowns
404 const char *OptArgs[] = {MArchOpt.c_str()};
    // Two JIT options are passed: the option-string array and its element
    // count. The count (1) is passed by value, cast to a pointer-sized slot.
405 std::vector<hiprtcJIT_option> JITOptions = {
406 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
407 size_t OptArgsSize = 1;
408 const void *JITOptionsValues[] = {(void *)OptArgs, (void *)(OptArgsSize)};
409 proteusHiprtcErrCheck(hiprtcLinkCreate(JITOptions.size(), JITOptions.data(),
410 (void **)JITOptionsValues,
411 &HipLinkStatePtr));
412 // NOTE: the following version of the code does not set options.
413 // proteusHiprtcErrCheck(hiprtcLinkCreate(0, nullptr, nullptr,
414 // &hip_link_state_ptr));
415
416 proteusHiprtcErrCheck(hiprtcLinkAddData(
417 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE, (void *)ModuleBuf.data(),
418 ModuleBuf.size(), "", 0, nullptr, nullptr));
    // NOTE(review): the listing appears to skip the line that opens the call
    // wrapping hiprtcLinkComplete below -- confirm against the original file.
420 hiprtcLinkComplete(HipLinkStatePtr, (void **)&BinOut, &BinSize));
421
422 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
423}
424
425} // namespace detail
426
427inline void setLaunchBoundsForKernel(Function &F, int MaxNumWorkGroups,
428 int MinBlocksPerSM = 0) {
429 // TODO: fix calculation of launch bounds.
430 // TODO: find maximum (hardcoded 1024) from device info.
431 // TODO: Setting as 1, BlockSize to replicate launch bounds settings
432 F.addFnAttr("amdgpu-flat-work-group-size",
433 "1," + std::to_string(std::min(1024, MaxNumWorkGroups)));
434 // F->addFnAttr("amdgpu-waves-per-eu", std::to_string(WavesPerEU));
435 if (MinBlocksPerSM != 0) {
436 // NOTE: We are missing a heuristic to define the `WavesPerEU`, as such we
437 // still need to study it. I restrict the waves by setting min equal to max
438 // and disallowing any heuristics that HIP will use internally.
439 // For more information please check:
440 // https://clang.llvm.org/docs/AttributeReference.html#amdgpu-waves-per-eu
441 F.addFnAttr("amdgpu-waves-per-eu", std::to_string(MinBlocksPerSM) + "," +
442 std::to_string(MinBlocksPerSM));
443 }
444
445 PROTEUS_DBG(Logger::logs("proteus")
446 << " => Set Workgroup size " << MaxNumWorkGroups
447 << " WavesPerEU (unused) " << MinBlocksPerSM << "\n");
448}
449
// Produces a device binary for module M using the selected codegen option.
//
// The RTC option returns the result of detail::codegenRTC directly. The other
// options first produce temporary object files (detail::codegenSerial or
// detail::codegenParallel), link them into a temporary shared object with the
// in-process lld ELF driver, read that file back into a MemoryBuffer, and
// discard all temporary files.
//
// Parameters:
//   M                    - module to compile.
//   DeviceArch           - architecture forwarded to the codegen helpers.
//   GlobalLinkedBinaries - asserted to be empty; not otherwise used.
//   OptConfig            - forwarded to detail::codegenParallel only.
//
// Failures are reported through reportFatalError. Without LLVM >= 18 the
// non-RTC path ends in a fatal error.
450inline std::unique_ptr<MemoryBuffer>
451codegenObject(Module &M, StringRef DeviceArch,
452 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
    // NOTE(review): the body switches on CGOption, but the listing shows no
    // such parameter; a parameter line appears to be missing here -- confirm
    // against the original file.
454 const OptimizationPipelineConfig &OptConfig =
455 OptimizationPipelineConfig(std::nullopt, '3', 3)) {
456 TIMESCOPE("proteus::codegenObjectHIP");
457 assert(GlobalLinkedBinaries.empty() &&
458 "Expected empty linked binaries for HIP");
459 Timer T(Config::get().ProteusEnableTimers);
460 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
461 switch (CGOption) {
462 case CodegenOption::RTC: {
463 auto Ret = detail::codegenRTC(M, DeviceArch);
    // NOTE(review): the listing appears to skip the line that opens this
    // timing-output statement -- confirm against the original file.
465 << "Codegen RTC " << T.elapsed() << " ms\n");
466 return Ret;
467 }
468#if LLVM_VERSION_MAJOR >= 18
    // NOTE(review): the case labels for the two calls below are not shown in
    // the listing. codegenSerial is called with its default optimization
    // levels; OptConfig is forwarded only to codegenParallel.
470 ObjectFiles = detail::codegenSerial(M, DeviceArch);
471 break;
473 ObjectFiles = detail::codegenParallel(M, DeviceArch, OptConfig);
474 break;
475#endif
476 default:
477 reportFatalError("Unknown Codegen Option");
478 }
479
480 if (ObjectFiles.empty())
481 reportFatalError("Expected non-empty vector of object files");
482
483#if LLVM_VERSION_MAJOR >= 18
484 auto ExpectedF = detail::createTempFile("proteus-jit", "o");
485 if (auto E = ExpectedF.takeError())
486 reportFatalError("Error creating shared object file " +
487 toString(std::move(E)));
488
489 auto SharedObject = std::move(*ExpectedF);
490
    // Linker command line. The const char * entries point into the TmpName
    // strings of SharedObject and ObjectFiles, so those objects must stay
    // alive until the lld call below has returned.
491 std::vector<const char *> Args{"ld.lld", "--no-undefined", "-shared", "-o",
492 SharedObject.TmpName.c_str()};
    // Null entries (tasks that never produced a file) are skipped.
493 for (auto &File : ObjectFiles) {
494 if (!File)
495 continue;
496 Args.push_back(File->TmpName.c_str());
497 }
498
    // NOTE(review): the listing appears to skip the line that opens the block
    // around this argument dump (closed by the brace after the final log
    // line) -- confirm against the original file.
500 for (auto &Arg : Args) {
501 Logger::logs("proteus") << Arg << " ";
502 }
503 Logger::logs("proteus") << "\n";
504 }
505
    // NOTE(review): the opening line of this timing-output statement is
    // likewise not shown in the listing.
507 << "Codegen object " << toString(CGOption) << "["
508 << ObjectFiles.size() << "] " << T.elapsed() << " ms\n");
509
510 T.reset();
511 // The LLD linker interface is not thread-safe, so we use a mutex.
512 static std::mutex Mutex;
513 {
514 std::lock_guard LockGuard{Mutex};
515 lld::Result S = lld::lldMain(Args, llvm::outs(), llvm::errs(),
516 {{lld::Gnu, &lld::elf::link}});
517 if (S.retCode)
518 reportFatalError("Error: lld failed");
519 }
520
    // Read the linked shared object back into memory before the temporary
    // files are discarded below.
521 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
522 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
523 if (!Buffer)
524 reportFatalError("Error reading file: " + Buffer.getError().message());
525
526 // Remove temporary files.
527 for (auto &File : ObjectFiles) {
528 if (!File)
529 continue;
530 if (auto E = File->discard())
531 reportFatalError("Error removing object tmp file " +
532 toString(std::move(E)));
533 }
534 if (auto E = SharedObject.discard())
535 reportFatalError("Error removing shared object tmp file " +
536 toString(std::move(E)));
537
    // NOTE(review): the opening line of this timing-output statement is not
    // shown in the listing.
539 << "Codegen linking " << T.elapsed() << " ms\n");
540
541 return std::move(*Buffer);
542#else
543 reportFatalError("Expected LLVM18 for non-RTC codegen");
544#endif
545}
546
547} // namespace proteus
548
549#endif
char int void ** Args
Definition CompilerInterfaceHost.cpp:23
#define PROTEUS_TIMER_OUTPUT(x)
Definition Config.h:440
#define PROTEUS_DBG(x)
Definition Debug.h:9
#define TIMESCOPE(...)
Definition TimeTracing.h:66
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
static Config & get()
Definition Config.h:334
bool ProteusDebugOutput
Definition Config.h:350
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.h:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.h:19
Definition TimeTracing.h:33
void reset()
Definition TimeTracing.cpp:68
uint64_t elapsed()
Definition TimeTracing.cpp:66
Definition CompiledLibrary.h:7
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.h:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.h:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.h:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.h:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.h:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.h:382
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.h:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.h:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.h:57
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.h:75
Expected< sys::fs::TempFile > createTempFile(StringRef Prefix, StringRef Suffix)
Definition CoreLLVMHIP.h:127
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.h:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.h:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.h:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.h:40
Definition MemoryCache.h:27
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.h:87
void reportFatalError(const llvm::Twine &Reason, const char *FILE, unsigned Line)
Definition Error.cpp:14
CodegenOption
Definition Config.h:16
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.h:176
std::string toString(CodegenOption Option)
Definition Config.h:28
Definition CoreLLVM.h:187