Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMHIP.hpp
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_HIP_HPP
2#define PROTEUS_CORE_LLVM_HIP_HPP
3
4#include <llvm/Bitcode/BitcodeWriter.h>
5#include <llvm/CodeGen/MachineModuleInfo.h>
6#include <llvm/IR/DiagnosticPrinter.h>
7#include <llvm/IR/Function.h>
8#include <llvm/IR/LegacyPassManager.h>
9#include <llvm/IR/Module.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/LTO/LTO.h>
12#include <llvm/Support/CodeGen.h>
13#include <llvm/Support/FileSystem.h>
14#include <llvm/Support/MemoryBuffer.h>
15#include <llvm/Support/Path.h>
16#include <llvm/Support/Signals.h>
17#include <llvm/Support/TargetSelect.h>
18#include <llvm/Support/WithColor.h>
19#include <llvm/Target/TargetMachine.h>
20
21#if LLVM_VERSION_MAJOR >= 18
22#include <lld/Common/Driver.h>
23LLD_HAS_DRIVER(elf)
24#endif
25
26#include "proteus/CoreLLVM.hpp"
27#include "proteus/Debug.h"
28#include "proteus/Error.h"
29#include "proteus/Logger.hpp"
31#include "proteus/Utils.h"
32#include "proteus/UtilsHIP.h"
33
34namespace proteus {
35
36using namespace llvm;
37
38namespace detail {
39
42 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
43 "llvm.amdgcn.num.workgroups.x", "_ZL20__hip_get_grid_dim_xv"};
44 return Names;
45}
46
49 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
50 "llvm.amdgcn.num.workgroups.y", "_ZL20__hip_get_grid_dim_yv"};
51 return Names;
52}
53
56 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
57 "llvm.amdgcn.num.workgroups.z", "_ZL20__hip_get_grid_dim_zv"};
58 return Names;
59}
60
63 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
64 "llvm.amdgcn.workgroup.size.x", "_ZL21__hip_get_block_dim_xv"};
65 return Names;
66}
67
70 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
71 "llvm.amdgcn.workgroup.size.y", "_ZL21__hip_get_block_dim_yv"};
72 return Names;
73}
74
77 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
78 "llvm.amdgcn.workgroup.size.z", "_ZL21__hip_get_block_dim_zv"};
79 return Names;
80}
81
84 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
85 "llvm.amdgcn.workgroup.id.x"};
86 return Names;
87};
88
91 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
92 "llvm.amdgcn.workgroup.id.y"};
93 return Names;
94};
95
98 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
99 "llvm.amdgcn.workgroup.id.z"};
100 return Names;
101}
102
105 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
106 "llvm.amdgcn.workitem.id.x"};
107 return Names;
108};
109
112 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
113 "llvm.amdgcn.workitem.id.y"};
114 return Names;
115};
116
119 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
120 "llvm.amdgcn.workitem.id.z"};
121 return Names;
122};
123
125 StringRef Suffix) {
127 sys::path::system_temp_directory(true, TmpDir);
128
130 FileName.append(Prefix);
131 FileName.append(Suffix.empty() ? "-%%%%%%%" : "-%%%%%%%.");
132 FileName.append(Suffix);
133 sys::path::append(TmpDir, FileName);
134 return sys::fs::TempFile::create(TmpDir);
135}
136
137#if LLVM_VERSION_MAJOR >= 18
139codegenSerial(Module &M, StringRef DeviceArch,
140 [[maybe_unused]] char OptLevel = '3', int CodegenOptLevel = 3) {
142
143 auto ExpectedTM =
144 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
145 if (!ExpectedTM)
147
148 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
149 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
150 M.setDataLayout(TM->createDataLayout());
151
152 legacy::PassManager PM;
155 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
156
159 auto ExpectedF = createTempFile("object", "o");
160 if (auto E = ExpectedF.takeError())
161 PROTEUS_FATAL_ERROR("Error creating object tmp file " +
162 toString(std::move(E)));
163 auto ObjectFile = std::move(*ExpectedF);
164 auto FileStream = std::make_unique<CachedFileStream>(
165 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD, false));
166 TM->addPassesToEmitFile(PM, *FileStream->OS, nullptr,
167 CodeGenFileType::ObjectFile,
168 /* DisableVerify */ true, MMIWP);
169
170 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
171 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
172 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
173
174 PM.run(M);
175
176 return ObjectFiles;
177}
178
180codegenParallel(Module &M, StringRef DeviceArch, unsigned int OptLevel = 3,
181 int CodegenOptLevel = 3) {
182 // Use regular LTO with parallelism enabled to parallelize codegen.
183 std::atomic<bool> LTOError = false;
184
185 auto DiagnosticHandler = [&](const DiagnosticInfo &DI) {
186 std::string ErrStorage;
189 DI.print(DP);
190
191 switch (DI.getSeverity()) {
192 case DS_Error:
193 WithColor::error(errs(), "[proteus codegen]") << ErrStorage << "\n";
194 LTOError = true;
195 break;
196 case DS_Warning:
197 WithColor::warning(errs(), "[proteus codegen]") << ErrStorage << "\n";
198 break;
199 case DS_Note:
200 WithColor::note(errs(), "[proteus codegen]") << ErrStorage << "\n";
201 break;
202 case DS_Remark:
203 WithColor::remark(errs()) << ErrStorage << "\n";
204 break;
205 }
206 };
207
208 lto::Config Conf;
209 Conf.CPU = DeviceArch;
210 // Use default machine attributes.
211 Conf.MAttrs = {};
212 Conf.DisableVerify = true;
213 Conf.TimeTraceEnabled = false;
214 Conf.DebugPassManager = false;
215 Conf.VerifyEach = false;
216 Conf.DiagHandler = DiagnosticHandler;
217 Conf.OptLevel = OptLevel;
218 Conf.CGOptLevel = static_cast<CodeGenOptLevel>(CodegenOptLevel);
219
221 std::max(1u, std::thread::hardware_concurrency());
222 lto::LTO L(std::move(Conf), nullptr, ParallelCodeGenParallelismLevel);
223
224 // Ensure module has the correct DataLayout prior to emitting bitcode.
225 auto ExpectedTM =
226 proteus::detail::createTargetMachine(M, DeviceArch, CodegenOptLevel);
227 if (!ExpectedTM)
229 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
230 M.setDataLayout(TM->createDataLayout());
231
235
236 // TODO: Module identifier can be empty because you always have one module to
237 // link. However, in the general case, with multiple modules, each one must
238 // have a unique identifier for LTO to work correctly.
239 auto IF = cantFail(lto::InputFile::create(
240 MemoryBufferRef{BitcodeBuf, M.getModuleIdentifier()}));
241
242 std::set<std::string> PrevailingSymbols;
243 auto BuildResolutions = [&]() {
244 // Save the input file and the buffer associated with its memory.
245 const auto Symbols = IF->symbols();
247 size_t SymbolIdx = 0;
248 for (auto &Sym : Symbols) {
249 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
250 SymbolIdx++;
251
252 // All defined symbols are prevailing.
253 Res.Prevailing = !Sym.isUndefined() &&
254 PrevailingSymbols.insert(Sym.getName().str()).second;
255
256 Res.VisibleToRegularObj =
257 Res.Prevailing &&
258 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
259 !Sym.canBeOmittedFromSymbolTable();
260
261 Res.ExportDynamic =
262 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
263 (!Sym.canBeOmittedFromSymbolTable());
264
265 Res.FinalDefinitionInLinkageUnit =
266 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
267 (!Sym.isUndefined() && !Sym.isCommon());
268
269 // Device linking does not support linker redefined symbols (e.g.
270 // --wrap).
271 Res.LinkerRedefined = false;
272
274 auto PrintSymbol = [](const lto::InputFile::Symbol &Sym,
275 lto::SymbolResolution &Res) {
276 auto &OutStream = Logger::logs("proteus");
277 OutStream << "Vis: ";
278 switch (Sym.getVisibility()) {
279 case GlobalValue::HiddenVisibility:
280 OutStream << 'H';
281 break;
282 case GlobalValue::ProtectedVisibility:
283 OutStream << 'P';
284 break;
285 case GlobalValue::DefaultVisibility:
286 OutStream << 'D';
287 break;
288 }
289
290 OutStream << " Sym: ";
291 auto PrintBool = [&](char C, bool B) { OutStream << (B ? C : '-'); };
292 PrintBool('U', Sym.isUndefined());
293 PrintBool('C', Sym.isCommon());
294 PrintBool('W', Sym.isWeak());
295 PrintBool('I', Sym.isIndirect());
296 PrintBool('O', Sym.canBeOmittedFromSymbolTable());
297 PrintBool('T', Sym.isTLS());
298 PrintBool('X', Sym.isExecutable());
299 OutStream << ' ' << Sym.getName();
300 OutStream << "| P " << Res.Prevailing;
301 OutStream << " V " << Res.VisibleToRegularObj;
302 OutStream << " E " << Res.ExportDynamic;
303 OutStream << " F " << Res.FinalDefinitionInLinkageUnit;
304 OutStream << "\n";
305 };
306
308 }
309 }
310
311 // Add the bitcode file with its resolved symbols to the LTO job.
312 cantFail(L.add(std::move(IF), Resolutions));
313 };
314
316
317 // Run the LTO job to compile the bitcode.
318 size_t MaxTasks = L.getMaxTasks();
320
321 auto AddStream =
322 [&](size_t Task,
323 const Twine & /*ModuleName*/) -> std::unique_ptr<CachedFileStream> {
324 std::string TaskStr = Task ? "." + std::to_string(Task) : "";
325 auto ExpectedF = createTempFile("lto-shard" + TaskStr, "o");
326 if (auto E = ExpectedF.takeError())
327 PROTEUS_FATAL_ERROR("Error creating tmp file " + toString(std::move(E)));
329 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
330 auto Ret = std::make_unique<CachedFileStream>(
331 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD, false));
332 if (!Ret)
333 PROTEUS_FATAL_ERROR("Error creating CachedFileStream");
334 return Ret;
335 };
336
337 if (Error E = L.run(AddStream))
338 PROTEUS_FATAL_ERROR("Error: " + toString(std::move(E)));
339
340 if (LTOError)
343 "Errors encountered inside the LTO pipeline.")));
344
345 return ObjectFiles;
346}
347#endif
348
349inline std::unique_ptr<MemoryBuffer> codegenRTC(Module &M,
350 StringRef DeviceArch) {
351 char *BinOut;
352 size_t BinSize;
353
357
359
360 // NOTE: This code is an example of passing custom, AMD-specific
361 // options to the compiler/linker.
362 // NOTE: Unrolling can have a dramatic (time-consuming) effect on JIT
363 // compilation time and on the resulting optimization, better or worse
364 // depending on code specifics.
365 std::string MArchOpt = ("-march=" + DeviceArch).str();
366
367 // NOTE: We used to pass these options as well: "-mllvm",
368 // "-unroll-threshold=1000".
369 // We removed them because we saw that on bezier they cause slowdowns.
370 const char *OptArgs[] = {MArchOpt.c_str()};
371 std::vector<hiprtcJIT_option> JITOptions = {
373 size_t OptArgsSize = 1;
374 const void *JITOptionsValues[] = {(void *)OptArgs, (void *)(OptArgsSize)};
376 (void **)JITOptionsValues,
378 // NOTE: the following version of the code does not set options.
379 // proteusHiprtcErrCheck(hiprtcLinkCreate(0, nullptr, nullptr,
380 // &hip_link_state_ptr));
381
384 ModuleBuf.size(), "", 0, nullptr, nullptr));
387
388 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
389}
390
391} // namespace detail
392
394 int WavesPerEU = 0) {
395 // TODO: fix calculation of launch bounds.
396 // TODO: find maximum (hardcoded 1024) from device info.
397 // TODO: Setting as 1, BlockSize to replicate launch bounds settings
398 F.addFnAttr("amdgpu-flat-work-group-size",
399 "1," + std::to_string(std::min(1024, MaxNumWorkGroups)));
400 // F->addFnAttr("amdgpu-waves-per-eu", std::to_string(WavesPerEU));
401 if (WavesPerEU != 0) {
402 // NOTE: We are missing a heuristic to define the `WavesPerEU`, as such we
403 // still need to study it. I restrict the waves by setting min equal to max
404 // and disallowing any heuristics that HIP will use internally.
405 // For more information please check:
406 // https://clang.llvm.org/docs/AttributeReference.html#amdgpu-waves-per-eu
407 F.addFnAttr("amdgpu-waves-per-eu",
408 std::to_string(WavesPerEU) + "," + std::to_string(WavesPerEU));
409 }
410
411 PROTEUS_DBG(Logger::logs("proteus")
412 << " => Set Workgroup size " << MaxNumWorkGroups
413 << " WavesPerEU (unused) " << WavesPerEU << "\n");
414}
415
416inline std::unique_ptr<MemoryBuffer>
417codegenObject(Module &M, StringRef DeviceArch,
418 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
420 assert(GlobalLinkedBinaries.empty() &&
421 "Expected empty linked binaries for HIP");
422 Timer T;
424 switch (CGOption) {
425 case CodegenOption::RTC: {
426 auto Ret = detail::codegenRTC(M, DeviceArch);
428 << "Codegen RTC " << T.elapsed() << " ms\n");
429 return Ret;
430 }
431#if LLVM_VERSION_MAJOR >= 18
433 ObjectFiles = detail::codegenSerial(M, DeviceArch);
434 break;
436 ObjectFiles = detail::codegenParallel(M, DeviceArch);
437 break;
438#endif
439 default:
440 PROTEUS_FATAL_ERROR("Unknown Codegen Option");
441 }
442
443 if (ObjectFiles.empty())
444 PROTEUS_FATAL_ERROR("Expected non-empty vector of object files");
445
446#if LLVM_VERSION_MAJOR >= 18
447 auto ExpectedF = detail::createTempFile("proteus-jit", "o");
448 if (auto E = ExpectedF.takeError())
449 PROTEUS_FATAL_ERROR("Error creating shared object file " +
450 toString(std::move(E)));
451
452 auto SharedObject = std::move(*ExpectedF);
453
454 std::vector<const char *> Args{"ld.lld", "--no-undefined", "-shared", "-o",
455 SharedObject.TmpName.c_str()};
456 for (auto &File : ObjectFiles) {
457 if (!File)
458 continue;
459 Args.push_back(File->TmpName.c_str());
460 }
461
462 if (Config::get().ProteusDebugOutput) {
463 for (auto &Arg : Args) {
464 Logger::logs("proteus") << Arg << " ";
465 }
466 Logger::logs("proteus") << "\n";
467 }
468
470 << "Codegen object " << toString(CGOption) << "["
471 << ObjectFiles.size() << "] " << T.elapsed() << " ms\n");
472
473 T.reset();
474 // The LLD linker interface is not thread-safe, so we use a mutex.
475 static std::mutex Mutex;
476 {
477 std::lock_guard LockGuard{Mutex};
478 lld::Result S = lld::lldMain(Args, llvm::outs(), llvm::errs(),
479 {{lld::Gnu, &lld::elf::link}});
480 if (S.retCode)
481 PROTEUS_FATAL_ERROR("Error: lld failed");
482 }
483
485 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
486 if (!Buffer)
487 PROTEUS_FATAL_ERROR("Error reading file: " + Buffer.getError().message());
488
489 // Remove temporary files.
490 for (auto &File : ObjectFiles) {
491 if (!File)
492 continue;
493 if (auto E = File->discard())
494 PROTEUS_FATAL_ERROR("Error removing object tmp file " +
495 toString(std::move(E)));
496 }
497 if (auto E = SharedObject.discard())
498 PROTEUS_FATAL_ERROR("Error removing shared object tmp file " +
499 toString(std::move(E)));
500
502 << "Codegen linking " << T.elapsed() << " ms\n");
503
504 return std::move(*Buffer);
505#else
506 PROTEUS_FATAL_ERROR("Expected LLVM18 for non-RTC codegen");
507#endif
508}
509
510} // namespace proteus
511
512#endif
char int void ** Args
Definition CompilerInterfaceHost.cpp:21
#define PROTEUS_DBG(x)
Definition Debug.h:9
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:7
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.hpp:57
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
static Config & get()
Definition Config.hpp:284
bool ProteusDebugOutput
Definition Config.hpp:300
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.hpp:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:19
Definition TimeTracing.hpp:36
Definition Helpers.h:138
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.hpp:349
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:75
Expected< sys::fs::TempFile > createTempFile(StringRef Prefix, StringRef Suffix)
Definition CoreLLVMHIP.hpp:124
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:40
Definition BuiltinsCUDA.cpp:4
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.hpp:87
T getRuntimeConstantValue(void *Arg)
Definition CompilerInterfaceRuntimeConstantInfo.h:114
CodegenOption
Definition Config.hpp:14
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.hpp:160
std::string toString(CodegenOption Option)
Definition Config.hpp:26