1#ifndef PROTEUS_CORE_LLVM_HIP_HPP
2#define PROTEUS_CORE_LLVM_HIP_HPP
4#include <llvm/Bitcode/BitcodeWriter.h>
5#include <llvm/CodeGen/MachineModuleInfo.h>
6#include <llvm/CodeGen/ParallelCG.h>
7#include <llvm/IR/DiagnosticPrinter.h>
8#include <llvm/IR/Function.h>
9#include <llvm/IR/LegacyPassManager.h>
10#include <llvm/IR/Module.h>
11#include <llvm/IR/Verifier.h>
12#include <llvm/LTO/LTO.h>
13#include <llvm/Support/CodeGen.h>
14#include <llvm/Support/FileSystem.h>
15#include <llvm/Support/MemoryBuffer.h>
16#include <llvm/Support/Path.h>
17#include <llvm/Support/Signals.h>
18#include <llvm/Support/TargetSelect.h>
19#include <llvm/Support/WithColor.h>
20#include <llvm/Target/TargetMachine.h>
21#include <llvm/Transforms/IPO/ThinLTOBitcodeWriter.h>
22#include <llvm/Transforms/Utils/SplitModule.h>
24#if LLVM_VERSION_MAJOR >= 18
25#include <lld/Common/Driver.h>
// Names associated with the grid-dimension X query: the Itanium-mangled
// __HIP_Coordinates<__HIP_GridDim>::__X::operator unsigned int() const, the
// "llvm.amdgcn.num.workgroups.x" name, and the internal-linkage
// __hip_get_grid_dim_x() helper.
43 static SmallVector<StringRef> Names = {
44 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
45 "llvm.amdgcn.num.workgroups.x",
"_ZL20__hip_get_grid_dim_xv"};
// Names associated with the grid-dimension Y query: the mangled
// __HIP_Coordinates<__HIP_GridDim>::__Y conversion operator, the
// "llvm.amdgcn.num.workgroups.y" name, and __hip_get_grid_dim_y().
50 static SmallVector<StringRef> Names = {
51 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
52 "llvm.amdgcn.num.workgroups.y",
"_ZL20__hip_get_grid_dim_yv"};
// Names associated with the grid-dimension Z query: the mangled
// __HIP_Coordinates<__HIP_GridDim>::__Z conversion operator, the
// "llvm.amdgcn.num.workgroups.z" name, and __hip_get_grid_dim_z().
57 static SmallVector<StringRef> Names = {
58 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
59 "llvm.amdgcn.num.workgroups.z",
"_ZL20__hip_get_grid_dim_zv"};
// Names associated with the block-dimension X query: the mangled
// __HIP_Coordinates<__HIP_BlockDim>::__X conversion operator, the
// "llvm.amdgcn.workgroup.size.x" name, and __hip_get_block_dim_x().
64 static SmallVector<StringRef> Names = {
65 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
66 "llvm.amdgcn.workgroup.size.x",
"_ZL21__hip_get_block_dim_xv"};
// Names associated with the block-dimension Y query: the mangled
// __HIP_Coordinates<__HIP_BlockDim>::__Y conversion operator, the
// "llvm.amdgcn.workgroup.size.y" name, and __hip_get_block_dim_y().
71 static SmallVector<StringRef> Names = {
72 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
73 "llvm.amdgcn.workgroup.size.y",
"_ZL21__hip_get_block_dim_yv"};
// Names associated with the block-dimension Z query: the mangled
// __HIP_Coordinates<__HIP_BlockDim>::__Z conversion operator, the
// "llvm.amdgcn.workgroup.size.z" name, and __hip_get_block_dim_z().
78 static SmallVector<StringRef> Names = {
79 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
80 "llvm.amdgcn.workgroup.size.z",
"_ZL21__hip_get_block_dim_zv"};
// Names associated with the block-index X query: the mangled
// __HIP_Coordinates<__HIP_BlockIdx>::__X conversion operator and the
// "llvm.amdgcn.workgroup.id.x" name.
85 static SmallVector<StringRef> Names = {
86 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
87 "llvm.amdgcn.workgroup.id.x"};
// Names associated with the block-index Y query: the mangled
// __HIP_Coordinates<__HIP_BlockIdx>::__Y conversion operator and the
// "llvm.amdgcn.workgroup.id.y" name.
92 static SmallVector<StringRef> Names = {
93 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
94 "llvm.amdgcn.workgroup.id.y"};
// Names associated with the block-index Z query: the mangled
// __HIP_Coordinates<__HIP_BlockIdx>::__Z conversion operator and the
// "llvm.amdgcn.workgroup.id.z" name.
99 static SmallVector<StringRef> Names = {
100 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
101 "llvm.amdgcn.workgroup.id.z"};
// Names associated with the thread-index X query: the mangled
// __HIP_Coordinates<__HIP_ThreadIdx>::__X conversion operator and the
// "llvm.amdgcn.workitem.id.x" name.
106 static SmallVector<StringRef> Names = {
107 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
108 "llvm.amdgcn.workitem.id.x"};
// Names associated with the thread-index Y query: the mangled
// __HIP_Coordinates<__HIP_ThreadIdx>::__Y conversion operator and the
// "llvm.amdgcn.workitem.id.y" name.
113 static SmallVector<StringRef> Names = {
114 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
115 "llvm.amdgcn.workitem.id.y"};
// Names associated with the thread-index Z query: the mangled
// __HIP_Coordinates<__HIP_ThreadIdx>::__Z conversion operator and the
// "llvm.amdgcn.workitem.id.z" name.
120 static SmallVector<StringRef> Names = {
121 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
122 "llvm.amdgcn.workitem.id.z"};
126#if LLVM_VERSION_MAJOR >= 18
// Serial code generation: sets up a legacy pass manager that emits an object
// file for module M into a single temporary file ("object-%%%%%%.o") and
// appends that file to ObjectFiles.
// OptLevel is marked [[maybe_unused]]; CodegenOptLevel defaults to 3.
// NOTE(review): the creation of ExpectedTM, the run of the pass manager and the
// return statement are not among the visible lines — presumably ObjectFiles is
// what gets returned; confirm against the full file.
127inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
128codegenSerial(Module &M, StringRef DeviceArch,
129 [[maybe_unused]]
char OptLevel =
'3',
int CodegenOptLevel = 3) {
130 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
// Take ownership of the target machine held by ExpectedTM.
137 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
// Library info is derived from the module's own target triple.
138 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
140 legacy::PassManager PM;
141 PM.add(
new TargetLibraryInfoWrapperPass(TLII));
// NOTE(review): reinterpret_cast is used to view the TargetMachine as an
// LLVMTargetMachine; a static_cast would normally express this downcast —
// confirm the class relationship for the LLVM version in use.
142 MachineModuleInfoWrapperPass *MMIWP =
new MachineModuleInfoWrapperPass(
143 reinterpret_cast<LLVMTargetMachine *
>(TM.get()));
// NOTE(review): ObjectCode/OS are not referenced again in the visible lines;
// the emitted code goes to the temp-file stream below.
145 SmallVector<char, 4096> ObjectCode;
146 raw_svector_ostream OS(ObjectCode);
// Create the temporary output file; each '%' in the model is a placeholder
// replaced to make the name unique.
147 auto ExpectedF = sys::fs::TempFile::create(
"object-%%%%%%.o");
148 if (
auto E = ExpectedF.takeError())
151 auto ObjectFile = std::move(*ExpectedF);
// Wrap the temp file's descriptor in a stream; `false` is the shouldClose
// argument, so the stream does not close the descriptor it writes to.
152 auto FileStream = std::make_unique<CachedFileStream>(
153 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD,
false));
// Schedule object-file emission into that stream.
154 TM->addPassesToEmitFile(PM, *FileStream->OS,
nullptr,
155 CodeGenFileType::ObjectFile,
// Move the temp file onto the heap so it can be stored in the result list.
158 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
159 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
160 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
// Builds the ThinLTO pre-link default pipeline for module M at the
// optimization level chosen from OptLevel.
// NOTE(review): the creation of ExpectedTM, the logic that maps OptLevel to
// O0..O3, the timer T and the place where MPM is run are on lines not shown
// here — confirm against the full file.
167inline void runPreLinkPipeline(Module &M, StringRef DeviceArch,
168 unsigned OptLevel,
unsigned CodegenOptLevel) {
174 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
// New-pass-manager setup: the builder is bound to the target machine, every
// analysis manager is registered with it, and their proxies are cross-linked.
176 PassBuilder PB(TM.get());
177 LoopAnalysisManager LAM;
178 FunctionAnalysisManager FAM;
179 CGSCCAnalysisManager CGAM;
180 ModuleAnalysisManager MAM;
182 PB.registerModuleAnalyses(MAM);
183 PB.registerCGSCCAnalyses(CGAM);
184 PB.registerFunctionAnalyses(FAM);
185 PB.registerLoopAnalyses(LAM);
186 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
188 ModulePassManager MPM;
// OL starts empty and receives exactly one of the levels below.
189 std::optional<OptimizationLevel> OL = std::nullopt;
192 OL = OptimizationLevel::O0;
195 OL = OptimizationLevel::O1;
198 OL = OptimizationLevel::O2;
201 OL = OptimizationLevel::O3;
// Fallback for an unrecognized OptLevel: a default-constructed
// OptimizationLevel, whose speedup level is reported in the message below.
204 OL = OptimizationLevel();
206 <<
"Unknown optlevel " << OptLevel <<
" fallback to default "
207 << OL.value().getSpeedupLevel() <<
"\n";
209 MPM = PB.buildThinLTOPreLinkDefaultPipeline(OL.value());
// Timing output tagged with this function's name.
212 << __FUNCTION__ <<
" " << T.elapsed() <<
" ms\n");
// Parallel code generation: splitCodeGen emits NumShards objects for module M
// into in-memory buffers, and one temporary file named "shard.<I>-%%%%%%%.o"
// is created per shard and appended to ObjectFiles.
// OptLevel is marked [[maybe_unused]]; CodegenOptLevel defaults to 3.
// NOTE(review): the body of TMFactory, the first operand of std::min, the
// write of each buffer into its temp file and the return statement are on
// lines not shown here — confirm against the full file.
215inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
216codegenParallel(Module &M, StringRef DeviceArch,
217 [[maybe_unused]]
char OptLevel =
'3',
int CodegenOptLevel = 3) {
// Factory handed to splitCodeGen; it yields a target machine moved out of
// TMExpected.
218 auto TMFactory = [&]() {
224 return std::move(*TMExpected);
// The shard count is bounded by the heavyweight hardware concurrency.
227 const size_t NumShards = std::min(
230 llvm::heavyweight_hardware_concurrency().compute_thread_count()));
232 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
// One growable in-memory buffer per shard. OwnedObjectsOS owns the streams
// and ObjectsOS holds the raw pointers that splitCodeGen takes.
234 SmallVector<SmallString<0>> Objects{NumShards};
235 SmallVector<std::unique_ptr<raw_svector_ostream>> OwnedObjectsOS;
236 SmallVector<raw_pwrite_stream *> ObjectsOS;
238 for (
size_t I = 0; I < NumShards; ++I) {
239 OwnedObjectsOS.push_back(std::make_unique<raw_svector_ostream>(Objects[I]));
240 ObjectsOS.push_back(OwnedObjectsOS.back().get());
// Emit code for M into the shard streams; the third argument (bitcode output
// streams) is left empty.
243 splitCodeGen(M, ObjectsOS, {}, TMFactory);
// NOTE(review): this loop indexes with `unsigned` while NumShards is size_t
// (the loop above uses size_t).
245 for (
unsigned I = 0; I < NumShards; ++I) {
247 << Objects[I].size() <<
"\n");
249 sys::fs::TempFile::create(
"shard." + std::to_string(I) +
"-%%%%%%%.o");
250 if (
auto E = ExpectedF.takeError())
// Move the temp file onto the heap so it can be stored in the result list.
253 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
254 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
255 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
// ThinLTO-based parallel code generation. Stages visible in this function:
//   1. run the ThinLTO pre-link pipeline on M (runPreLinkPipeline);
//   2. split M into parts and write each part as ThinLTO bitcode into an
//      in-memory buffer;
//   3. register every bitcode buffer, with per-symbol resolutions, in an
//      lto::LTO instance that uses an in-process thin backend;
//   4. run LTO, streaming each task's output into a temporary file named
//      "lto.shard[.<Task>]-%%%%%%%.o".
// OptLevel and CodegenOptLevel both default to 3 and are forwarded to the
// pre-link pipeline and to the LTO configuration.
// NOTE(review): several lines (the first std::min operand, the module-splitting
// call, the definitions of PartIdx/Idx/PB/Conf/T, the error-handling bodies and
// the return statement) are not shown here — confirm against the full file.
261inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
262codegenParallelThinLTO(Module &M, StringRef DeviceArch,
263 unsigned int OptLevel = 3,
int CodegenOptLevel = 3) {
// The shard count is bounded by the heavyweight hardware concurrency.
264 const size_t NumShards = std::min(
267 llvm::heavyweight_hardware_concurrency().compute_thread_count()));
// One in-memory bitcode buffer per shard. OwnedBitcodesOS owns the streams
// and BitcodesOS holds the raw pointers.
269 SmallVector<SmallString<0>> Bitcodes{NumShards};
270 SmallVector<std::unique_ptr<raw_svector_ostream>> OwnedBitcodesOS;
271 SmallVector<raw_pwrite_stream *> BitcodesOS;
272 for (
unsigned int I = 0; I < NumShards; ++I) {
273 OwnedBitcodesOS.push_back(
274 std::make_unique<raw_svector_ostream>(Bitcodes[I]));
275 BitcodesOS.push_back(OwnedBitcodesOS.back().get());
279 runPreLinkPipeline(M, DeviceArch, OptLevel, CodegenOptLevel);
// Per-part callback: (debug builds) verify the part, then write it as
// ThinLTO bitcode to the stream selected by PartIdx.
285 M, BitcodesOS.size(),
286 [&PartIdx, &BitcodesOS](std::unique_ptr<Module> MPart) {
287#if PROTEUS_ENABLE_DEBUG
288 if (verifyModule(*MPart, &errs()))
289 PROTEUS_FATAL_ERROR(
"Broken module found, JIT compilation aborted!");
293 LoopAnalysisManager LAM;
294 FunctionAnalysisManager FAM;
295 CGSCCAnalysisManager CGAM;
296 ModuleAnalysisManager MAM;
298 PB.registerModuleAnalyses(MAM);
299 PB.registerCGSCCAnalyses(CGAM);
300 PB.registerFunctionAnalyses(FAM);
301 PB.registerLoopAnalyses(LAM);
302 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
// The pass manager contains only the ThinLTO bitcode writer; the nullptr is
// the optional second (thin-link) output stream.
304 ModulePassManager MPM;
305 MPM.addPass(ThinLTOBitcodeWriterPass(*BitcodesOS[PartIdx], nullptr));
306 MPM.run(*MPart, MAM);
311 <<
"SplitModule " << T.elapsed() <<
" ms\n");
// Error flag for the LTO stage; presumably set by the diagnostic handler on
// lines not shown — TODO confirm.
313 std::atomic<bool> LTOError =
false;
// Diagnostic handler: renders the diagnostic into ErrStorage and prints it
// to errs() with a prefix chosen by severity.
314 auto DiagnosticHandler = [&](
const DiagnosticInfo &DI) {
315 std::string ErrStorage;
316 raw_string_ostream OS(ErrStorage);
317 DiagnosticPrinterRawOStream DP(OS);
320 switch (DI.getSeverity()) {
322 WithColor::error(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
326 WithColor::warning(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
329 WithColor::note(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
332 WithColor::remark(errs()) << ErrStorage <<
"\n";
// LTO configuration: target CPU is DeviceArch; verification, time tracing
// and pass-manager debugging are switched off.
338 Conf.CPU = DeviceArch;
341 Conf.UseDefaultPipeline =
false;
342 Conf.DisableVerify =
true;
343 Conf.TimeTraceEnabled =
false;
344 Conf.DebugPassManager =
false;
345 Conf.VerifyEach =
false;
346 Conf.DiagHandler = DiagnosticHandler;
347 Conf.OptLevel = OptLevel;
348 Conf.CGOptLevel =
static_cast<CodeGenOptLevel
>(CodegenOptLevel);
// In-process thin backend whose parallelism is set from NumShards.
351 lto::ThinBackend Backend = lto::createInProcessThinBackend(
352 llvm::heavyweight_hardware_concurrency(NumShards));
353 auto LTOBackend = lto::LTO(std::move(Conf), Backend);
// Identifiers keeps the generated buffer names alive through Alloc.
// PrevailingSymbols records names that already have a prevailing definition.
355 BumpPtrAllocator Alloc;
356 StringSaver Identifiers(Alloc);
357 std::set<std::string> PrevailingSymbols;
358 for (
auto &BitcodeInput : Bitcodes) {
359 StringRef Identifier =
360 Identifiers.save((std::to_string(Idx) +
".shard.bc"));
// Wrap the in-memory bitcode (no copy) as an LTO input file.
362 Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
363 llvm::lto::InputFile::create(MemoryBufferRef{
364 StringRef{BitcodeInput.data(), BitcodeInput.size()}, Identifier});
365 if (
auto E = BitcodeFileOrErr.takeError())
// One resolution entry per symbol of this input.
369 const auto Symbols = (*BitcodeFileOrErr)->symbols();
370 SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
371 size_t SymbolIdx = 0;
372 for (
auto &Sym : Symbols) {
373 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
// A defined symbol prevails only the first time its name is seen:
// set::insert(...).second is true only for a new element.
377 Res.Prevailing = !Sym.isUndefined() &&
378 PrevailingSymbols.insert(Sym.getName().str()).second;
// Visible to regular objects when the symbol is not hidden and cannot be
// omitted from the symbol table.
380 Res.VisibleToRegularObj =
382 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
383 !Sym.canBeOmittedFromSymbolTable();
// Same predicate again; its assignment target is on a line not shown
// (presumably Res.ExportDynamic — TODO confirm).
386 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
387 (!Sym.canBeOmittedFromSymbolTable());
// Final in this linkage unit: non-default visibility, defined, not common.
389 Res.FinalDefinitionInLinkageUnit =
390 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
391 (!Sym.isUndefined() && !Sym.isCommon());
394 Res.LinkerRedefined =
false;
// Debug builds only: dump visibility, symbol flags and the resolution.
396#if PROTEUS_ENABLE_DEBUG
397 auto PrintSymbol = [](
const lto::InputFile::Symbol &Sym,
398 lto::SymbolResolution &Res) {
400 OutStream <<
"Vis: ";
401 switch (Sym.getVisibility()) {
402 case GlobalValue::HiddenVisibility:
405 case GlobalValue::ProtectedVisibility:
408 case GlobalValue::DefaultVisibility:
413 OutStream <<
" Sym: ";
// Prints the flag letter C when B is set and '-' otherwise.
414 auto PrintBool = [&](
char C,
bool B) { OutStream << (B ? C :
'-'); };
415 PrintBool(
'U', Sym.isUndefined());
416 PrintBool(
'C', Sym.isCommon());
417 PrintBool(
'W', Sym.isWeak());
418 PrintBool(
'I', Sym.isIndirect());
419 PrintBool(
'O', Sym.canBeOmittedFromSymbolTable());
420 PrintBool(
'T', Sym.isTLS());
421 PrintBool(
'X', Sym.isExecutable());
422 OutStream <<
' ' << Sym.getName();
423 OutStream <<
"| P " << Res.Prevailing;
424 OutStream <<
" V " << Res.VisibleToRegularObj;
425 OutStream <<
" E " << Res.ExportDynamic;
426 OutStream <<
" F " << Res.FinalDefinitionInLinkageUnit;
429 PrintSymbol(Sym, Res);
// Hand the input file and its resolutions over to the LTO instance.
434 if (Error Err = LTOBackend.add(std::move(*BitcodeFileOrErr), Resolutions))
// ObjectFiles is pre-sized with MaxTasks empty slots so that the stream
// callback can store each task's temp file at index Task.
439 size_t MaxTasks = LTOBackend.getMaxTasks();
440 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles{MaxTasks};
// Stream callback: creates the temp file for a task and returns a stream
// bound to its descriptor. Task 0 gets no numeric suffix in the file name.
443 const Twine & ) -> std::unique_ptr<CachedFileStream> {
444 std::string TaskStr = Task ?
"." + std::to_string(Task) :
"";
446 sys::fs::TempFile::create(
"lto.shard" + TaskStr +
"-%%%%%%%.o");
447 if (
auto E = ExpectedF.takeError())
450 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
// `false` is the shouldClose argument: the stream does not close the
// descriptor.
451 auto Ret = std::make_unique<CachedFileStream>(
452 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD,
false));
// Run the LTO pipeline; outputs are delivered through AddStream.
458 if (Error E = LTOBackend.run(AddStream))
// Error object describing failures inside the LTO pipeline; how it is
// consumed is on lines not shown.
463 createStringError(inconvertibleErrorCode(),
464 "Errors encountered inside the LTO pipeline.")));
471 StringRef DeviceArch) {
// Code generation through the HIP runtime-compilation linker: module M is
// serialized to bitcode in memory, added to a hiprtc link state as LLVM
// bitcode, and the binary from hiprtcLinkComplete is returned wrapped in a
// MemoryBuffer.
// NOTE(review): the start of the signature, the declarations of BinOut/BinSize
// and the names of the wrapped hiprtc calls are on lines not shown here.
475 SmallString<4096> ModuleBuf;
476 raw_svector_ostream ModuleBufOS(ModuleBuf);
477 WriteBitcodeToFile(M, ModuleBufOS);
479 hiprtcLinkState HipLinkStatePtr;
// Options forwarded to the IR-to-ISA step. OptArgsSize is 3, so a third
// entry (presumably MArchOpt) is expected on the line not shown — TODO confirm.
486 std::string MArchOpt = (
"-march=" + DeviceArch).str();
487 const char *OptArgs[] = {
"-mllvm",
"-unroll-threshold=1000",
489 std::vector<hiprtcJIT_option> JITOptions = {
490 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
491 size_t OptArgsSize = 3;
// Option values are passed positionally, matching JITOptions: the argument
// array first, then the count encoded directly in the pointer value.
492 const void *JITOptionsValues[] = {(
void *)OptArgs, (
void *)(OptArgsSize)};
494 (
void **)JITOptionsValues,
// Add the serialized module as an LLVM-bitcode input with an empty name and
// no per-input options.
501 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE, (
void *)ModuleBuf.data(),
502 ModuleBuf.size(),
"", 0,
nullptr,
nullptr));
// Finish linking; BinOut/BinSize receive the resulting binary.
504 hiprtcLinkComplete(HipLinkStatePtr, (
void **)&BinOut, &BinSize));
// NOTE(review): getMemBuffer references BinOut without copying and by default
// requires a null terminator after the data — confirm the lifetime of BinOut
// and whether RequiresNullTerminator should be false for this binary blob.
506 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
512 [[maybe_unused]]
size_t GridSize,
// Sets the "amdgpu-flat-work-group-size" function attribute on F to
// "1,<N>", where N is BlockSize capped at 1024.
519 F.addFnAttr(
"amdgpu-flat-work-group-size",
520 "1," + std::to_string(std::min(1024, BlockSize)));
// WavesPerEU only appears in the debug message below, where it is labelled
// as unused.
523 [[maybe_unused]]
int WavesPerEU = 0;
// Debug trace of the launch configuration.
// NOTE(review): the message prints the raw BlockSize, not the value capped at
// 1024 that is written into the attribute.
526 <<
"BlockSize " << BlockSize <<
" GridSize " << GridSize
527 <<
" => Set Workgroup size " << BlockSize
528 <<
" WavesPerEU (unused) " << WavesPerEU <<
"\n");
// Produces the final device binary for module M: generates one or more object
// files with the selected codegen path (codegenSerial, codegenParallel or
// codegenParallelThinLTO), links them into a shared object with ld.lld, reads
// the result back into a MemoryBuffer, and discards the temporary files.
// GlobalLinkedBinaries must be empty for HIP (asserted below).
// NOTE(review): the function name line, the dispatch on the codegen option,
// the RTC branch and the error-handling bodies are on lines not shown here —
// confirm against the full file.
531inline std::unique_ptr<MemoryBuffer>
533 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
535 assert(GlobalLinkedBinaries.empty() &&
536 "Expected empty linked binaries for HIP");
538 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
// Timing output for the RTC path.
543 <<
"Codegen RTC " << T.elapsed() <<
" ms\n");
// The object-file codegen paths are compiled only for LLVM >= 18.
546#if LLVM_VERSION_MAJOR >= 18
548 ObjectFiles = detail::codegenSerial(M, DeviceArch);
551 ObjectFiles = detail::codegenParallel(M, DeviceArch);
554 ObjectFiles = detail::codegenParallelThinLTO(M, DeviceArch);
561 if (ObjectFiles.empty())
565#if LLVM_VERSION_MAJOR >= 18
// Temporary file that receives the linked shared object.
566 auto ExpectedF = sys::fs::TempFile::create(
"proteus-jit-%%%%%%%.o");
567 if (
auto E = ExpectedF.takeError())
571 auto SharedObject = std::move(*ExpectedF);
// Linker command line: ld.lld --no-undefined -shared -o <shared object>,
// followed by every generated object file.
573 std::vector<const char *>
Args{
"ld.lld",
"--no-undefined",
"-shared",
"-o",
574 SharedObject.TmpName.c_str()};
575 for (
auto &File : ObjectFiles) {
578 Args.push_back(File->TmpName.c_str());
581#if PROTEUS_ENABLE_DEBUG
582 for (
auto &Arg :
Args) {
589 <<
"Codegen object " <<
toString(CGOption) <<
"["
590 << ObjectFiles.size() <<
"] " << T.elapsed() <<
" ms\n");
// A function-local static mutex serializes the lld invocation across
// callers of this function.
594 static std::mutex Mutex;
596 std::lock_guard LockGuard{Mutex};
597 lld::Result S = lld::lldMain(
Args, llvm::outs(), llvm::errs(),
598 {{lld::Gnu, &lld::elf::link}});
// Read the linked shared object back into memory.
603 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
604 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
// Remove the temporary object files and the temporary shared object.
609 for (
auto &File : ObjectFiles) {
612 if (
auto E = File->discard())
616 if (
auto E = SharedObject.discard())
621 <<
"Codegen linking " << T.elapsed() <<
" ms\n");
623 return std::move(*Buffer);
char int void ** Args
Definition CompilerInterfaceHost.cpp:20
#define PROTEUS_DBG(x)
Definition Debug.h:10
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:4
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.hpp:57
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
void saveToFile(llvm::StringRef Filepath, T &&Data)
Definition Utils.h:23
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.hpp:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:19
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.hpp:470
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:40
Definition Dispatcher.cpp:14
CodegenOption
Definition Config.hpp:10
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.hpp:155
void setLaunchBoundsForKernel(Module &M, Function &F, size_t, int BlockSize)
Definition CoreLLVMCUDA.hpp:87
std::string toString(CodegenOption Option)
Definition Config.hpp:23