1#ifndef PROTEUS_CORE_LLVM_HIP_H
2#define PROTEUS_CORE_LLVM_HIP_H
12#include <llvm/Bitcode/BitcodeWriter.h>
13#include <llvm/CodeGen/MachineModuleInfo.h>
14#include <llvm/IR/DiagnosticPrinter.h>
15#include <llvm/IR/Function.h>
16#include <llvm/IR/LegacyPassManager.h>
17#include <llvm/IR/Module.h>
18#include <llvm/IR/Verifier.h>
19#include <llvm/LTO/LTO.h>
20#include <llvm/MC/MCSubtargetInfo.h>
21#include <llvm/Support/CodeGen.h>
22#include <llvm/Support/FileSystem.h>
23#include <llvm/Support/MemoryBuffer.h>
24#include <llvm/Support/Path.h>
25#include <llvm/Support/Signals.h>
26#include <llvm/Support/TargetSelect.h>
27#include <llvm/Support/WithColor.h>
28#include <llvm/Target/TargetMachine.h>
32#if LLVM_VERSION_MAJOR >= 18
33#include <lld/Common/Driver.h>
44 static SmallVector<StringRef> Names = {
45 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
46 "llvm.amdgcn.num.workgroups.x",
"_ZL20__hip_get_grid_dim_xv"};
51 static SmallVector<StringRef> Names = {
52 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
53 "llvm.amdgcn.num.workgroups.y",
"_ZL20__hip_get_grid_dim_yv"};
58 static SmallVector<StringRef> Names = {
59 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
60 "llvm.amdgcn.num.workgroups.z",
"_ZL20__hip_get_grid_dim_zv"};
65 static SmallVector<StringRef> Names = {
66 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
67 "llvm.amdgcn.workgroup.size.x",
"_ZL21__hip_get_block_dim_xv"};
72 static SmallVector<StringRef> Names = {
73 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
74 "llvm.amdgcn.workgroup.size.y",
"_ZL21__hip_get_block_dim_yv"};
79 static SmallVector<StringRef> Names = {
80 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
81 "llvm.amdgcn.workgroup.size.z",
"_ZL21__hip_get_block_dim_zv"};
86 static SmallVector<StringRef> Names = {
87 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
88 "llvm.amdgcn.workgroup.id.x"};
93 static SmallVector<StringRef> Names = {
94 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
95 "llvm.amdgcn.workgroup.id.y"};
100 static SmallVector<StringRef> Names = {
101 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
102 "llvm.amdgcn.workgroup.id.z"};
107 static SmallVector<StringRef> Names = {
108 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
109 "llvm.amdgcn.workitem.id.x"};
114 static SmallVector<StringRef> Names = {
115 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
116 "llvm.amdgcn.workitem.id.y"};
121 static SmallVector<StringRef> Names = {
122 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
123 "llvm.amdgcn.workitem.id.z"};
129 SmallString<128> TmpDir;
130 sys::path::system_temp_directory(
true, TmpDir);
132 SmallString<64> FileName;
133 FileName.append(Prefix);
134 FileName.append(Suffix.empty() ?
"-%%%%%%%" :
"-%%%%%%%.");
135 FileName.append(Suffix);
136 sys::path::append(TmpDir, FileName);
137 return sys::fs::TempFile::create(TmpDir);
140#if LLVM_VERSION_MAJOR >= 18
141inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
142codegenSerial(Module &M, StringRef DeviceArch,
143 [[maybe_unused]]
char OptLevel =
'3',
int CodegenOptLevel = 3) {
145 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
152 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
153 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
154 M.setDataLayout(TM->createDataLayout());
156 legacy::PassManager PM;
157 PM.add(
new TargetLibraryInfoWrapperPass(TLII));
158 MachineModuleInfoWrapperPass *MMIWP =
159#if LLVM_VERSION_MAJOR >= 20
160 new MachineModuleInfoWrapperPass(TM.get());
162 new MachineModuleInfoWrapperPass(
163 reinterpret_cast<LLVMTargetMachine *
>(TM.get()));
166 SmallVector<char, 4096> ObjectCode;
167 raw_svector_ostream OS(ObjectCode);
169 if (
auto E = ExpectedF.takeError())
172 auto ObjectFile = std::move(*ExpectedF);
173 auto FileStream = std::make_unique<CachedFileStream>(
174 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD,
false));
175 TM->addPassesToEmitFile(PM, *FileStream->OS,
nullptr,
176 CodeGenFileType::ObjectFile,
179 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
180 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
181 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
184 if (Error E = FileStream->commit())
191inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
192codegenParallel(Module &M, StringRef DeviceArch,
197 std::atomic<bool> LTOError =
false;
199 auto DiagnosticHandler = [&](
const DiagnosticInfo &DI) {
200 std::string ErrStorage;
201 raw_string_ostream OS(ErrStorage);
202 DiagnosticPrinterRawOStream DP(OS);
205 switch (DI.getSeverity()) {
207 WithColor::error(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
211 WithColor::warning(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
214 WithColor::note(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
217 WithColor::remark(errs()) << ErrStorage <<
"\n";
224 M, DeviceArch, OptConfig.CodegenOptLevel);
227 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
230 Conf.CPU = DeviceArch;
233 std::string FeatureStr = TM->getMCSubtargetInfo()->getFeatureString().str();
234 if (!FeatureStr.empty()) {
235 SmallVector<StringRef> Features;
236 StringRef(FeatureStr).split(Features,
',');
237 for (
auto &F : Features)
238 Conf.MAttrs.push_back(F.str());
244 Conf.Options = TM->Options;
246 Conf.DisableVerify =
true;
247 Conf.TimeTraceEnabled =
false;
248 Conf.DebugPassManager =
false;
249 Conf.VerifyEach =
false;
250 Conf.DiagHandler = DiagnosticHandler;
251 Conf.OptLevel = OptConfig.OptLevel;
254 if (OptConfig.PassPipeline)
255 Conf.OptPipeline = OptConfig.PassPipeline.value();
256 Conf.CGOptLevel =
static_cast<CodeGenOptLevel
>(OptConfig.CodegenOptLevel);
258 unsigned ParallelCodeGenParallelismLevel =
259 std::max(1u, std::thread::hardware_concurrency());
260 lto::LTO L(std::move(Conf), {}, ParallelCodeGenParallelismLevel);
263 M.setDataLayout(TM->createDataLayout());
265 SmallString<0> BitcodeBuf;
266 raw_svector_ostream BitcodeOS(BitcodeBuf);
267 WriteBitcodeToFile(M, BitcodeOS);
272 auto IF = cantFail(lto::InputFile::create(
273 MemoryBufferRef{BitcodeBuf, M.getModuleIdentifier()}));
275 std::set<std::string> PrevailingSymbols;
276 auto BuildResolutions = [&]() {
278 const auto Symbols =
IF->symbols();
279 SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
280 size_t SymbolIdx = 0;
281 for (
auto &Sym : Symbols) {
282 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
286 Res.Prevailing = !Sym.isUndefined() &&
287 PrevailingSymbols.insert(Sym.getName().str()).second;
289 Res.VisibleToRegularObj =
291 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
292 !Sym.canBeOmittedFromSymbolTable();
295 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
296 (!Sym.canBeOmittedFromSymbolTable());
298 Res.FinalDefinitionInLinkageUnit =
299 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
300 (!Sym.isUndefined() && !Sym.isCommon());
304 Res.LinkerRedefined =
false;
307 auto PrintSymbol = [](
const lto::InputFile::Symbol &Sym,
308 lto::SymbolResolution &Res) {
310 OutStream <<
"Vis: ";
311 switch (Sym.getVisibility()) {
312 case GlobalValue::HiddenVisibility:
315 case GlobalValue::ProtectedVisibility:
318 case GlobalValue::DefaultVisibility:
323 OutStream <<
" Sym: ";
324 auto PrintBool = [&](
char C,
bool B) { OutStream << (B ? C :
'-'); };
325 PrintBool(
'U', Sym.isUndefined());
326 PrintBool(
'C', Sym.isCommon());
327 PrintBool(
'W', Sym.isWeak());
328 PrintBool(
'I', Sym.isIndirect());
329 PrintBool(
'O', Sym.canBeOmittedFromSymbolTable());
330 PrintBool(
'T', Sym.isTLS());
331 PrintBool(
'X', Sym.isExecutable());
332 OutStream <<
' ' << Sym.getName();
333 OutStream <<
"| P " << Res.Prevailing;
334 OutStream <<
" V " << Res.VisibleToRegularObj;
335 OutStream <<
" E " << Res.ExportDynamic;
336 OutStream <<
" F " << Res.FinalDefinitionInLinkageUnit;
340 PrintSymbol(Sym, Res);
345 cantFail(L.add(std::move(
IF), Resolutions));
351 size_t MaxTasks = L.getMaxTasks();
352 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles{MaxTasks};
356 const Twine & ) -> std::unique_ptr<CachedFileStream> {
357 std::string TaskStr = Task ?
"." + std::to_string(Task) :
"";
359 if (
auto E = ExpectedF.takeError())
362 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
363 auto Ret = std::make_unique<CachedFileStream>(
364 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD,
false));
370 if (Error E = L.run(AddStream))
375 createStringError(inconvertibleErrorCode(),
376 "Errors encountered inside the LTO pipeline.")));
383 StringRef DeviceArch) {
388 SmallString<4096> ModuleBuf;
389 raw_svector_ostream ModuleBufOS(ModuleBuf);
390 WriteBitcodeToFile(M, ModuleBufOS);
392 hiprtcLinkState HipLinkStatePtr;
399 std::string MArchOpt = (
"-march=" + DeviceArch).str();
404 const char *OptArgs[] = {MArchOpt.c_str()};
405 std::vector<hiprtcJIT_option> JITOptions = {
406 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
407 size_t OptArgsSize = 1;
408 const void *JITOptionsValues[] = {(
void *)OptArgs, (
void *)(OptArgsSize)};
410 (
void **)JITOptionsValues,
417 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE, (
void *)ModuleBuf.data(),
418 ModuleBuf.size(),
"", 0,
nullptr,
nullptr));
420 hiprtcLinkComplete(HipLinkStatePtr, (
void **)&BinOut, &BinSize));
422 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
428 int MinBlocksPerSM = 0) {
432 F.addFnAttr(
"amdgpu-flat-work-group-size",
433 "1," + std::to_string(std::min(1024, MaxNumWorkGroups)));
435 if (MinBlocksPerSM != 0) {
441 F.addFnAttr(
"amdgpu-waves-per-eu", std::to_string(MinBlocksPerSM) +
"," +
442 std::to_string(MinBlocksPerSM));
446 <<
" => Set Workgroup size " << MaxNumWorkGroups
447 <<
" WavesPerEU (unused) " << MinBlocksPerSM <<
"\n");
450inline std::unique_ptr<MemoryBuffer>
452 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
457 assert(GlobalLinkedBinaries.empty() &&
458 "Expected empty linked binaries for HIP");
460 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
465 <<
"Codegen RTC " << T.
elapsed() <<
" ms\n");
468#if LLVM_VERSION_MAJOR >= 18
470 ObjectFiles = detail::codegenSerial(M, DeviceArch);
473 ObjectFiles = detail::codegenParallel(M, DeviceArch, OptConfig);
480 if (ObjectFiles.empty())
483#if LLVM_VERSION_MAJOR >= 18
485 if (
auto E = ExpectedF.takeError())
489 auto SharedObject = std::move(*ExpectedF);
491 std::vector<const char *>
Args{
"ld.lld",
"--no-undefined",
"-shared",
"-o",
492 SharedObject.TmpName.c_str()};
493 for (
auto &File : ObjectFiles) {
496 Args.push_back(File->TmpName.c_str());
500 for (
auto &Arg :
Args) {
507 <<
"Codegen object " <<
toString(CGOption) <<
"["
508 << ObjectFiles.size() <<
"] " << T.
elapsed() <<
" ms\n");
512 static std::mutex Mutex;
514 std::lock_guard LockGuard{Mutex};
515 lld::Result S = lld::lldMain(
Args, llvm::outs(), llvm::errs(),
516 {{lld::Gnu, &lld::elf::link}});
521 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
522 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
527 for (
auto &File : ObjectFiles) {
530 if (
auto E = File->discard())
534 if (
auto E = SharedObject.discard())
539 <<
"Codegen linking " << T.
elapsed() <<
" ms\n");
541 return std::move(*Buffer);
char int void ** Args
Definition CompilerInterfaceHost.cpp:23
#define PROTEUS_TIMER_OUTPUT(x)
Definition Config.h:440
#define PROTEUS_DBG(x)
Definition Debug.h:9
#define TIMESCOPE(...)
Definition TimeTracing.h:66
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
static Config & get()
Definition Config.h:334
bool ProteusDebugOutput
Definition Config.h:350
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.h:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.h:19
Definition TimeTracing.h:33
void reset()
Definition TimeTracing.cpp:68
uint64_t elapsed()
Definition TimeTracing.cpp:66
Definition CompiledLibrary.h:7
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.h:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.h:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.h:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.h:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.h:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.h:382
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.h:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.h:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.h:57
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.h:75
Expected< sys::fs::TempFile > createTempFile(StringRef Prefix, StringRef Suffix)
Definition CoreLLVMHIP.h:127
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.h:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.h:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.h:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.h:40
Definition MemoryCache.h:27
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.h:87
void reportFatalError(const llvm::Twine &Reason, const char *FILE, unsigned Line)
Definition Error.cpp:14
CodegenOption
Definition Config.h:16
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.h:176
std::string toString(CodegenOption Option)
Definition Config.h:28
Definition CoreLLVM.h:187