1#ifndef PROTEUS_CORE_LLVM_HIP_H
2#define PROTEUS_CORE_LLVM_HIP_H
12#include <llvm/Bitcode/BitcodeWriter.h>
13#include <llvm/CodeGen/MachineModuleInfo.h>
14#include <llvm/IR/DiagnosticPrinter.h>
15#include <llvm/IR/Function.h>
16#include <llvm/IR/LegacyPassManager.h>
17#include <llvm/IR/Module.h>
18#include <llvm/IR/Verifier.h>
19#include <llvm/LTO/LTO.h>
20#include <llvm/MC/MCSubtargetInfo.h>
21#include <llvm/Support/CodeGen.h>
22#include <llvm/Support/FileSystem.h>
23#include <llvm/Support/MemoryBuffer.h>
24#include <llvm/Support/Path.h>
25#include <llvm/Support/Signals.h>
26#include <llvm/Support/TargetSelect.h>
27#include <llvm/Support/WithColor.h>
28#include <llvm/Target/TargetMachine.h>
30#if LLVM_VERSION_MAJOR >= 18
31#include <lld/Common/Driver.h>
42 static SmallVector<StringRef> Names = {
43 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
44 "llvm.amdgcn.num.workgroups.x",
"_ZL20__hip_get_grid_dim_xv"};
49 static SmallVector<StringRef> Names = {
50 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
51 "llvm.amdgcn.num.workgroups.y",
"_ZL20__hip_get_grid_dim_yv"};
56 static SmallVector<StringRef> Names = {
57 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
58 "llvm.amdgcn.num.workgroups.z",
"_ZL20__hip_get_grid_dim_zv"};
63 static SmallVector<StringRef> Names = {
64 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
65 "llvm.amdgcn.workgroup.size.x",
"_ZL21__hip_get_block_dim_xv"};
70 static SmallVector<StringRef> Names = {
71 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
72 "llvm.amdgcn.workgroup.size.y",
"_ZL21__hip_get_block_dim_yv"};
77 static SmallVector<StringRef> Names = {
78 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
79 "llvm.amdgcn.workgroup.size.z",
"_ZL21__hip_get_block_dim_zv"};
84 static SmallVector<StringRef> Names = {
85 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
86 "llvm.amdgcn.workgroup.id.x"};
91 static SmallVector<StringRef> Names = {
92 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
93 "llvm.amdgcn.workgroup.id.y"};
98 static SmallVector<StringRef> Names = {
99 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
100 "llvm.amdgcn.workgroup.id.z"};
105 static SmallVector<StringRef> Names = {
106 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
107 "llvm.amdgcn.workitem.id.x"};
112 static SmallVector<StringRef> Names = {
113 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
114 "llvm.amdgcn.workitem.id.y"};
119 static SmallVector<StringRef> Names = {
120 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
121 "llvm.amdgcn.workitem.id.z"};
127 SmallString<128> TmpDir;
128 sys::path::system_temp_directory(
true, TmpDir);
130 SmallString<64> FileName;
131 FileName.append(Prefix);
132 FileName.append(Suffix.empty() ?
"-%%%%%%%" :
"-%%%%%%%.");
133 FileName.append(Suffix);
134 sys::path::append(TmpDir, FileName);
135 return sys::fs::TempFile::create(TmpDir);
138#if LLVM_VERSION_MAJOR >= 18
139inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
140codegenSerial(Module &M, StringRef DeviceArch,
141 [[maybe_unused]]
char OptLevel =
'3',
int CodegenOptLevel = 3) {
142 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
149 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
150 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
151 M.setDataLayout(TM->createDataLayout());
153 legacy::PassManager PM;
154 PM.add(
new TargetLibraryInfoWrapperPass(TLII));
155 MachineModuleInfoWrapperPass *MMIWP =
156#if LLVM_VERSION_MAJOR >= 20
157 new MachineModuleInfoWrapperPass(TM.get());
159 new MachineModuleInfoWrapperPass(
160 reinterpret_cast<LLVMTargetMachine *
>(TM.get()));
163 SmallVector<char, 4096> ObjectCode;
164 raw_svector_ostream OS(ObjectCode);
166 if (
auto E = ExpectedF.takeError())
169 auto ObjectFile = std::move(*ExpectedF);
170 auto FileStream = std::make_unique<CachedFileStream>(
171 std::make_unique<llvm::raw_fd_ostream>(ObjectFile.FD,
false));
172 TM->addPassesToEmitFile(PM, *FileStream->OS,
nullptr,
173 CodeGenFileType::ObjectFile,
176 std::unique_ptr<sys::fs::TempFile> ObjectFilePtr =
177 std::make_unique<sys::fs::TempFile>(std::move(ObjectFile));
178 ObjectFiles.emplace_back(std::move(ObjectFilePtr));
185inline SmallVector<std::unique_ptr<sys::fs::TempFile>>
186codegenParallel(Module &M, StringRef DeviceArch,
unsigned int OptLevel = 3,
187 int CodegenOptLevel = 3) {
189 std::atomic<bool> LTOError =
false;
191 auto DiagnosticHandler = [&](
const DiagnosticInfo &DI) {
192 std::string ErrStorage;
193 raw_string_ostream OS(ErrStorage);
194 DiagnosticPrinterRawOStream DP(OS);
197 switch (DI.getSeverity()) {
199 WithColor::error(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
203 WithColor::warning(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
206 WithColor::note(errs(),
"[proteus codegen]") << ErrStorage <<
"\n";
209 WithColor::remark(errs()) << ErrStorage <<
"\n";
219 std::unique_ptr<TargetMachine> TM = std::move(*ExpectedTM);
222 Conf.CPU = DeviceArch;
225 std::string FeatureStr = TM->getMCSubtargetInfo()->getFeatureString().str();
226 if (!FeatureStr.empty()) {
227 SmallVector<StringRef> Features;
228 StringRef(FeatureStr).split(Features,
',');
229 for (
auto &F : Features)
230 Conf.MAttrs.push_back(F.str());
236 Conf.Options = TM->Options;
238 Conf.DisableVerify =
true;
239 Conf.TimeTraceEnabled =
false;
240 Conf.DebugPassManager =
false;
241 Conf.VerifyEach =
false;
242 Conf.DiagHandler = DiagnosticHandler;
243 Conf.OptLevel = OptLevel;
244 Conf.CGOptLevel =
static_cast<CodeGenOptLevel
>(CodegenOptLevel);
246 unsigned ParallelCodeGenParallelismLevel =
247 std::max(1u, std::thread::hardware_concurrency());
248 lto::LTO L(std::move(Conf), {}, ParallelCodeGenParallelismLevel);
251 M.setDataLayout(TM->createDataLayout());
253 SmallString<0> BitcodeBuf;
254 raw_svector_ostream BitcodeOS(BitcodeBuf);
255 WriteBitcodeToFile(M, BitcodeOS);
260 auto IF = cantFail(lto::InputFile::create(
261 MemoryBufferRef{BitcodeBuf, M.getModuleIdentifier()}));
263 std::set<std::string> PrevailingSymbols;
264 auto BuildResolutions = [&]() {
266 const auto Symbols =
IF->symbols();
267 SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
268 size_t SymbolIdx = 0;
269 for (
auto &Sym : Symbols) {
270 lto::SymbolResolution &Res = Resolutions[SymbolIdx];
274 Res.Prevailing = !Sym.isUndefined() &&
275 PrevailingSymbols.insert(Sym.getName().str()).second;
277 Res.VisibleToRegularObj =
279 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
280 !Sym.canBeOmittedFromSymbolTable();
283 Sym.getVisibility() != GlobalValue::HiddenVisibility &&
284 (!Sym.canBeOmittedFromSymbolTable());
286 Res.FinalDefinitionInLinkageUnit =
287 Sym.getVisibility() != GlobalValue::DefaultVisibility &&
288 (!Sym.isUndefined() && !Sym.isCommon());
292 Res.LinkerRedefined =
false;
295 auto PrintSymbol = [](
const lto::InputFile::Symbol &Sym,
296 lto::SymbolResolution &Res) {
298 OutStream <<
"Vis: ";
299 switch (Sym.getVisibility()) {
300 case GlobalValue::HiddenVisibility:
303 case GlobalValue::ProtectedVisibility:
306 case GlobalValue::DefaultVisibility:
311 OutStream <<
" Sym: ";
312 auto PrintBool = [&](
char C,
bool B) { OutStream << (B ? C :
'-'); };
313 PrintBool(
'U', Sym.isUndefined());
314 PrintBool(
'C', Sym.isCommon());
315 PrintBool(
'W', Sym.isWeak());
316 PrintBool(
'I', Sym.isIndirect());
317 PrintBool(
'O', Sym.canBeOmittedFromSymbolTable());
318 PrintBool(
'T', Sym.isTLS());
319 PrintBool(
'X', Sym.isExecutable());
320 OutStream <<
' ' << Sym.getName();
321 OutStream <<
"| P " << Res.Prevailing;
322 OutStream <<
" V " << Res.VisibleToRegularObj;
323 OutStream <<
" E " << Res.ExportDynamic;
324 OutStream <<
" F " << Res.FinalDefinitionInLinkageUnit;
328 PrintSymbol(Sym, Res);
333 cantFail(L.add(std::move(
IF), Resolutions));
339 size_t MaxTasks = L.getMaxTasks();
340 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles{MaxTasks};
344 const Twine & ) -> std::unique_ptr<CachedFileStream> {
345 std::string TaskStr = Task ?
"." + std::to_string(Task) :
"";
347 if (
auto E = ExpectedF.takeError())
350 std::make_unique<sys::fs::TempFile>(std::move(*ExpectedF));
351 auto Ret = std::make_unique<CachedFileStream>(
352 std::make_unique<llvm::raw_fd_ostream>(ObjectFiles[Task]->FD,
false));
358 if (Error E = L.run(AddStream))
363 createStringError(inconvertibleErrorCode(),
364 "Errors encountered inside the LTO pipeline.")));
371 StringRef DeviceArch) {
375 SmallString<4096> ModuleBuf;
376 raw_svector_ostream ModuleBufOS(ModuleBuf);
377 WriteBitcodeToFile(M, ModuleBufOS);
379 hiprtcLinkState HipLinkStatePtr;
386 std::string MArchOpt = (
"-march=" + DeviceArch).str();
391 const char *OptArgs[] = {MArchOpt.c_str()};
392 std::vector<hiprtcJIT_option> JITOptions = {
393 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
394 size_t OptArgsSize = 1;
395 const void *JITOptionsValues[] = {(
void *)OptArgs, (
void *)(OptArgsSize)};
397 (
void **)JITOptionsValues,
404 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE, (
void *)ModuleBuf.data(),
405 ModuleBuf.size(),
"", 0,
nullptr,
nullptr));
407 hiprtcLinkComplete(HipLinkStatePtr, (
void **)&BinOut, &BinSize));
409 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
415 int MinBlocksPerSM = 0) {
419 F.addFnAttr(
"amdgpu-flat-work-group-size",
420 "1," + std::to_string(std::min(1024, MaxNumWorkGroups)));
422 if (MinBlocksPerSM != 0) {
428 F.addFnAttr(
"amdgpu-waves-per-eu", std::to_string(MinBlocksPerSM) +
"," +
429 std::to_string(MinBlocksPerSM));
433 <<
" => Set Workgroup size " << MaxNumWorkGroups
434 <<
" WavesPerEU (unused) " << MinBlocksPerSM <<
"\n");
437inline std::unique_ptr<MemoryBuffer>
439 [[maybe_unused]] SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
441 assert(GlobalLinkedBinaries.empty() &&
442 "Expected empty linked binaries for HIP");
444 SmallVector<std::unique_ptr<sys::fs::TempFile>> ObjectFiles;
449 <<
"Codegen RTC " << T.
elapsed() <<
" ms\n");
452#if LLVM_VERSION_MAJOR >= 18
454 ObjectFiles = detail::codegenSerial(M, DeviceArch);
457 ObjectFiles = detail::codegenParallel(M, DeviceArch);
464 if (ObjectFiles.empty())
467#if LLVM_VERSION_MAJOR >= 18
469 if (
auto E = ExpectedF.takeError())
473 auto SharedObject = std::move(*ExpectedF);
475 std::vector<const char *>
Args{
"ld.lld",
"--no-undefined",
"-shared",
"-o",
476 SharedObject.TmpName.c_str()};
477 for (
auto &File : ObjectFiles) {
480 Args.push_back(File->TmpName.c_str());
484 for (
auto &Arg :
Args) {
491 <<
"Codegen object " <<
toString(CGOption) <<
"["
492 << ObjectFiles.size() <<
"] " << T.
elapsed() <<
" ms\n");
496 static std::mutex Mutex;
498 std::lock_guard LockGuard{Mutex};
499 lld::Result S = lld::lldMain(
Args, llvm::outs(), llvm::errs(),
500 {{lld::Gnu, &lld::elf::link}});
505 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
506 MemoryBuffer::getFileAsStream(SharedObject.TmpName);
511 for (
auto &File : ObjectFiles) {
514 if (
auto E = File->discard())
518 if (
auto E = SharedObject.discard())
523 <<
"Codegen linking " << T.
elapsed() <<
" ms\n");
525 return std::move(*Buffer);
char int void ** Args
Definition CompilerInterfaceHost.cpp:22
#define PROTEUS_DBG(x)
Definition Debug.h:9
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.h:54
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
static Config & get()
Definition Config.h:334
bool ProteusDebugOutput
Definition Config.h:350
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.h:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.h:19
Definition TimeTracing.h:40
void reset()
Definition TimeTracing.cpp:57
uint64_t elapsed()
Definition TimeTracing.cpp:51
Definition CompiledLibrary.h:7
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.h:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.h:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.h:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.h:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.h:35
std::unique_ptr< MemoryBuffer > codegenRTC(Module &M, StringRef DeviceArch)
Definition CoreLLVMHIP.h:370
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.h:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.h:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.h:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.h:75
Expected< sys::fs::TempFile > createTempFile(StringRef Prefix, StringRef Suffix)
Definition CoreLLVMHIP.h:125
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.h:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.h:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.h:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.h:40
Definition MemoryCache.h:26
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.h:87
void reportFatalError(const llvm::Twine &Reason, const char *FILE, unsigned Line)
Definition Error.cpp:14
CodegenOption
Definition Config.h:16
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.h:165
std::string toString(CodegenOption Option)
Definition Config.h:28