Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMCUDA.h
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_CUDA_H
2#define PROTEUS_CORE_LLVM_CUDA_H
3
9
10#include <llvm/ADT/SmallVector.h>
11#include <llvm/ADT/StringRef.h>
12#include <llvm/CodeGen/MachineModuleInfo.h>
13#include <llvm/IR/LegacyPassManager.h>
14#include <llvm/IR/Module.h>
15#include <llvm/Support/MemoryBufferRef.h>
16#include <llvm/Support/TargetSelect.h>
17#include <llvm/Target/TargetMachine.h>
18
19namespace proteus {
20
21using namespace llvm;
22
23namespace detail {
24
25inline const SmallVector<StringRef> &gridDimXFnName() {
26 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.x"};
27 return Names;
28}
29
30inline const SmallVector<StringRef> &gridDimYFnName() {
31 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.y"};
32 return Names;
33}
34
35inline const SmallVector<StringRef> &gridDimZFnName() {
36 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.z"};
37 return Names;
38}
39
40inline const SmallVector<StringRef> &blockDimXFnName() {
41 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.x"};
42 return Names;
43}
44
45inline const SmallVector<StringRef> &blockDimYFnName() {
46 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.y"};
47 return Names;
48}
49
50inline const SmallVector<StringRef> &blockDimZFnName() {
51 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.z"};
52 return Names;
53}
54
55inline const SmallVector<StringRef> &blockIdxXFnName() {
56 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.x"};
57 return Names;
58}
59
60inline const SmallVector<StringRef> &blockIdxYFnName() {
61 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.y"};
62 return Names;
63}
64
65inline const SmallVector<StringRef> &blockIdxZFnName() {
66 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.z"};
67 return Names;
68}
69
70inline const SmallVector<StringRef> &threadIdxXFnName() {
71 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.x"};
72 return Names;
73}
74
75inline const SmallVector<StringRef> &threadIdxYFnName() {
76 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.y"};
77 return Names;
78}
79
80inline const SmallVector<StringRef> &threadIdxZFnName() {
81 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.z"};
82 return Names;
83}
84
85} // namespace detail
86
87inline void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM,
88 int MinBlocksPerSM = 0) {
89 // Overwrite any existing launch-bounds attributes so specialization remains
90 // the authoritative source.
91#if LLVM_VERSION_MAJOR >= 22
92 // LLVM 22+ lowers CUDA launch bounds from NVVM function attributes rather
93 // than mutating nvvm.annotations metadata in the IR.
94 F.addFnAttr("nvvm.maxntid", std::to_string(std::min(1024, MaxThreadsPerSM)));
95 if (MinBlocksPerSM != 0)
96 F.addFnAttr("nvvm.minctasm", std::to_string(MinBlocksPerSM));
97#else
98 auto *M = F.getParent();
99 NamedMDNode *NvvmAnnotations = M->getNamedMetadata("nvvm.annotations");
100 assert(NvvmAnnotations && "Expected non-null nvvm.annotations metadata");
101 auto *FuncMetadata = ConstantAsMetadata::get(&F);
102
103 auto SetMDNode = [&](const char *MDName, int MDValue) {
104 auto *MDNodeName = MDString::get(M->getContext(), MDName);
105 auto *MDNodeValue = ConstantAsMetadata::get(
106 ConstantInt::get(Type::getInt32Ty(M->getContext()), MDValue));
107
108 for (auto *MetadataNode : NvvmAnnotations->operands()) {
109 if (MetadataNode->getNumOperands() != 3)
110 continue;
111
112 auto *PtrMetadata = MetadataNode->getOperand(0).get();
113 auto *DescMetadata = MetadataNode->getOperand(1).get();
114 if (PtrMetadata == FuncMetadata && MDNodeName == DescMetadata) {
115 MetadataNode->replaceOperandWith(2, MDNodeValue);
116 return;
117 }
118 }
119 Metadata *MDVals[] = {FuncMetadata, MDNodeName, MDNodeValue};
120 NvvmAnnotations->addOperand(MDNode::get(M->getContext(), MDVals));
121 };
122
123 // TODO: fix hardcoded 1024 as the maximum, by reading device
124 // properties.
125 SetMDNode("maxntid", std::min(1024, MaxThreadsPerSM));
126 if (MinBlocksPerSM != 0)
127 SetMDNode("minctasm", MinBlocksPerSM);
128#endif
129}
130
131inline void codegenPTX(Module &M, StringRef DeviceArch,
132 SmallVectorImpl<char> &PTXStr) {
133 TIMESCOPE("proteus::codegenPTX");
134 // TODO: It is possbile to use PTX directly through the CUDA PTX JIT
135 // interface. Maybe useful if we can re-link globals using the CUDA API.
136 // Check this reference for PTX JIT caching:
137 // https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/
138 // Interesting env vars: CUDA_CACHE_DISABLE, CUDA_CACHE_MAXSIZE,
139 // CUDA_CACHE_PATH, CUDA_FORCE_PTX_JIT.
140
141 Timer T(Config::get().ProteusEnableTimers);
142 auto TMExpected = proteus::detail::createTargetMachine(M, DeviceArch);
143 if (!TMExpected)
144 reportFatalError(toString(TMExpected.takeError()));
145
146 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
147 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
148 M.setDataLayout(TM->createDataLayout());
149
150 legacy::PassManager PM;
151 PM.add(new TargetLibraryInfoWrapperPass(TLII));
152 MachineModuleInfoWrapperPass *MMIWP =
153#if LLVM_VERSION_MAJOR >= 20
154 new MachineModuleInfoWrapperPass(TM.get());
155#else
156 new MachineModuleInfoWrapperPass(
157 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
158#endif
159
160 raw_svector_ostream PTXOS(PTXStr);
161#if LLVM_VERSION_MAJOR >= 18
162 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CodeGenFileType::AssemblyFile,
163 /* DisableVerify */ false, MMIWP);
164#else
165 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CGFT_AssemblyFile,
166 /* DisableVerify */ false, MMIWP);
167#endif
168
169 PM.run(M);
170
172 << "Codegen ptx " << T.elapsed() << " ms\n");
173}
174
175inline std::unique_ptr<MemoryBuffer>
176codegenObject(Module &M, StringRef DeviceArch,
177 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
179 TIMESCOPE("proteus::codegenObjectCUDA");
180 if (CGOption != CodegenOption::RTC)
181 reportFatalError("Only RTC compilation is supported for CUDA");
182 SmallVector<char, 4096> PTXStr;
183 size_t BinSize;
184
185 codegenPTX(M, DeviceArch, PTXStr);
186 PTXStr.push_back('\0');
187
188 Timer T(Config::get().ProteusEnableTimers);
189 nvPTXCompilerHandle PTXCompiler;
191 nvPTXCompilerCreate(&PTXCompiler, PTXStr.size(), PTXStr.data()));
192 std::string ArchOpt = ("--gpu-name=" + DeviceArch).str();
193 std::string RDCOption = "";
194 if (!GlobalLinkedBinaries.empty())
195 RDCOption = "-c";
196
198 const char *CompileOptions[] = {ArchOpt.c_str(), "--verbose",
199 RDCOption.c_str()};
200 size_t NumCompileOptions = 2 + (RDCOption.empty() ? 0 : 1);
202 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
203 } else {
204 const char *CompileOptions[] = {ArchOpt.c_str(), RDCOption.c_str()};
205 size_t NumCompileOptions = 1 + (RDCOption.empty() ? 0 : 1);
207 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
208 }
209
211 nvPTXCompilerGetCompiledProgramSize(PTXCompiler, &BinSize));
212 auto ObjBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BinSize);
214 nvPTXCompilerGetCompiledProgram(PTXCompiler, ObjBuf->getBufferStart()));
215
216 if (Config::get().ProteusDebugOutput) {
217 size_t LogSize;
219 nvPTXCompilerGetInfoLogSize(PTXCompiler, &LogSize));
220 auto Log = std::make_unique<char[]>(LogSize);
222 nvPTXCompilerGetInfoLog(PTXCompiler, Log.get()));
223 Logger::logs("proteus") << "=== nvPTXCompiler Log\n" << Log.get() << "\n";
224 }
225
226 proteusNvPTXCompilerErrCheck(nvPTXCompilerDestroy(&PTXCompiler));
227
228 std::unique_ptr<MemoryBuffer> FinalObjBuf;
229 if (!GlobalLinkedBinaries.empty()) {
230 // Retain the primary CUDA context if needed. This is required by threaded
231 // async compilation for ensuring a valid CUDA context is set when linking
232 // with the CUDA API.
233 CUcontext CUCtx;
234 proteusCuErrCheck(cuCtxGetCurrent(&CUCtx));
235 if (!CUCtx) {
236 CUdevice CUDev;
237 proteusCuErrCheck(cuDeviceGet(&CUDev, 0));
238 proteusCuErrCheck(cuDevicePrimaryCtxRetain(&CUCtx, CUDev));
239 proteusCuErrCheck(cuCtxSetCurrent(CUCtx));
240 }
241
242 // TODO: re-implement using the more recent nvJitLink interface.
243 CUlinkState CULinkState;
244 proteusCuErrCheck(cuLinkCreate(0, nullptr, nullptr, &CULinkState));
245 for (auto *Ptr : GlobalLinkedBinaries) {
246 // We do not know the size of the binary but the CUDA API just needs a
247 // non-zero argument.
248 proteusCuErrCheck(cuLinkAddData(CULinkState, CU_JIT_INPUT_FATBINARY, Ptr,
249 1, "", 0, 0, 0));
250 }
251
252 // Again using a non-zero argument, though we can get the size from the ptx
253 // compiler.
254 proteusCuErrCheck(cuLinkAddData(
255 CULinkState, CU_JIT_INPUT_FATBINARY,
256 static_cast<void *>(ObjBuf->getBufferStart()), 1, "", 0, 0, 0));
257
258 void *BinOut;
259 size_t BinSize;
260 proteusCuErrCheck(cuLinkComplete(CULinkState, &BinOut, &BinSize));
261 FinalObjBuf = MemoryBuffer::getMemBufferCopy(
262 StringRef{static_cast<char *>(BinOut), BinSize});
263 } else {
264 FinalObjBuf = std::move(ObjBuf);
265 }
266
268 << "Codegen CUDA RTC " << T.elapsed() << " ms\n");
269 return FinalObjBuf;
270}
271
272} // namespace proteus
273
274#endif
#define PROTEUS_TIMER_OUTPUT(x)
Definition Config.h:440
#define TIMESCOPE(...)
Definition TimeTracing.h:66
#define proteusNvPTXCompilerErrCheck(CALL)
Definition UtilsCUDA.h:39
#define proteusCuErrCheck(CALL)
Definition UtilsCUDA.h:28
static Config & get()
Definition Config.h:334
bool ProteusDebugOutput
Definition Config.h:350
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.h:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.h:19
Definition TimeTracing.h:33
uint64_t elapsed()
Definition TimeTracing.cpp:66
Definition CompiledLibrary.h:7
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.h:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.h:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.h:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.h:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.h:35
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.h:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.h:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.h:57
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.h:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.h:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.h:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.h:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.h:40
Definition MemoryCache.h:27
void codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl< char > &PTXStr)
Definition CoreLLVMCUDA.h:131
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.h:87
void reportFatalError(const llvm::Twine &Reason, const char *FILE, unsigned Line)
Definition Error.cpp:14
CodegenOption
Definition Config.h:16
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.h:176
std::string toString(CodegenOption Option)
Definition Config.h:28