Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMCUDA.hpp
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_CUDA_HPP
2#define PROTEUS_CORE_LLVM_CUDA_HPP
3
4#include <llvm/ADT/SmallVector.h>
5#include <llvm/ADT/StringRef.h>
6#include <llvm/CodeGen/MachineModuleInfo.h>
7#include <llvm/IR/LegacyPassManager.h>
8#include <llvm/IR/Module.h>
9#include <llvm/Support/MemoryBufferRef.h>
10#include <llvm/Support/TargetSelect.h>
11#include <llvm/Target/TargetMachine.h>
12
13#include "proteus/CoreLLVM.hpp"
14#include "proteus/Debug.h"
15#include "proteus/Logger.hpp"
17#include "proteus/UtilsCUDA.h"
18
19namespace proteus {
20
21using namespace llvm;
22
// Accessors for the NVVM special-register intrinsic names used by Proteus.
// NOTE: this listing is an extraction of a Doxygen page; the declaration line
// of every accessor (listing lines 25, 30, ..., 80) is missing here. The
// function names in the comments below come from the symbol index at the end
// of this page, which gives each accessor's return type as
// `const SmallVector<StringRef> &`.
// Each body builds a function-local static list holding a single name string
// and returns it.
23namespace detail {
24
// gridDimXFnName(): name for the grid dimension in x ("nctaid.x").
26 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.x"};
27 return Names;
28}
29
// gridDimYFnName(): name for the grid dimension in y ("nctaid.y").
31 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.y"};
32 return Names;
33}
34
// gridDimZFnName(): name for the grid dimension in z ("nctaid.z").
36 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.z"};
37 return Names;
38}
39
// blockDimXFnName(): name for the block dimension in x ("ntid.x").
41 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.x"};
42 return Names;
43}
44
// blockDimYFnName(): name for the block dimension in y ("ntid.y").
46 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.y"};
47 return Names;
48}
49
// blockDimZFnName(): name for the block dimension in z ("ntid.z").
51 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.z"};
52 return Names;
53}
54
// blockIdxXFnName(): name for the block index in x ("ctaid.x").
56 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.x"};
57 return Names;
58}
59
// blockIdxYFnName(): name for the block index in y ("ctaid.y").
61 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.y"};
62 return Names;
63}
64
// blockIdxZFnName(): name for the block index in z ("ctaid.z").
66 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.z"};
67 return Names;
68}
69
// threadIdxXFnName(): name for the thread index in x ("tid.x").
71 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.x"};
72 return Names;
73}
74
// threadIdxYFnName(): name for the thread index in y ("tid.y").
76 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.y"};
77 return Names;
78}
79
// threadIdxZFnName(): name for the thread index in z ("tid.z").
81 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.z"};
82 return Names;
83}
84
85} // namespace detail
86
// setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM,
//                          int MinBlocksPerSM = 0)
// (signature per the symbol index on this page; its first line, listing line
// 87, is missing from this extraction).
//
// Records launch-bound entries for kernel F in the "nvvm.annotations" named
// metadata of F's parent module: always "maxntid", and "minctasm" only when
// MinBlocksPerSM is non-zero.
//
// NOTE(review): the parameter is named MaxThreadsPerSM but its value is
// stored under the "maxntid" key — confirm the intended unit against callers.
88 int MinBlocksPerSM = 0) {
89 auto *M = F.getParent();
90 NamedMDNode *NvvmAnnotations = M->getNamedMetadata("nvvm.annotations");
// NOTE(review): the null check is assert-only; if asserts are compiled out, a
// module without "nvvm.annotations" reaches NvvmAnnotations->operands() below
// with a null pointer.
91 assert(NvvmAnnotations && "Expected non-null nvvm.annotations metadata");
92 auto *FuncMetadata = ConstantAsMetadata::get(&F);
93
// Helper: store the i32 value MDValue under key MDName. It scans the existing
// three-operand annotation nodes and may overwrite operand 2 of one of them;
// if the loop finishes without returning, a new node is appended.
94 auto SetMDNode = [&](const char *MDName, int MDValue) {
95 auto *MDNodeName = MDString::get(M->getContext(), MDName);
96 auto *MDNodeValue = ConstantAsMetadata::get(
97 ConstantInt::get(Type::getInt32Ty(M->getContext()), MDValue));
98
99 for (auto *MetadataNode : NvvmAnnotations->operands()) {
// Only nodes with exactly three operands are considered.
100 if (MetadataNode->getNumOperands() != 3)
101 continue;
102
103 auto *PtrMetadata = MetadataNode->getOperand(0).get();
104 auto *DescMetadata = MetadataNode->getOperand(1).get();
// Listing line 105 (the condition guarding the replacement) is missing from
// this extraction; presumably it matches PtrMetadata/DescMetadata against
// FuncMetadata/MDNodeName — TODO confirm against the real header.
106 MetadataNode->replaceOperandWith(2, MDNodeValue);
107 return;
108 }
109 }
// Listing line 110 (the definition of MDVals) is missing from this extraction.
111 NvvmAnnotations->addOperand(MDNode::get(M->getContext(), MDVals));
112 };
113
114 // TODO: fix hardcoded 1024 as the maximum, by reading device
115 // properties.
// The requested value is capped at 1024 before being written.
116 SetMDNode("maxntid", std::min(1024, MaxThreadsPerSM));
// A MinBlocksPerSM of 0 (the default) leaves "minctasm" untouched.
117 if (MinBlocksPerSM != 0)
118 SetMDNode("minctasm", MinBlocksPerSM);
119}
120
// codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl<char> &PTXStr)
// (full signature per the symbol index on this page; listing line 122 is
// missing from this extraction).
//
// Runs a legacy pass manager over M that emits an assembly file (PTX) for
// DeviceArch. As a side effect, M's data layout is overwritten with the one
// from the created target machine.
//
// Listing lines 131, 133, 140-141, 143, 145, 149 and 160 are also missing
// here, so the definitions of TMExpected, MMIWP and PTXOS are not visible.
// Presumably PTXOS is a stream that writes into PTXStr — TODO confirm.
121inline void codegenPTX(Module &M, StringRef DeviceArch,
123 // TODO: It is possible to use PTX directly through the CUDA PTX JIT
124 // interface. Maybe useful if we can re-link globals using the CUDA API.
125 // Check this reference for PTX JIT caching:
126 // https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/
127 // Interesting env vars: CUDA_CACHE_DISABLE, CUDA_CACHE_MAXSIZE,
128 // CUDA_CACHE_PATH, CUDA_FORCE_PTX_JIT.
129
// T measures the whole function; its elapsed time is reported at the end.
130 Timer T;
// The statement executed when target-machine creation fails (listing line
// 133) is missing from this extraction.
132 if (!TMExpected)
134
135 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
// NOTE(review): no use of TLII is visible in this listing (lines 140-141 are
// missing) — confirm it is actually consumed.
136 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
137 M.setDataLayout(TM->createDataLayout());
138
139 legacy::PassManager PM;
// The MMIWP object is constructed differently for LLVM >= 20 and for older
// versions; most of both branches is missing from this extraction.
142#if LLVM_VERSION_MAJOR >= 20
144#else
146 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
147#endif
148
// The file-type enumerator is spelled differently from LLVM 18 onwards; both
// branches request assembly output with the verifier left enabled.
150#if LLVM_VERSION_MAJOR >= 18
151 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CodeGenFileType::AssemblyFile,
152 /* DisableVerify */ false, MMIWP);
153#else
154 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CGFT_AssemblyFile,
155 /* DisableVerify */ false, MMIWP);
156#endif
157
158 PM.run(M);
159
// Tail of a timing report; its opening line (listing line 160) is missing.
161 << "Codegen ptx " << T.elapsed() << " ms\n");
162}
163
// codegenObject(Module &M, StringRef DeviceArch,
//               SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
//               CodegenOption CGOption = CodegenOption::RTC)
// (full signature per the symbol index on this page; listing lines 165 and
// 167 are missing from this extraction).
//
// Generates PTX for M via codegenPTX, compiles it with the nvPTXCompiler API
// and returns the resulting binary in a MemoryBuffer. When
// GlobalLinkedBinaries is non-empty, the PTX is compiled with "-c" and the
// result is linked with those binaries through the CUDA driver linker
// (cuLink*) before being returned.
//
// Any CGOption other than CodegenOption::RTC is reported through
// PROTEUS_FATAL_ERROR.
//
// Many interior lines are missing from this extraction (170, 177-178, 185,
// 189-190, 194-195, 198-199, 201-202, 206-207, 209-210, 214, 220-221,
// 223-226, 228-229, 233, 238, 244-245, 250, 257). Comments below describe
// only what is visible.
164inline std::unique_ptr<MemoryBuffer>
166 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
168 if (CGOption != CodegenOption::RTC)
169 PROTEUS_FATAL_ERROR("Only RTC compilation is supported for CUDA");
// The declaration of PTXStr (listing line 170) is missing from this listing.
171 size_t BinSize;
172
173 codegenPTX(M, DeviceArch, PTXStr);
// Append a NUL so that the size and data passed to nvPTXCompilerCreate below
// include a terminator.
174 PTXStr.push_back('\0');
175
// T times everything after PTX generation; reported at the end.
176 Timer T;
179 nvPTXCompilerCreate(&PTXCompiler, PTXStr.size(), PTXStr.data()));
180 std::string ArchOpt = ("--gpu-name=" + DeviceArch).str();
// "-c" is passed only when there are binaries to link against.
181 std::string RDCOption = "";
182 if (!GlobalLinkedBinaries.empty())
183 RDCOption = "-c";
184
// Two option sets follow; the condition selecting between them (listing line
// 185) is missing. The first adds "--verbose". In both, RDCOption is always
// the last array element and NumCompileOptions leaves it out when it is
// empty.
186 const char *CompileOptions[] = {ArchOpt.c_str(), "--verbose",
187 RDCOption.c_str()};
188 size_t NumCompileOptions = 2 + (RDCOption.empty() ? 0 : 1);
191 } else {
192 const char *CompileOptions[] = {ArchOpt.c_str(), RDCOption.c_str()};
193 size_t NumCompileOptions = 1 + (RDCOption.empty() ? 0 : 1);
196 }
197
// ObjBuf is allocated uninitialized with BinSize bytes; the lines that set
// BinSize and fill the buffer (198-199, 201-202) are missing here.
200 auto ObjBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BinSize);
203
// With debug output enabled, the compiler log is written to the "proteus"
// logger. The lines that obtain LogSize and fill Log are missing here.
204 if (Config::get().ProteusDebugOutput) {
205 size_t LogSize;
208 auto Log = std::make_unique<char[]>(LogSize);
211 Logger::logs("proteus") << "=== nvPTXCompiler Log\n" << Log.get() << "\n";
212 }
213
215
216 std::unique_ptr<MemoryBuffer> FinalObjBuf;
217 if (!GlobalLinkedBinaries.empty()) {
218 // Create CUDA context if needed. This is required by threaded async
219 // compilation.
222 if (!CUCtx) {
227
230 }
231
232 // TODO: re-implement using the more recent nvJitLink interface.
234 proteusCuErrCheck(cuLinkCreate(0, nullptr, nullptr, &CULinkState));
235 for (auto *Ptr : GlobalLinkedBinaries) {
236 // We do not know the size of the binary but the CUDA API just needs a
237 // non-zero argument.
239 1, "", 0, 0, 0));
240 }
241
242 // Again using a non-zero argument, though we can get the size from the ptx
243 // compiler.
246 static_cast<void *>(ObjBuf->getBufferStart()), 1, "", 0, 0, 0));
247
248 void *BinOut;
// NOTE(review): this BinSize shadows the function-scope BinSize declared at
// listing line 171; it holds the size of the linked output only.
249 size_t BinSize;
// The linked output is copied into a buffer owned by FinalObjBuf.
251 FinalObjBuf = MemoryBuffer::getMemBufferCopy(
252 StringRef{static_cast<char *>(BinOut), BinSize});
253 } else {
// No linking is needed: the compiled object is returned as-is.
254 FinalObjBuf = std::move(ObjBuf);
255 }
256
// Tail of a timing report; its opening line (listing line 257) is missing.
258 << "Codegen CUDA RTC " << T.elapsed() << " ms\n");
259 return FinalObjBuf;
260}
261
262} // namespace proteus
263
264#endif
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:7
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.hpp:67
#define proteusNvPTXCompilerErrCheck(CALL)
Definition UtilsCUDA.h:39
#define proteusCuErrCheck(CALL)
Definition UtilsCUDA.h:28
static Config & get()
Definition Config.hpp:298
bool ProteusDebugOutput
Definition Config.hpp:314
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.hpp:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:19
Definition TimeTracing.hpp:46
Definition Helpers.h:142
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:35
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:40
Definition ObjectCacheChain.cpp:25
void codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl< char > &PTXStr)
Definition CoreLLVMCUDA.hpp:121
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.hpp:87
T getRuntimeConstantValue(void *Arg)
Definition CompilerInterfaceRuntimeConstantInfo.h:114
CodegenOption
Definition Config.hpp:14
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.hpp:165
std::string toString(CodegenOption Option)
Definition Config.hpp:26