Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMCUDA.hpp
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_CUDA_HPP
2#define PROTEUS_CORE_LLVM_CUDA_HPP
3
4#include <llvm/ADT/SmallVector.h>
5#include <llvm/ADT/StringRef.h>
6#include <llvm/CodeGen/MachineModuleInfo.h>
7#include <llvm/IR/LegacyPassManager.h>
8#include <llvm/IR/Module.h>
9#include <llvm/Support/MemoryBufferRef.h>
10#include <llvm/Support/TargetSelect.h>
11#include <llvm/Target/TargetMachine.h>
12
14#include "proteus/Debug.h"
15#include "proteus/Logger.hpp"
17#include "proteus/UtilsCUDA.h"
18
19namespace proteus {
20
21using namespace llvm;
22
23namespace detail {
24
25inline const SmallVector<StringRef> &gridDimXFnName() {
26 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.x"};
27 return Names;
28}
29
30inline const SmallVector<StringRef> &gridDimYFnName() {
31 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.y"};
32 return Names;
33}
34
35inline const SmallVector<StringRef> &gridDimZFnName() {
36 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.nctaid.z"};
37 return Names;
38}
39
40inline const SmallVector<StringRef> &blockDimXFnName() {
41 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.x"};
42 return Names;
43}
44
45inline const SmallVector<StringRef> &blockDimYFnName() {
46 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.y"};
47 return Names;
48}
49
50inline const SmallVector<StringRef> &blockDimZFnName() {
51 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ntid.z"};
52 return Names;
53}
54
55inline const SmallVector<StringRef> &blockIdxXFnName() {
56 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.x"};
57 return Names;
58}
59
60inline const SmallVector<StringRef> &blockIdxYFnName() {
61 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.y"};
62 return Names;
63}
64
65inline const SmallVector<StringRef> &blockIdxZFnName() {
66 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.ctaid.z"};
67 return Names;
68}
69
70inline const SmallVector<StringRef> &threadIdxXFnName() {
71 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.x"};
72 return Names;
73}
74
75inline const SmallVector<StringRef> &threadIdxYFnName() {
76 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.y"};
77 return Names;
78}
79
80inline const SmallVector<StringRef> &threadIdxZFnName() {
81 static SmallVector<StringRef> Names = {"llvm.nvvm.read.ptx.sreg.tid.z"};
82 return Names;
83}
84
85} // namespace detail
86
87inline void setLaunchBoundsForKernel(Module &M, Function &F,
88 size_t /*GridSize*/, int BlockSize) {
89 NamedMDNode *NvvmAnnotations = M.getNamedMetadata("nvvm.annotations");
90 assert(NvvmAnnotations && "Expected non-null nvvm.annotations metadata");
91 // TODO: fix hardcoded 1024 as the maximum, by reading device
92 // properties.
93 // TODO: set min GridSize.
94 int MaxThreads = std::min(1024, BlockSize);
95 auto *FuncMetadata = ConstantAsMetadata::get(&F);
96 auto *MaxntidxMetadata = MDString::get(M.getContext(), "maxntidx");
97 auto *MaxThreadsMetadata = ConstantAsMetadata::get(
98 ConstantInt::get(Type::getInt32Ty(M.getContext()), MaxThreads));
99
100 // Replace if the metadata exists.
101 for (auto *MetadataNode : NvvmAnnotations->operands()) {
102 // Expecting 3 operands ptr, desc, i32 value.
103 assert(MetadataNode->getNumOperands() == 3);
104
105 auto *PtrMetadata = MetadataNode->getOperand(0).get();
106 auto *DescMetadata = MetadataNode->getOperand(1).get();
107 if (PtrMetadata == FuncMetadata && MaxntidxMetadata == DescMetadata) {
108 MetadataNode->replaceOperandWith(2, MaxThreadsMetadata);
109 return;
110 }
111 }
112
113 // Otherwise create the metadata and insert.
114 Metadata *MDVals[] = {FuncMetadata, MaxntidxMetadata, MaxThreadsMetadata};
115 NvvmAnnotations->addOperand(MDNode::get(M.getContext(), MDVals));
116}
117inline void codegenPTX(Module &M, StringRef DeviceArch,
118 SmallVectorImpl<char> &PTXStr) {
119 // TODO: It is possbile to use PTX directly through the CUDA PTX JIT
120 // interface. Maybe useful if we can re-link globals using the CUDA API.
121 // Check this reference for PTX JIT caching:
122 // https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/
123 // Interesting env vars: CUDA_CACHE_DISABLE, CUDA_CACHE_MAXSIZE,
124 // CUDA_CACHE_PATH, CUDA_FORCE_PTX_JIT.
125
126 Timer T;
127 auto TMExpected = proteus::detail::createTargetMachine(M, DeviceArch);
128 if (!TMExpected)
129 PROTEUS_FATAL_ERROR(toString(TMExpected.takeError()));
130
131 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
132 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
133
134 legacy::PassManager PM;
135 PM.add(new TargetLibraryInfoWrapperPass(TLII));
136 MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(
137 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
138
139 raw_svector_ostream PTXOS(PTXStr);
140#if LLVM_VERSION_MAJOR >= 18
141 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CodeGenFileType::AssemblyFile,
142 /* DisableVerify */ false, MMIWP);
143#else
144 TM->addPassesToEmitFile(PM, PTXOS, nullptr, CGFT_AssemblyFile,
145 /* DisableVerify */ false, MMIWP);
146#endif
147
148 PM.run(M);
149
151 << "Codegen ptx " << T.elapsed() << " ms\n");
152}
153
154inline std::unique_ptr<MemoryBuffer>
155codegenObject(Module &M, StringRef DeviceArch,
156 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
158 if (CGOption != CodegenOption::RTC)
159 PROTEUS_FATAL_ERROR("Only RTC compilation is supported for CUDA");
160 SmallVector<char, 4096> PTXStr;
161 size_t BinSize;
162
163 codegenPTX(M, DeviceArch, PTXStr);
164 PTXStr.push_back('\0');
165
166 Timer T;
167 nvPTXCompilerHandle PTXCompiler;
169 nvPTXCompilerCreate(&PTXCompiler, PTXStr.size(), PTXStr.data()));
170 std::string ArchOpt = ("--gpu-name=" + DeviceArch).str();
171 std::string RDCOption = "";
172 if (!GlobalLinkedBinaries.empty())
173 RDCOption = "-c";
174#if PROTEUS_ENABLE_DEBUG
175 const char *CompileOptions[] = {ArchOpt.c_str(), "--verbose",
176 RDCOption.c_str()};
177 size_t NumCompileOptions = 2 + (RDCOption.empty() ? 0 : 1);
178#else
179 const char *CompileOptions[] = {ArchOpt.c_str(), RDCOption.c_str()};
180 size_t NumCompileOptions = 1 + (RDCOption.empty() ? 0 : 1);
181#endif
183 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
185 nvPTXCompilerGetCompiledProgramSize(PTXCompiler, &BinSize));
186 auto ObjBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BinSize);
188 nvPTXCompilerGetCompiledProgram(PTXCompiler, ObjBuf->getBufferStart()));
189#if PROTEUS_ENABLE_DEBUG
190 {
191 size_t LogSize;
193 nvPTXCompilerGetInfoLogSize(PTXCompiler, &LogSize));
194 auto Log = std::make_unique<char[]>(LogSize);
196 nvPTXCompilerGetInfoLog(PTXCompiler, Log.get()));
197 Logger::logs("proteus") << "=== nvPTXCompiler Log\n" << Log.get() << "\n";
198 }
199#endif
200 proteusNvPTXCompilerErrCheck(nvPTXCompilerDestroy(&PTXCompiler));
201
202 std::unique_ptr<MemoryBuffer> FinalObjBuf;
203 if (!GlobalLinkedBinaries.empty()) {
204 // Create CUDA context if needed. This is required by threaded async
205 // compilation.
206 CUcontext CUCtx;
207 proteusCuErrCheck(cuCtxGetCurrent(&CUCtx));
208 if (!CUCtx) {
209 CUdevice CUDev;
210 CUresult CURes = cuCtxGetDevice(&CUDev);
211 if (CURes == CUDA_ERROR_INVALID_CONTEXT or !CUDev)
212 proteusCuErrCheck(cuDeviceGet(&CUDev, 0));
213
214 proteusCuErrCheck(cuCtxGetCurrent(&CUCtx));
215 proteusCuErrCheck(cuCtxCreate(&CUCtx, 0, CUDev));
216 }
217
218 // TODO: re-implement using the more recent nvJitLink interface.
219 CUlinkState CULinkState;
220 proteusCuErrCheck(cuLinkCreate(0, nullptr, nullptr, &CULinkState));
221 for (auto *Ptr : GlobalLinkedBinaries) {
222 // We do not know the size of the binary but the CUDA API just needs a
223 // non-zero argument.
224 proteusCuErrCheck(cuLinkAddData(CULinkState, CU_JIT_INPUT_FATBINARY, Ptr,
225 1, "", 0, 0, 0));
226 }
227
228 // Again using a non-zero argument, though we can get the size from the ptx
229 // compiler.
230 proteusCuErrCheck(cuLinkAddData(
231 CULinkState, CU_JIT_INPUT_FATBINARY,
232 static_cast<void *>(ObjBuf->getBufferStart()), 1, "", 0, 0, 0));
233
234 void *BinOut;
235 size_t BinSize;
236 proteusCuErrCheck(cuLinkComplete(CULinkState, &BinOut, &BinSize));
237 FinalObjBuf = MemoryBuffer::getMemBufferCopy(
238 StringRef{static_cast<char *>(BinOut), BinSize});
239 } else {
240 FinalObjBuf = std::move(ObjBuf);
241 }
242
244 << "Codegen CUDA RTC " << T.elapsed() << " ms\n");
245 return FinalObjBuf;
246}
247
248} // namespace proteus
249
250#endif
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:4
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.hpp:57
#define proteusNvPTXCompilerErrCheck(CALL)
Definition UtilsCUDA.h:39
#define proteusCuErrCheck(CALL)
Definition UtilsCUDA.h:28
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.hpp:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:19
Definition TimeTracing.hpp:36
uint64_t elapsed()
Definition TimeTracing.hpp:45
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:35
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:40
Definition Dispatcher.cpp:14
void codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl< char > &PTXStr)
Definition CoreLLVMCUDA.hpp:117
CodegenOption
Definition Config.hpp:10
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.hpp:155
void setLaunchBoundsForKernel(Module &M, Function &F, size_t, int BlockSize)
Definition CoreLLVMCUDA.hpp:87
std::string toString(CodegenOption Option)
Definition Config.hpp:23