1#ifndef PROTEUS_CORE_LLVM_CUDA_H
2#define PROTEUS_CORE_LLVM_CUDA_H
10#include <llvm/ADT/SmallVector.h>
11#include <llvm/ADT/StringRef.h>
12#include <llvm/CodeGen/MachineModuleInfo.h>
13#include <llvm/IR/LegacyPassManager.h>
14#include <llvm/IR/Module.h>
15#include <llvm/Support/MemoryBufferRef.h>
16#include <llvm/Support/TargetSelect.h>
17#include <llvm/Target/TargetMachine.h>
// Fragments of gridDimXFnName(), gridDimYFnName() and gridDimZFnName()
// (listed in this file's cross-reference index at CoreLLVMCUDA.h:25/30/35).
// Each holds a static one-element list with the name of the NVVM
// special-register intrinsic for that dimension (nctaid.x/.y/.z).
// NOTE(review): the function signatures and return statements are missing
// from this listing; per the index they return
// `const SmallVector<StringRef> &`.
26 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.nctaid.x"};
31 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.nctaid.y"};
36 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.nctaid.z"};
// Fragments of blockDimXFnName(), blockDimYFnName() and blockDimZFnName()
// (cross-reference index: CoreLLVMCUDA.h:40/45/50). Each holds a static
// one-element list with the NVVM intrinsic name for that dimension
// (ntid.x/.y/.z).
// NOTE(review): signatures and return statements are not visible here.
41 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ntid.x"};
46 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ntid.y"};
51 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ntid.z"};
// Fragments of blockIdxXFnName(), blockIdxYFnName() and blockIdxZFnName()
// (cross-reference index: CoreLLVMCUDA.h:55/60/65). Each holds a static
// one-element list with the NVVM intrinsic name for that dimension
// (ctaid.x/.y/.z).
// NOTE(review): signatures and return statements are not visible here.
56 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ctaid.x"};
61 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ctaid.y"};
66 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.ctaid.z"};
// Fragments of threadIdxXFnName(), threadIdxYFnName() and threadIdxZFnName()
// (cross-reference index: CoreLLVMCUDA.h:70/75/80). Each holds a static
// one-element list with the NVVM intrinsic name for that dimension
// (tid.x/.y/.z).
// NOTE(review): signatures and return statements are not visible here.
71 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.tid.x"};
76 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.tid.y"};
81 static SmallVector<StringRef> Names = {
"llvm.nvvm.read.ptx.sreg.tid.z"};
// Fragment of setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM,
// int MinBlocksPerSM = 0) (cross-reference index: CoreLLVMCUDA.h:87).
// Records launch-bound values for kernel F in the module's
// "nvvm.annotations" named metadata.
// NOTE(review): this listing is missing several lines (the start of the
// signature, closing braces, and whatever follows the tests at listing
// lines 100 and 106), so the control flow below is only partly visible.
88 int MinBlocksPerSM = 0) {
89 auto *M = F.getParent();
// The annotations node is looked up, not created: it must already exist in
// the module, which the assert below checks.
90 NamedMDNode *NvvmAnnotations = M->getNamedMetadata(
"nvvm.annotations");
91 assert(NvvmAnnotations &&
"Expected non-null nvvm.annotations metadata");
92 auto *FuncMetadata = ConstantAsMetadata::get(&F);
// SetMDNode(MDName, MDValue): builds the metadata string MDName and an i32
// constant MDValue, then looks for an existing annotation for this function.
94 auto SetMDNode = [&](
const char *MDName,
int MDValue) {
95 auto *MDNodeName = MDString::get(M->getContext(), MDName);
96 auto *MDNodeValue = ConstantAsMetadata::get(
97 ConstantInt::get(Type::getInt32Ty(M->getContext()), MDValue));
// Scan the existing annotation nodes. Only nodes with exactly three operands
// are inspected (presumably others are skipped with `continue` -- that
// statement is not visible here).
99 for (
auto *MetadataNode : NvvmAnnotations->operands()) {
100 if (MetadataNode->getNumOperands() != 3)
103 auto *PtrMetadata = MetadataNode->getOperand(0).get();
104 auto *DescMetadata = MetadataNode->getOperand(1).get();
// A node whose first operand is this function and whose second operand is the
// same name gets its value (operand 2) replaced.
105 if (PtrMetadata == FuncMetadata && MDNodeName == DescMetadata) {
106 MetadataNode->replaceOperandWith(2, MDNodeValue);
// Append a new {function, name, value} annotation node.
// NOTE(review): presumably reached only when no existing node matched; the
// early exit after the replacement above is not visible in this listing.
110 Metadata *MDVals[] = {FuncMetadata, MDNodeName, MDNodeValue};
111 NvvmAnnotations->addOperand(MDNode::get(M->getContext(), MDVals));
// "maxntid" is capped at 1024.
116 SetMDNode(
"maxntid", std::min(1024, MaxThreadsPerSM));
// "minctasm" is only written when a non-zero MinBlocksPerSM is given.
117 if (MinBlocksPerSM != 0)
118 SetMDNode(
"minctasm", MinBlocksPerSM);
// Fragment of codegenPTX(Module &M, StringRef DeviceArch,
// SmallVectorImpl<char> &PTXStr) (cross-reference index: CoreLLVMCUDA.h:121).
// Sets up a legacy pass manager that emits an assembly file for M into PTXStr.
// NOTE(review): listing lines 123-134 and the #else/#endif lines are missing;
// `TMExpected` and the timer `T` are declared in lines not visible here.
122 SmallVectorImpl<char> &PTXStr) {
// Take ownership of the target machine held by TMExpected (presumably the
// result of createTargetMachine(); TODO confirm).
135 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
136 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
// The module's data layout is set from the target machine.
137 M.setDataLayout(TM->createDataLayout());
139 legacy::PassManager PM;
140 PM.add(
new TargetLibraryInfoWrapperPass(TLII));
// The MachineModuleInfoWrapperPass constructor argument depends on the LLVM
// version: for LLVM >= 20 the TargetMachine pointer is passed directly; the
// other variant (presumably the #else branch) casts it to
// LLVMTargetMachine *.
141 MachineModuleInfoWrapperPass *MMIWP =
142#if LLVM_VERSION_MAJOR >= 20
143 new MachineModuleInfoWrapperPass(TM.get());
145 new MachineModuleInfoWrapperPass(
146 reinterpret_cast<LLVMTargetMachine *
>(TM.get()));
// The emitted assembly is streamed into the caller's PTXStr buffer.
149 raw_svector_ostream PTXOS(PTXStr);
// The assembly-file enumerator is spelled CodeGenFileType::AssemblyFile for
// LLVM >= 18 and CGFT_AssemblyFile in the other variant; the remaining call
// arguments are not visible in this listing.
150#if LLVM_VERSION_MAJOR >= 18
151 TM->addPassesToEmitFile(PM, PTXOS,
nullptr, CodeGenFileType::AssemblyFile,
154 TM->addPassesToEmitFile(PM, PTXOS,
nullptr, CGFT_AssemblyFile,
// Tail of a timing log statement reporting T.elapsed() (presumably wrapped in
// PROTEUS_TIMER_OUTPUT; the opening of the statement is not visible).
161 <<
"Codegen ptx " << T.
elapsed() <<
" ms\n");
// Fragment of codegenObject(Module &M, StringRef DeviceArch,
// SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
// CodegenOption CGOption = CodegenOption::RTC)
// (cross-reference index: CoreLLVMCUDA.h:165).
// Visible steps: compile a PTX buffer with the nvPTXCompiler API, copy out the
// compiled program, and, when GlobalLinkedBinaries is non-empty, produce the
// final buffer from a CUDA link step.
// NOTE(review): many listing lines are missing (most wrapper-macro openings,
// the call that fills PTXStr, the link setup/complete calls, else branches),
// so the comments below describe only what is visible.
164inline std::unique_ptr<MemoryBuffer>
166 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
// Buffer for the PTX text, with 4096 bytes of inline storage.
170 SmallVector<char, 4096> PTXStr;
// Append a NUL byte to the PTX buffer (presumably because the PTX compiler
// expects a NUL-terminated string; TODO confirm).
174 PTXStr.push_back(
'\0');
177 nvPTXCompilerHandle PTXCompiler;
179 nvPTXCompilerCreate(&PTXCompiler, PTXStr.size(), PTXStr.data()));
// Compile options: "--gpu-name=<DeviceArch>" is always passed. RDCOption
// starts empty and is presumably assigned under the non-empty check below
// (the assignment itself is not visible).
180 std::string ArchOpt = (
"--gpu-name=" + DeviceArch).str();
181 std::string RDCOption =
"";
182 if (!GlobalLinkedBinaries.empty())
// Two option-array variants follow: one that includes "--verbose" (two base
// options) and one without it (one base option). In both, RDCOption is
// counted only when it is non-empty. The condition that selects between them
// is not visible in this listing.
186 const char *CompileOptions[] = {ArchOpt.c_str(),
"--verbose",
188 size_t NumCompileOptions = 2 + (RDCOption.empty() ? 0 : 1);
190 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
192 const char *CompileOptions[] = {ArchOpt.c_str(), RDCOption.c_str()};
193 size_t NumCompileOptions = 1 + (RDCOption.empty() ? 0 : 1);
195 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
// Query the compiled program size, allocate an uninitialized buffer of that
// size, and copy the compiled program into it.
199 nvPTXCompilerGetCompiledProgramSize(PTXCompiler, &BinSize));
200 auto ObjBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BinSize);
202 nvPTXCompilerGetCompiledProgram(PTXCompiler, ObjBuf->getBufferStart()));
// Fetch the compiler's info log and write it to the "proteus" log stream.
// NOTE(review): the log is streamed as a C string from a buffer of exactly
// LogSize bytes -- confirm against the nvPTXCompiler API that LogSize leaves
// room for the terminating NUL.
207 nvPTXCompilerGetInfoLogSize(PTXCompiler, &LogSize));
208 auto Log = std::make_unique<char[]>(LogSize);
210 nvPTXCompilerGetInfoLog(PTXCompiler, Log.get()));
211 Logger::logs(
"proteus") <<
"=== nvPTXCompiler Log\n" << Log.get() <<
"\n";
// Final object selection: with binaries to link, a CUlinkState is used and the
// result is copied into FinalObjBuf; the plain move of ObjBuf further down is
// presumably the else branch (the `else` itself is not visible).
216 std::unique_ptr<MemoryBuffer> FinalObjBuf;
217 if (!GlobalLinkedBinaries.empty()) {
231 CUlinkState CULinkState;
// Loop over each linked binary pointer; the loop body is not visible in this
// listing.
233 for (
auto *Ptr : GlobalLinkedBinaries) {
// Adds the freshly compiled object (ObjBuf) to the link as
// CU_JIT_INPUT_FATBINARY.
// NOTE(review): the size argument is the literal 1 rather than BinSize --
// confirm this is intentional.
243 CULinkState, CU_JIT_INPUT_FATBINARY,
244 static_cast<void *
>(ObjBuf->getBufferStart()), 1,
"", 0, 0, 0));
// Copy the link output (BinOut, BinSize bytes) into a new owned buffer.
// NOTE(review): BinOut is set in lines not visible here; presumably it is
// owned by the link state, hence the copy.
249 FinalObjBuf = MemoryBuffer::getMemBufferCopy(
250 StringRef{
static_cast<char *
>(BinOut), BinSize});
252 FinalObjBuf = std::move(ObjBuf);
// Tail of a timing log statement reporting T.elapsed(); the opening of the
// statement is not visible.
256 <<
"Codegen CUDA RTC " << T.
elapsed() <<
" ms\n");
#define PROTEUS_TIMER_OUTPUT(x)
Definition TimeTracing.h:54
#define proteusNvPTXCompilerErrCheck(CALL)
Definition UtilsCUDA.h:39
#define proteusCuErrCheck(CALL)
Definition UtilsCUDA.h:28
static Config & get()
Definition Config.h:334
bool ProteusDebugOutput
Definition Config.h:350
static llvm::raw_ostream & outs(const std::string &Name)
Definition Logger.h:25
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.h:19
Definition TimeTracing.h:40
uint64_t elapsed()
Definition TimeTracing.cpp:51
Definition CompiledLibrary.h:7
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.h:70
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.h:30
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.h:80
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.h:65
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.h:35
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.h:25
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.h:55
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.h:52
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.h:75
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.h:60
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.h:45
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.h:50
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.h:40
Definition MemoryCache.h:26
void codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl< char > &PTXStr)
Definition CoreLLVMCUDA.h:121
void setLaunchBoundsForKernel(Function &F, int MaxThreadsPerSM, int MinBlocksPerSM=0)
Definition CoreLLVMCUDA.h:87
void reportFatalError(const llvm::Twine &Reason, const char *FILE, unsigned Line)
Definition Error.cpp:14
CodegenOption
Definition Config.h:16
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, CodegenOption CGOption=CodegenOption::RTC)
Definition CoreLLVMCUDA.h:165
std::string toString(CodegenOption Option)
Definition Config.h:28