87 NamedMDNode *NvvmAnnotations = M.getNamedMetadata(
"nvvm.annotations");
88 assert(NvvmAnnotations &&
"Expected non-null nvvm.annotations metadata");
92 int MaxThreads = std::min(1024, BlockSize);
93 auto *FuncMetadata = ConstantAsMetadata::get(&F);
94 auto *MaxntidxMetadata = MDString::get(M.getContext(),
"maxntidx");
95 auto *MaxThreadsMetadata = ConstantAsMetadata::get(
96 ConstantInt::get(Type::getInt32Ty(M.getContext()), MaxThreads));
99 for (
auto *MetadataNode : NvvmAnnotations->operands()) {
101 assert(MetadataNode->getNumOperands() == 3);
103 auto *PtrMetadata = MetadataNode->getOperand(0).get();
104 auto *DescMetadata = MetadataNode->getOperand(1).get();
105 if (PtrMetadata == FuncMetadata && MaxntidxMetadata == DescMetadata) {
106 MetadataNode->replaceOperandWith(2, MaxThreadsMetadata);
112 Metadata *MDVals[] = {FuncMetadata, MaxntidxMetadata, MaxThreadsMetadata};
113 NvvmAnnotations->addOperand(MDNode::get(M.getContext(), MDVals));
116 SmallVectorImpl<char> &PTXStr) {
127 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
128 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
130 legacy::PassManager PM;
131 PM.add(
new TargetLibraryInfoWrapperPass(TLII));
132 MachineModuleInfoWrapperPass *MMIWP =
new MachineModuleInfoWrapperPass(
133 reinterpret_cast<LLVMTargetMachine *
>(TM.get()));
135 raw_svector_ostream PTXOS(PTXStr);
136#if LLVM_VERSION_MAJOR >= 18
137 TM->addPassesToEmitFile(PM, PTXOS,
nullptr, CodeGenFileType::AssemblyFile,
140 TM->addPassesToEmitFile(PM, PTXOS,
nullptr, CGFT_AssemblyFile,
149 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
150 [[maybe_unused]]
bool UseRTC =
true) {
151 assert(UseRTC &&
"Expected RTC compilation true for CUDA");
152 SmallVector<char, 4096> PTXStr;
156 PTXStr.push_back(
'\0');
158 nvPTXCompilerHandle PTXCompiler;
160 nvPTXCompilerCreate(&PTXCompiler, PTXStr.size(), PTXStr.data()));
161 std::string ArchOpt = (
"--gpu-name=" + DeviceArch).str();
162 std::string RDCOption =
"";
163 if (!GlobalLinkedBinaries.empty())
165#if PROTEUS_ENABLE_DEBUG
166 const char *CompileOptions[] = {ArchOpt.c_str(),
"--verbose",
168 size_t NumCompileOptions = 2 + (RDCOption.empty() ? 0 : 1);
170 const char *CompileOptions[] = {ArchOpt.c_str(), RDCOption.c_str()};
171 size_t NumCompileOptions = 1 + (RDCOption.empty() ? 0 : 1);
174 nvPTXCompilerCompile(PTXCompiler, NumCompileOptions, CompileOptions));
176 nvPTXCompilerGetCompiledProgramSize(PTXCompiler, &BinSize));
177 auto ObjBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BinSize);
179 nvPTXCompilerGetCompiledProgram(PTXCompiler, ObjBuf->getBufferStart()));
180#if PROTEUS_ENABLE_DEBUG
184 nvPTXCompilerGetInfoLogSize(PTXCompiler, &LogSize));
185 auto Log = std::make_unique<char[]>(LogSize);
187 nvPTXCompilerGetInfoLog(PTXCompiler, Log.get()));
188 Logger::logs(
"proteus") <<
"=== nvPTXCompiler Log\n" << Log.get() <<
"\n";
193 std::unique_ptr<MemoryBuffer> FinalObjBuf;
194 if (!GlobalLinkedBinaries.empty()) {
201 CUresult CURes = cuCtxGetDevice(&CUDev);
202 if (CURes == CUDA_ERROR_INVALID_CONTEXT or !CUDev)
210 CUlinkState CULinkState;
212 for (
auto *Ptr : GlobalLinkedBinaries) {
222 CULinkState, CU_JIT_INPUT_FATBINARY,
223 static_cast<void *
>(ObjBuf->getBufferStart()), 1,
"", 0, 0, 0));
228 FinalObjBuf = MemoryBuffer::getMemBufferCopy(
229 StringRef{
static_cast<char *
>(BinOut), BinSize});
231 FinalObjBuf = std::move(ObjBuf);
void setLaunchBoundsForKernel(Module &M, Function &F, size_t GridSize, int BlockSize)
Definition CoreLLVMCUDA.hpp:85
void codegenPTX(Module &M, StringRef DeviceArch, SmallVectorImpl< char > &PTXStr)
Definition CoreLLVMCUDA.hpp:115
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, bool UseRTC=true)
Definition CoreLLVMCUDA.hpp:148