Proteus
Programmable JIT compilation and optimization for C/C++ using LLVM
Loading...
Searching...
No Matches
CoreLLVMHIP.hpp
Go to the documentation of this file.
1#ifndef PROTEUS_CORE_LLVM_HIP_HPP
2#define PROTEUS_CORE_LLVM_HIP_HPP
3
4#include <llvm/Bitcode/BitcodeWriter.h>
5#include <llvm/CodeGen/MachineModuleInfo.h>
6#include <llvm/IR/Function.h>
7#include <llvm/IR/LegacyPassManager.h>
8#include <llvm/IR/Module.h>
9#include <llvm/Support/CodeGen.h>
10#include <llvm/Support/FileSystem.h>
11#include <llvm/Support/MemoryBuffer.h>
12#include <llvm/Support/Path.h>
13#include <llvm/Support/TargetSelect.h>
14#include <llvm/Target/TargetMachine.h>
15
16#if LLVM_VERSION_MAJOR == 18
17#include <lld/Common/Driver.h>
18LLD_HAS_DRIVER(elf)
19#endif
20
21#include "proteus/Debug.h"
22#include "proteus/Error.h"
23#include "proteus/Logger.hpp"
24#include "proteus/UtilsHIP.h"
25
26namespace proteus {
27
28using namespace llvm;
29
30namespace detail {
31
32inline const SmallVector<StringRef> &gridDimXFnName() {
33 static SmallVector<StringRef> Names = {
34 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__XcvjEv",
35 "llvm.amdgcn.num.workgroups.x"};
36 return Names;
37}
38
39inline const SmallVector<StringRef> &gridDimYFnName() {
40 static SmallVector<StringRef> Names = {
41 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__YcvjEv",
42 "llvm.amdgcn.num.workgroups.y"};
43 return Names;
44}
45
46inline const SmallVector<StringRef> &gridDimZFnName() {
47 static SmallVector<StringRef> Names = {
48 "_ZNK17__HIP_CoordinatesI13__HIP_GridDimE3__ZcvjEv",
49 "llvm.amdgcn.num.workgroups.z"};
50 return Names;
51}
52
53inline const SmallVector<StringRef> &blockDimXFnName() {
54 static SmallVector<StringRef> Names = {
55 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__XcvjEv",
56 "llvm.amdgcn.workgroup.size.x"};
57 return Names;
58}
59
60inline const SmallVector<StringRef> &blockDimYFnName() {
61 static SmallVector<StringRef> Names = {
62 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__YcvjEv",
63 "llvm.amdgcn.workgroup.size.y"};
64 return Names;
65}
66
67inline const SmallVector<StringRef> &blockDimZFnName() {
68 static SmallVector<StringRef> Names = {
69 "_ZNK17__HIP_CoordinatesI14__HIP_BlockDimE3__ZcvjEv",
70 "llvm.amdgcn.workgroup.size.z"};
71 return Names;
72}
73
74inline const SmallVector<StringRef> &blockIdxXFnName() {
75 static SmallVector<StringRef> Names = {
76 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__XcvjEv",
77 "llvm.amdgcn.workgroup.id.x"};
78 return Names;
79};
80
81inline const SmallVector<StringRef> &blockIdxYFnName() {
82 static SmallVector<StringRef> Names = {
83 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__YcvjEv",
84 "llvm.amdgcn.workgroup.id.y"};
85 return Names;
86};
87
88inline const SmallVector<StringRef> &blockIdxZFnName() {
89 static SmallVector<StringRef> Names = {
90 "_ZNK17__HIP_CoordinatesI14__HIP_BlockIdxE3__ZcvjEv",
91 "llvm.amdgcn.workgroup.id.z"};
92 return Names;
93}
94
95inline const SmallVector<StringRef> &threadIdxXFnName() {
96 static SmallVector<StringRef> Names = {
97 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__XcvjEv",
98 "llvm.amdgcn.workitem.id.x"};
99 return Names;
100};
101
102inline const SmallVector<StringRef> &threadIdxYFnName() {
103 static SmallVector<StringRef> Names = {
104 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__YcvjEv",
105 "llvm.amdgcn.workitem.id.y"};
106 return Names;
107};
108
109inline const SmallVector<StringRef> &threadIdxZFnName() {
110 static SmallVector<StringRef> Names = {
111 "_ZNK17__HIP_CoordinatesI15__HIP_ThreadIdxE3__ZcvjEv",
112 "llvm.amdgcn.workitem.id.z"};
113 return Names;
114};
115
116} // namespace detail
117
118inline void setLaunchBoundsForKernel(Module &M, Function &F, size_t GridSize,
119 int BlockSize) {
120 // TODO: fix calculation of launch bounds.
121 // TODO: find maximum (hardcoded 1024) from device info.
122 // TODO: Setting as 1, BlockSize to replicate launch bounds settings
123 // Does setting it as BlockSize, BlockSize help?
124 // Setting the attribute override any previous setting.
125 F.addFnAttr("amdgpu-flat-work-group-size",
126 "1," + std::to_string(std::min(1024, BlockSize)));
127 // TODO: find warp size (hardcoded 64) from device info.
128 // int WavesPerEU = (GridSize * BlockSize) / 64 / 110 / 4 / 2;
129 [[maybe_unused]] int WavesPerEU = 0;
130 // F->addFnAttr("amdgpu-waves-per-eu", std::to_string(WavesPerEU));
131 PROTEUS_DBG(Logger::logs("proteus")
132 << "BlockSize " << BlockSize << " GridSize " << GridSize
133 << " => Set Wokgroup size " << BlockSize
134 << " WavesPerEU (unused) " << WavesPerEU << "\n");
135}
136
137inline std::unique_ptr<MemoryBuffer>
138codegenObject(Module &M, StringRef DeviceArch,
139 SmallPtrSetImpl<void *> &GlobalLinkedBinaries,
140 bool UseRTC = true) {
141 assert(GlobalLinkedBinaries.empty() &&
142 "Expected empty linked binaries for HIP");
143 if (UseRTC) {
144 char *BinOut;
145 size_t BinSize;
146
147 SmallString<4096> ModuleBuf;
148 raw_svector_ostream ModuleBufOS(ModuleBuf);
149 WriteBitcodeToFile(M, ModuleBufOS);
150
151 hiprtcLinkState HipLinkStatePtr;
152
153 // NOTE: This code is an example of passing custom, AMD-specific
154 // options to the compiler/linker.
155 // NOTE: Unrolling can have a dramatic (time-consuming) effect on JIT
156 // compilation time and on the resulting optimization, better or worse
157 // depending on code specifics.
158 std::string MArchOpt = ("-march=" + DeviceArch).str();
159 const char *OptArgs[] = {"-mllvm", "-unroll-threshold=1000",
160 MArchOpt.c_str()};
161 std::vector<hiprtcJIT_option> JITOptions = {
162 HIPRTC_JIT_IR_TO_ISA_OPT_EXT, HIPRTC_JIT_IR_TO_ISA_OPT_COUNT_EXT};
163 size_t OptArgsSize = 3;
164 const void *JITOptionsValues[] = {(void *)OptArgs, (void *)(OptArgsSize)};
165 proteusHiprtcErrCheck(hiprtcLinkCreate(JITOptions.size(), JITOptions.data(),
166 (void **)JITOptionsValues,
167 &HipLinkStatePtr));
168 // NOTE: the following version of te code does not set options.
169 // proteusHiprtcErrCheck(hiprtcLinkCreate(0, nullptr, nullptr,
170 // &hip_link_state_ptr));
171
172 proteusHiprtcErrCheck(hiprtcLinkAddData(
173 HipLinkStatePtr, HIPRTC_JIT_INPUT_LLVM_BITCODE,
174 (void *)ModuleBuf.data(), ModuleBuf.size(), "", 0, nullptr, nullptr));
176 hiprtcLinkComplete(HipLinkStatePtr, (void **)&BinOut, &BinSize));
177
178 return MemoryBuffer::getMemBuffer(StringRef{BinOut, BinSize});
179 }
180
181#if LLVM_VERSION_MAJOR == 18
182 auto TMExpected = proteus::detail::createTargetMachine(M, DeviceArch);
183 if (!TMExpected)
184 PROTEUS_FATAL_ERROR(toString(TMExpected.takeError()));
185
186 std::unique_ptr<TargetMachine> TM = std::move(*TMExpected);
187 TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
188
189 legacy::PassManager PM;
190 PM.add(new TargetLibraryInfoWrapperPass(TLII));
191 MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(
192 reinterpret_cast<LLVMTargetMachine *>(TM.get()));
193
194 SmallVector<char, 4096> ObjectCode;
195 raw_svector_ostream OS(ObjectCode);
196 TM->addPassesToEmitFile(PM, OS, nullptr, CodeGenFileType::ObjectFile,
197 /* DisableVerify */ true, MMIWP);
198
199 PM.run(M);
200
201 SmallString<64> TempDir;
202 SmallString<64> ObjectPath;
203 SmallString<64> SharedObjectPath;
204 // The LLD linker interfaces are not thread-safe, so we use a mutex.
205 static std::mutex Mutex;
206 {
207 sys::path::system_temp_directory(true, TempDir);
208 int ObjectFD;
209 if (auto EC = sys::fs::createUniqueFile(TempDir + "/proteus-jit-%%%%%%%.o",
210 ObjectFD, ObjectPath))
211 PROTEUS_FATAL_ERROR(EC.message());
212
213 raw_fd_ostream OS(ObjectFD, true);
214 OS << StringRef{ObjectCode.data(), ObjectCode.size()};
215 OS.close();
216
217 if (auto EC = sys::fs::createUniqueFile(TempDir + "/proteus-jit-%%%%%%%.so",
218 SharedObjectPath))
219 PROTEUS_FATAL_ERROR(EC.message());
220
221 std::vector<const char *> Args{"ld.lld", "--no-undefined",
222 "-shared", ObjectPath.c_str(),
223 "-o", SharedObjectPath.c_str()};
224
225 {
226 std::lock_guard LockGuard{Mutex};
227 lld::Result S = lld::lldMain(Args, llvm::outs(), llvm::errs(),
228 {{lld::Gnu, &lld::elf::link}});
229 if (S.retCode)
230 PROTEUS_FATAL_ERROR("Error: lld failed");
231 }
232 }
233
234 ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
235 MemoryBuffer::getFileAsStream(SharedObjectPath);
236 if (!Buffer)
237 PROTEUS_FATAL_ERROR("Error reading file: " + Buffer.getError().message());
238
239 sys::fs::remove(ObjectPath);
240 sys::fs::remove(SharedObjectPath);
241
242 return std::move(*Buffer);
243#else
244 PROTEUS_FATAL_ERROR("Expected LLVM18 for non-RTC codegen");
245#endif
246}
247
248} // namespace proteus
249
250#endif
char int void ** Args
Definition CompilerInterfaceHost.cpp:20
#define PROTEUS_DBG(x)
Definition Debug.h:7
#define PROTEUS_FATAL_ERROR(x)
Definition Error.h:4
#define proteusHiprtcErrCheck(CALL)
Definition UtilsHIP.h:28
static llvm::raw_ostream & logs(const std::string &Name)
Definition Logger.hpp:18
const SmallVector< StringRef > & threadIdxXFnName()
Definition CoreLLVMCUDA.hpp:68
const SmallVector< StringRef > & gridDimYFnName()
Definition CoreLLVMCUDA.hpp:28
const SmallVector< StringRef > & threadIdxZFnName()
Definition CoreLLVMCUDA.hpp:78
const SmallVector< StringRef > & blockIdxZFnName()
Definition CoreLLVMCUDA.hpp:63
const SmallVector< StringRef > & gridDimZFnName()
Definition CoreLLVMCUDA.hpp:33
const SmallVector< StringRef > & gridDimXFnName()
Definition CoreLLVMCUDA.hpp:23
const SmallVector< StringRef > & blockIdxXFnName()
Definition CoreLLVMCUDA.hpp:53
Expected< std::unique_ptr< TargetMachine > > createTargetMachine(Module &M, StringRef Arch, unsigned OptLevel=3)
Definition CoreLLVM.hpp:48
const SmallVector< StringRef > & threadIdxYFnName()
Definition CoreLLVMCUDA.hpp:73
const SmallVector< StringRef > & blockIdxYFnName()
Definition CoreLLVMCUDA.hpp:58
const SmallVector< StringRef > & blockDimYFnName()
Definition CoreLLVMCUDA.hpp:43
const SmallVector< StringRef > & blockDimZFnName()
Definition CoreLLVMCUDA.hpp:48
const SmallVector< StringRef > & blockDimXFnName()
Definition CoreLLVMCUDA.hpp:38
Definition JitEngine.cpp:20
void setLaunchBoundsForKernel(Module &M, Function &F, size_t GridSize, int BlockSize)
Definition CoreLLVMCUDA.hpp:85
std::unique_ptr< MemoryBuffer > codegenObject(Module &M, StringRef DeviceArch, SmallPtrSetImpl< void * > &GlobalLinkedBinaries, bool UseRTC=true)
Definition CoreLLVMCUDA.hpp:148