rustc_codegen_llvm/builder/gpu_offload.rs
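//! Builds the host-side code needed to launch GPU kernels through the LLVM/OpenMP
//! offloading runtime: declarations of the `__tgt_*` entry points, per-kernel offload
//! entry globals, and the data-mapping calls that surround each kernel launch.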

use std::ffi::CString;

use llvm::Linkage::*;
use rustc_abi::Align;
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
use rustc_middle::ty::offload_meta::OffloadMetadata;

use crate::builder::SBuilder;
use crate::llvm::AttributePlace::Function;
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
use crate::{SimpleCx, attributes};

// ; Function Attrs: nounwind
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
fn generate_launcher<'ll>(cx: &'ll SimpleCx<'_>) -> (&'ll llvm::Value, &'ll llvm::Type) {
    let tptr = cx.type_ptr();
    let ti64 = cx.type_i64();
    let ti32 = cx.type_i32();
    let args = vec![tptr, ti64, ti32, ti32, tptr, tptr];
    let tgt_fn_ty = cx.type_func(&args, ti32);
    let name = "__tgt_target_kernel";
    let tgt_decl = declare_offload_fn(&cx, name, tgt_fn_ty);
    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
    attributes::apply_to_llfn(tgt_decl, Function, &[nounwind]);
    (tgt_decl, tgt_fn_ty)
}

// What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper:
// @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
// @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
// FIXME(offload): @0 should include the file name (e.g. lib.rs) in which the function to be
// offloaded was defined.
fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
    let unknown_txt = ";unknown;unknown;0;0;;";
    let c_entry_name = CString::new(unknown_txt).unwrap();
    let c_val = c_entry_name.as_bytes_with_nul();
    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
    let at_zero = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
    llvm::set_alignment(at_zero, Align::ONE);

    // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
    let struct_ident_ty = cx.type_named_struct("struct.ident_t");
    let struct_elems = vec![
        cx.get_const_i32(0),
        cx.get_const_i32(2),
        cx.get_const_i32(0),
        cx.get_const_i32(22),
        at_zero,
    ];
    let struct_elems_ty: Vec<_> = struct_elems.iter().map(|&x| cx.val_ty(x)).collect();
    let initializer = crate::common::named_struct(struct_ident_ty, &struct_elems);
    cx.set_struct_body(struct_ident_ty, &struct_elems_ty, false);
    let at_one = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
    llvm::set_alignment(at_one, Align::EIGHT);
    at_one
}

pub(crate) struct TgtOffloadEntry {
    //   uint64_t Reserved;
    //   uint16_t Version;
    //   uint16_t Kind;
    //   uint32_t Flags; Flags associated with the entry (see Target Region Entry Flags)
    //   void *Address; Address of global symbol within device image (function or global)
    //   char *SymbolName;
    //   uint64_t Size; Size of the entry info (0 if it is a function)
    //   uint64_t Data;
    //   void *AuxAddr;
}

impl TgtOffloadEntry {
    pub(crate) fn new_decl<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type {
        let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry");
        let tptr = cx.type_ptr();
        let ti64 = cx.type_i64();
        let ti32 = cx.type_i32();
        let ti16 = cx.type_i16();
        // For each kernel to run on the GPU, we will later generate one entry of this type.
        // The layout is copied from LLVM.
        let entry_elements = vec![ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr];
        cx.set_struct_body(offload_entry_ty, &entry_elements, false);
        offload_entry_ty
    }

    fn new<'ll>(
        cx: &'ll SimpleCx<'_>,
        region_id: &'ll Value,
        llglobal: &'ll Value,
    ) -> [&'ll Value; 9] {
        let reserved = cx.get_const_i64(0);
        let version = cx.get_const_i16(1);
        let kind = cx.get_const_i16(1);
        let flags = cx.get_const_i32(0);
        let size = cx.get_const_i64(0);
        let data = cx.get_const_i64(0);
        let aux_addr = cx.const_null(cx.type_ptr());
        [reserved, version, kind, flags, region_id, llglobal, size, data, aux_addr]
    }
}

// Taken from the LLVM APITypes.h declaration:
struct KernelArgsTy {
    //  uint32_t Version = 0; // Version of this struct for ABI compatibility.
    //  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
    //  void **ArgBasePtrs =
    //      nullptr;                 // Base pointer of each argument (e.g. a struct).
    //  void **ArgPtrs = nullptr;    // Pointer to the argument data.
    //  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
    //  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
    //  void **ArgNames = nullptr;   // Name of the data for debugging, possibly null.
    //  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
    //  uint64_t Tripcount =
    // 0; // Tripcount for the teams / distribute loop, 0 otherwise.
    // struct {
    //    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
    //    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
    //    uint64_t Unused : 62;
    //  } Flags = {0, 0, 0}; // totals 64 bits, 8 bytes
    //  // The number of teams (for x,y,z dimension).
    //  uint32_t NumTeams[3] = {0, 0, 0};
    //  // The number of threads (for x,y,z dimension).
    //  uint32_t ThreadLimit[3] = {0, 0, 0};
    //  uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
}

impl KernelArgsTy {
    const OFFLOAD_VERSION: u64 = 3;
    const FLAGS: u64 = 0;
    const TRIPCOUNT: u64 = 0;
    fn new_decl<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll Type {
        let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments");
        let tptr = cx.type_ptr();
        let ti64 = cx.type_i64();
        let ti32 = cx.type_i32();
        let tarr = cx.type_array(ti32, 3);

        let kernel_elements =
            vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32];

        cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false);
        kernel_arguments_ty
    }

    fn new<'ll>(
        cx: &'ll SimpleCx<'_>,
        num_args: u64,
        memtransfer_types: &'ll Value,
        geps: [&'ll Value; 3],
    ) -> [(Align, &'ll Value); 13] {
        let four = Align::from_bytes(4).expect("4 Byte alignment should work");
        let eight = Align::EIGHT;

        let ti32 = cx.type_i32();
        let ci32_0 = cx.get_const_i32(0);
        [
            (four, cx.get_const_i32(KernelArgsTy::OFFLOAD_VERSION)),
            (four, cx.get_const_i32(num_args)),
            (eight, geps[0]),
            (eight, geps[1]),
            (eight, geps[2]),
            (eight, memtransfer_types),
            // The next two are debug info. FIXME(offload): set them.
            (eight, cx.const_null(cx.type_ptr())), // dbg
            (eight, cx.const_null(cx.type_ptr())), // dbg
            (eight, cx.get_const_i64(KernelArgsTy::TRIPCOUNT)),
            (eight, cx.get_const_i64(KernelArgsTy::FLAGS)),
            // NumTeams (x, y, z) and ThreadLimit (x, y, z), see the struct layout above.
            (four, cx.const_array(ti32, &[cx.get_const_i32(2097152), ci32_0, ci32_0])),
            (four, cx.const_array(ti32, &[cx.get_const_i32(256), ci32_0, ci32_0])),
            (four, cx.get_const_i32(0)),
        ]
    }
}

// Contains LLVM values needed to manage offloading for a single kernel.
pub(crate) struct OffloadKernelData<'ll> {
    pub offload_sizes: &'ll llvm::Value,
    pub memtransfer_types: &'ll llvm::Value,
    pub region_id: &'ll llvm::Value,
    pub offload_entry: &'ll llvm::Value,
}

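// The three mapper runtime functions below share a single signature. Their declarations
// look roughly like this (mirroring the launcher declaration above):
// ; Function Attrs: nounwind
// declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
// (and likewise for the _update_ and _end_ variants)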
fn gen_tgt_data_mappers<'ll>(
    cx: &'ll SimpleCx<'_>,
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
    let tptr = cx.type_ptr();
    let ti64 = cx.type_i64();
    let ti32 = cx.type_i32();

    let args = vec![tptr, ti64, ti32, tptr, tptr, tptr, tptr, tptr, tptr];
    let mapper_fn_ty = cx.type_func(&args, cx.type_void());
    let mapper_begin = "__tgt_target_data_begin_mapper";
    let mapper_update = "__tgt_target_data_update_mapper";
    let mapper_end = "__tgt_target_data_end_mapper";
    let begin_mapper_decl = declare_offload_fn(&cx, mapper_begin, mapper_fn_ty);
    let update_mapper_decl = declare_offload_fn(&cx, mapper_update, mapper_fn_ty);
    let end_mapper_decl = declare_offload_fn(&cx, mapper_end, mapper_fn_ty);

    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
    attributes::apply_to_llfn(begin_mapper_decl, Function, &[nounwind]);
    attributes::apply_to_llfn(update_mapper_decl, Function, &[nounwind]);
    attributes::apply_to_llfn(end_mapper_decl, Function, &[nounwind]);

    (begin_mapper_decl, update_mapper_decl, end_mapper_decl, mapper_fn_ty)
}

fn add_priv_unnamed_arr<'ll>(cx: &SimpleCx<'ll>, name: &str, vals: &[u64]) -> &'ll llvm::Value {
    let ti64 = cx.type_i64();
    let mut size_val = Vec::with_capacity(vals.len());
    for &val in vals {
        size_val.push(cx.get_const_i64(val));
    }
    let initializer = cx.const_array(ti64, &size_val);
    add_unnamed_global(cx, name, initializer, PrivateLinkage)
}

pub(crate) fn add_unnamed_global<'ll>(
    cx: &SimpleCx<'ll>,
    name: &str,
    initializer: &'ll llvm::Value,
    l: Linkage,
) -> &'ll llvm::Value {
    let llglobal = add_global(cx, name, initializer, l);
    llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global);
    llglobal
}

pub(crate) fn add_global<'ll>(
    cx: &SimpleCx<'ll>,
    name: &str,
    initializer: &'ll llvm::Value,
    l: Linkage,
) -> &'ll llvm::Value {
    let c_name = CString::new(name).unwrap();
    let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, cx.val_ty(initializer), &c_name);
    llvm::set_global_constant(llglobal, true);
    llvm::set_linkage(llglobal, l);
    llvm::set_initializer(llglobal, initializer);
    llglobal
}

// This function returns a memtransfer value which encodes how arguments to this kernel shall be
// mapped to/from the GPU. It also returns a region_id with the name of this kernel, to be
// concatenated into the list of region_ids.
pub(crate) fn gen_define_handling<'ll>(
    cx: &SimpleCx<'ll>,
    offload_entry_ty: &'ll llvm::Type,
    metadata: &[OffloadMetadata],
    types: &[&Type],
    symbol: &str,
) -> OffloadKernelData<'ll> {
    // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
    // reference) types.
    let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) {
        rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta),
        _ => None,
    });

    // FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
    let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
        ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();

    let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
    // Here we figure out whether something needs to be copied to the GPU (=1), from the GPU (=2),
    // or both to and from the GPU (=3). Other values shouldn't affect us for now.
    // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
    // will be 2. For now, everything is 3, until we have our frontend set up.
    // 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later).
    let memtransfer_types =
        add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer);
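    // For a hypothetical kernel `kernel_1` taking a single `&[f64; 16]` argument, the two
    // globals would look roughly like:
    //   @.offload_sizes.kernel_1 = private unnamed_addr constant [1 x i64] [i64 128]
    //   @.offload_maptypes.kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]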

    // Next: For each function, generate these three entries: a weak constant,
    // the llvm.rodata entry name, and the llvm_offload_entries value.

    let name = format!(".{symbol}.region_id");
    let initializer = cx.get_const_i8(0);
    let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);

    let c_entry_name = CString::new(symbol).unwrap();
    let c_val = c_entry_name.as_bytes_with_nul();
    let offload_entry_name = format!(".offloading.entry_name.{symbol}");

    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
    llvm::set_alignment(llglobal, Align::ONE);
    llvm::set_section(llglobal, c".llvm.rodata.offloading");

    let name = format!(".offloading.entry.{symbol}");

    // See the __tgt_offload_entry documentation above.
    let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);

    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
    let c_name = CString::new(name).unwrap();
    let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
    llvm::set_global_constant(offload_entry, true);
    llvm::set_linkage(offload_entry, WeakAnyLinkage);
    llvm::set_initializer(offload_entry, initializer);
    llvm::set_alignment(offload_entry, Align::EIGHT);
    let c_section_name = CString::new("llvm_offload_entries").unwrap();
    llvm::set_section(offload_entry, &c_section_name);
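
    // For our hypothetical `kernel_1`, the generated globals would look roughly like:
    //   @.kernel_1.region_id = weak unnamed_addr constant i8 0
    //   @.offloading.entry_name.kernel_1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00",
    //     section ".llvm.rodata.offloading", align 1
    //   @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1,
    //     i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.kernel_1, i64 0, i64 0, ptr null },
    //     section "llvm_offload_entries", align 8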

    OffloadKernelData { offload_sizes, memtransfer_types, region_id, offload_entry }
}

fn declare_offload_fn<'ll>(
    cx: &'ll SimpleCx<'_>,
    name: &str,
    ty: &'ll llvm::Type,
) -> &'ll llvm::Value {
    crate::declare::declare_simple_fn(
        cx,
        name,
        llvm::CallConv::CCallConv,
        llvm::UnnamedAddr::No,
        llvm::Visibility::Default,
        ty,
    )
}

// For each kernel *call*, we now use some of our previously declared globals to move data to and
// from the GPU. For now, we only handle the data transfer part of it.
// If two consecutive kernels use the same memory, we still move it to the host and back to the GPU.
// Since in our frontend users (by default) don't have to specify data transfer, this is something
// we should optimize in the future! We also assume that everything should be copied back and forth,
// but sometimes we can directly zero-allocate on the device and only move back, or if something is
// immutable, we might only copy it to the device, but not back.
//
// Current steps:
// 0. Alloca some variables for the following steps.
// 1. Set the insert point before the kernel call.
// 2. Generate all the GEPs and stores, to be used in 3).
// 3. Generate __tgt_target_data_begin calls to move data to the GPU.
//
// Unchanged: keep the kernel call. Later we will move the kernel itself to the GPU.
//
// 4. Set the insert point after the kernel call.
// 5. Generate all the GEPs and stores, to be used in 6).
// 6. Generate __tgt_target_data_end calls to move data from the GPU.
pub(crate) fn gen_call_handling<'ll>(
    cx: &SimpleCx<'ll>,
    bb: &BasicBlock,
    offload_data: &OffloadKernelData<'ll>,
    args: &[&'ll Value],
    types: &[&Type],
    metadata: &[OffloadMetadata],
) {
    let OffloadKernelData { offload_sizes, offload_entry, memtransfer_types, region_id } =
        offload_data;
    let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx);
    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
    let tptr = cx.type_ptr();
    let ti32 = cx.type_i32();
    let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
    let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
    cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false);

    let tgt_kernel_decl = KernelArgsTy::new_decl(&cx);
    let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);

    let mut builder = SBuilder::build(cx, bb);

    let num_args = types.len() as u64;
    let ip = unsafe { llvm::LLVMRustGetInsertPoint(&builder.llbuilder) };

    // FIXME(Sa4dUs): dummy loads are a temporary workaround; we should find a proper way to
    // prevent these variables from being optimized away.
    for val in [offload_sizes, offload_entry] {
        unsafe {
            let dummy = llvm::LLVMBuildLoad2(
                &builder.llbuilder,
                llvm::LLVMTypeOf(val),
                val,
                b"dummy\0".as_ptr() as *const _,
            );
            llvm::LLVMSetVolatile(dummy, llvm::TRUE);
        }
    }

    // Step 0)
    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
    // %6 = alloca %struct.__tgt_bin_desc, align 8
    let llfn = unsafe { llvm::LLVMGetBasicBlockParent(bb) };
    unsafe {
        llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, llfn);
    }
    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");

    let ty = cx.type_array(cx.type_ptr(), num_args);
    // Baseptrs are just the input pointers to the kernel, stored in a local alloca.
    let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
    // Ptrs are the result of a GEP into the baseptr, at least for our trivial types.
    let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
    // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
    let ty2 = cx.type_array(cx.type_i64(), num_args);
    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");

    // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
    let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");

    // Step 1)
    unsafe {
        llvm::LLVMRustRestoreInsertPoint(&builder.llbuilder, ip);
    }
    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
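    // The 32 above is the size of %struct.__tgt_bin_desc on 64-bit targets:
    // one i32 (4 bytes), 4 bytes of padding, and three pointers (24 bytes).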

    // Now, for each function param, we compute a pointer to be passed to one of our mappers.
    let mut vals = vec![];
    let mut geps = vec![];
    let i32_0 = cx.get_const_i32(0);
    for &v in args {
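        // A GEP with a single zero index simply re-yields the base address; the f32
        // element type is effectively a placeholder here (see the FIXME on argument
        // types further down).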
        let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]);
        vals.push(v);
        geps.push(gep);
    }

    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
    let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty);
    let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty);
    let init_ty = cx.type_func(&[], cx.type_void());
    let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);

    // FIXME(offload): Later we want to add them to the wrapper code, rather than our main function.
    // call void @__tgt_register_lib(ptr noundef %6)
    builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None);
    // call void @__tgt_init_all_rtls()
    builder.call(init_ty, init_rtls_decl, &[], None);

    for i in 0..num_args {
        let idx = cx.get_const_i32(i);
        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
        builder.store(vals[i as usize], gep1, Align::EIGHT);
        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
        builder.store(geps[i as usize], gep2, Align::EIGHT);
        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
        // FIXME(offload): write an offload frontend and handle arbitrary types.
        builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
    }
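    // For argument i this emits roughly (virtual register names are illustrative):
    //   %b = getelementptr inbounds [N x ptr], ptr %.offload_baseptrs, i32 0, i32 i
    //   store ptr %arg_i, ptr %b, align 8
    // plus the matching stores into %.offload_ptrs and %.offload_sizes.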

    // For now we have a very simplistic indexing scheme into our
    // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our GPU frontend PR.
    fn get_geps<'a, 'll>(
        builder: &mut SBuilder<'a, 'll>,
        cx: &'ll SimpleCx<'ll>,
        ty: &'ll Type,
        ty2: &'ll Type,
        a1: &'ll Value,
        a2: &'ll Value,
        a4: &'ll Value,
    ) -> [&'ll Value; 3] {
        let i32_0 = cx.get_const_i32(0);

        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
        [gep1, gep2, gep3]
    }

    fn generate_mapper_call<'a, 'll>(
        builder: &mut SBuilder<'a, 'll>,
        cx: &'ll SimpleCx<'ll>,
        geps: [&'ll Value; 3],
        o_type: &'ll Value,
        fn_to_call: &'ll Value,
        fn_ty: &'ll Type,
        num_args: u64,
        s_ident_t: &'ll Value,
    ) {
        let nullptr = cx.const_null(cx.type_ptr());
        let i64_max = cx.get_const_i64(u64::MAX);
        let num_args = cx.get_const_i32(num_args);
        let args =
            vec![s_ident_t, i64_max, num_args, geps[0], geps[1], geps[2], o_type, nullptr, nullptr];
        builder.call(fn_ty, fn_to_call, &args, None);
    }
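    // For a kernel with three pointer arguments, the resulting begin_mapper call looks
    // roughly like:
    //   call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %gep1,
    //     ptr %gep2, ptr %gep3, ptr @.offload_maptypes.kernel_1, ptr null, ptr null)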

    // Step 2)
    let s_ident_t = generate_at_one(&cx);
    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
    generate_mapper_call(
        &mut builder,
        &cx,
        geps,
        memtransfer_types,
        begin_mapper_decl,
        fn_ty,
        num_args,
        s_ident_t,
    );
    let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);

    // Step 3)
    // Here we fill the __tgt_kernel_arguments struct; see the KernelArgsTy documentation above.
    for (i, value) in values.iter().enumerate() {
        let ptr = builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]);
        builder.store(value.1, ptr, value.0);
    }

    let args = vec![
        s_ident_t,
        // FIXME(offload): give users a way to select which GPU to use.
        cx.get_const_i64(u64::MAX), // MAX == -1.
        // FIXME(offload): Don't hardcode the number of teams and threads in the future.
        cx.get_const_i32(2097152),
        cx.get_const_i32(256),
        region_id,
        a5,
    ];
    builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
    // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)

    // Step 4)
    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
    generate_mapper_call(
        &mut builder,
        &cx,
        geps,
        memtransfer_types,
        end_mapper_decl,
        fn_ty,
        num_args,
        s_ident_t,
    );

    builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);

    drop(builder);
}