GPUs in MLIR
MLIR Open Design Meeting
December 12, 2019
Stephan Herhut, Alex Zinenko
Outline
Modeling GPU execution as dialect(s)
Design goals
Design non-goals
Scope
Kernel-side dialect
Kernel launch and grid specification

Grid specification: the blocks(...) and threads(...) clauses define the launch grid.
Additional arguments are passed via args(...).
Work item IDs are available as region arguments.
Any dialect can be used inside the region (but not another launch*).
*not yet modeling dynamic parallelism

// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
    blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
    threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
    args(%arg0 = %0, %arg1 = %1) : f32, memref<?xf32> {
  // any dialect here
  %2 = index_cast %tx : index to i64
  %3 = addi %2, %2 : i64
  // ...
  gpu.return
}
// ...
Kernel launches from host code
“Inline” form simplifies host/device code motion

// ...
%c0 = constant 0 : index
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
    blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
    threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
    args(%arg0 = %c0, %arg1 = %0) : index, memref<?xi64> {
  %1 = index_cast %arg0 : index to i64
  %c0_1 = constant 0 : index
  store %1, %arg1[%c0_1] : memref<?xi64>
  gpu.return
}
// no more uses of %c0 here
“Inline” form simplifies host/device code motion

// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
    blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
    threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
    args(%arg1 = %0) : memref<?xi64> {
  %c0 = constant 0 : index
  %1 = index_cast %c0 : index to i64
  %c0_1 = constant 0 : index
  store %1, %arg1[%c0_1] : memref<?xi64>
  gpu.return
}
// no more uses of %c0 here

E.g., constant propagation can operate across host and device: %c0 has been sunk into the kernel region.
“Inline” form simplifies host/device code motion

// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
    blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
    threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
    args(%arg1 = %0) : memref<?xi64> {
  %c0 = constant 0 : index
  %1 = index_cast %c0 : index to i64
  store %1, %arg1[%c0] : memref<?xi64>
  gpu.return
}
// no more uses of %c0 here

Which in turn enables other transformations: the redundant duplicate constant %c0_1 has been eliminated.
Outlined form

The kernel body can get outlined into a new kernel function inside a separate module.
Since modules are just Ops, they can be nested: the kernel module is a full-fledged module inside the host module.

// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
"gpu.launch_func"(%c1, %c1, %c1,
                  %c32, %c1, %c1,
                  %0)
    { kernel_module = "mod",
      kernel = "func" }
    : (index, index, index, index, index, index, memref<?xi64>) -> ()
// ...
module @mod attributes {gpu.kernel_module} {
  gpu.func @func(%arg0: memref<?xi64>) kernel {
    %c0 = constant 0 : index
    %0 = index_cast %c0 : index to i64
    store %0, %arg0[%c0] : memref<?xi64>
    gpu.return
  }
}

Lower to NVVM
module @mod {
  llvm.func @func(%arg0: !llvm<"{i64*, i64*, i64, [1 x i64], [1 x i64]}">)
      attributes { nvvm.kernel } {
    %0 = llvm.mlir.constant(0 : index) : !llvm.i64
    %1 = llvm.extractvalue %arg0[1]
        : !llvm<"{i64*, i64*, i64, [1 x i64], [1 x i64]}">
    %2 = llvm.getelementptr %1[%0] : (!llvm<"i64*">, !llvm.i64) -> !llvm<"i64*">
    llvm.store %0, %2 : !llvm<"i64*">
    // ...
    nvvm.something // ...
    // ...
  }
}
Lower to SPIR-V
module @mod {
  spv.func @func(%arg0) {
    // ...
    spv.something // ...
    // ...
  }
}
Lower to ROCm
module @mod {
  llvm.func @func(%arg0) {
    // ...
    rocm.something // ...
    // ...
  }
}
Separate compilation with nested modules

// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
"gpu.launch_func"(%c1, %c1, %c1,
                  %c32, %c1, %c1,
                  %0)
    { kernel_module = "mod",
      kernel = "func" }
    : (index, index, index, index, index, index, memref<?xi64>) -> ()
// ...

Device-specific compilation replaces the nested kernel module with a binary blob embedded in the host module:

llvm.mlir.global constant @mod_func("NVPTX0000_Some_Binary_Blob_Understood_By_The_Device") : !llvm<"[51 x i8]">
Separate compilation with nested modules

Lower the launch, e.g., to LLVM function calls. Host-side dialect to come.

%0 = llvm.mlir.addressof @mod_func : !llvm<"[51 x i8]*">
%1 = llvm.mlir.constant(0 : index) : !llvm.i64
%2 = llvm.getelementptr %0[%1] // ...
// ...
%42 = llvm.alloca // ... allocate module handle
llvm.call @mcuModuleLoad(%42, %2) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
// ... check errors ...
%43 = llvm.alloca // ... allocate function handle
llvm.call @mcuModuleGetFunction // ...
// ... check errors ...
%44 = llvm.mlir.constant(32 : index) : !llvm.i64
%45 = llvm.mlir.constant(1 : index) : !llvm.i64
// ... prepare arguments ...
llvm.call @mcuLaunchKernel(%43, %45, %45, %45, %44, %45, ...)
// ... check errors ...
Thin wrapper for instrumentation and compatibility
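For illustration, the lowering also needs declarations of this wrapper API in the host module. A minimal sketch, assuming signatures that match the calls above; the names and parameter lists are illustrative, not a specification of the actual runtime library:

// Hypothetical external declarations for the runtime wrapper.
llvm.func @mcuModuleLoad(!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
llvm.func @mcuModuleGetFunction(!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
llvm.func @mcuLaunchKernel(!llvm<"i8*">, !llvm.i64, !llvm.i64, !llvm.i64,
                           !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i32,
                           !llvm<"i8*">, !llvm<"i8**">, !llvm<"i8**">) -> !llvm.i32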
Sample compilation flow
Operation Selection
“Guidelines” for adding new operations
Example: Subgroup communication primitives
Example: Shuffle operations in targets

Target intrinsics and the shuffle modes they cover:
  shfl.sync           (up, down, permute, swizzle)
  swizzle             (swizzle)
  permute             (permute)
  bpermute            (up, down)
  subgroupShuffle     (permute)
  subgroupShuffleXor  (swizzle)
  subgroupShuffleUp   (up)
  subgroupShuffleDown (down)
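A single target-independent shuffle op could abstract over these intrinsics. A minimal sketch, assuming a generic op with a mode attribute and a validity result; the op name and syntax are illustrative, not the final design:

// Hypothetical portable shuffle: exchange %val within a subgroup of %width
// lanes; the mode selects up / down / xor (swizzle) / idx (permute).
%shfl, %valid = "gpu.shuffle"(%val, %offset, %width) { mode = "xor" }
    : (f32, i32, i32) -> (f32, i1)
// Lowering maps the mode onto shfl.sync, swizzle/permute/bpermute, or
// subgroupShuffle* depending on the target.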
Example: Workgroup reduction
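A workgroup-wide reduction could likewise be exposed as one portable op that targets lower to subgroup shuffles plus workgroup memory, or to a native primitive where available. A minimal sketch; the op name and syntax are illustrative:

// Hypothetical workgroup reduction: every work item contributes %val and
// every work item receives the combined result.
%sum = "gpu.all_reduce"(%val) { op = "add" } : (f32) -> f32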
Execution Model
Execution model
Open design!
How do we make transformations aware of this?
Execution model: LLVM approach
Execution model: LLVM approach problems
Some operations cannot be made control-dependent on more values.
Some others cannot be made control-dependent on fewer values.

Execution model: LLVM approach problems for MLIR
Control flow in MLIR regions is defined by the surrounding operation.
We would like to keep MLIR extensible.
Execution model: attributes / traits?
gpu.launch(...) {
  some.op()
  call @func()   // HasTrait: NoMoreControl
  another.op()   // HasTrait: NoLessControl
}
Core IR property?
Execution model: structured control flow only?
gpu.launch(...) {
  // ...
  gpu.barrier
  if (%0) {
    // ...
  } else {
    // ...
  }
}
Cannot use any dialect inside GPU.
Potential op duplication (gpu.if ?)
Execution model: structured control flow only?
gpu.launch(...) attributes {gpu.region} {
  // ...
  gpu.barrier
  if (%0) attributes {gpu.region} {
    // ...
  } else {
    // ...
  }
}
Core IR property?
Execution model: explicit vector mask?
gpu.launch(...) mask(@m) {
  // ...
  gpu.barrier @m
  %42 = cmpf "olt" %41, %40 : f32
  gpu.cond_br @m %42, ^bb1, ^bb2
^bb1:
  // ...
  gpu.ballot @m
^bb2:
  // ...
}
Cannot use any dialect inside GPU.
Potential op duplication (gpu.if ?)
Execution model dilemma
Restrict composability
Expose the model to core IR
Open design!
We don’t have an answer
Memory Attribution
GPU memory hierarchy
memref<42xf32, 1>   // global (device) memory
memref<42xf32, 3>   // workgroup memory
memref<42xf32, 5>   // private memory
memref<42xf32, 4>   // constant memory
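Because the address space is encoded in the memref type, ordinary standard-dialect ops can address any level of the hierarchy. A minimal sketch; the buffer and index names are illustrative:

// Copy one element from a global buffer into workgroup and private buffers;
// only the memref types differ, the load/store ops stay the same.
%v = load %global[%i] : memref<42xf32, 1>
store %v, %wg[%i] : memref<42xf32, 3>
store %v, %priv[%c0] : memref<42xf32, 5>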
Modelling Lifetime: Using globals
PTX and SPIR-V model this as global values on the module level.
module attributes {gpu.kernel_module} {
  llvm.mlir.global @wg_memory() {addr_space = 3 : i32} : !llvm<"[32 x float]">
  llvm.mlir.global @priv_memory() {addr_space = 5 : i32} : !llvm<"[1 x float]">
  llvm.func @kernel() {
    %0 = llvm.mlir.addressof @wg_memory : !llvm<"[32 x float] addrspace(3)*">
    // ...
  }
  llvm.func @kernelTwo() {
    %0 = llvm.mlir.addressof @wg_memory : !llvm<"[32 x float] addrspace(3)*">
    // ...
  }
}
Using globals suggests lifetime aligned with the module. In reality it is aligned with kernel invocation.
This leads to unintuitive aliasing semantics.
Modelling Lifetime: Alloca
We could model this with “dynamic” allocation.
module attributes {gpu.kernel_module} {
  func @kernel() {
    %0 = gpu.alloca() : memref<32xf32, 3> // allocate in workgroup memory
    %1 = gpu.alloca() : memref<1xf32, 5>  // allocate in private memory
    // ...
    call @kernel() : () -> ()             // recursive call
    // ...
  }
}
Using alloca suggests stack allocation, while in reality the allocation is static.
Recursion would have unintuitive aliasing semantics.
Modelling Lifetime: Function attribution
Use static memory declarations at the function level.
module attributes {gpu.kernel_module} {
  gpu.func @kernel()
      workgroup(%0: memref<32xf32, 3>)
      private(%1: memref<1xf32, 5>) {
    // ...
  }
}
Lifetime is coupled with invocation of kernel function.
Allocation is static.
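When lowering such a function to a target like NVVM, the attributed buffers can be rewritten into the address-space globals shown earlier. A minimal sketch of one possible lowering; the names are illustrative, not the actual pass output:

module attributes {gpu.kernel_module} {
  // One global per attribution, named after the kernel so that different
  // kernels do not accidentally share workgroup buffers.
  llvm.mlir.global @kernel_wg_buffer() {addr_space = 3 : i32} : !llvm<"[32 x float]">
  llvm.func @kernel() attributes { nvvm.kernel } {
    // The former workgroup attribution %0 is rebound to the global's address.
    %0 = llvm.mlir.addressof @kernel_wg_buffer : !llvm<"[32 x float] addrspace(3)*">
    // ...
  }
}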
Host-side dialect
Status Quo
// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
Kernel launches are synchronous.
There is only a single device.
No notion of memory allocation.
How about streams?
// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
gpu.launch
blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
on(%stream)
Kernel launches would be asynchronous.
Can encode many devices.
Could design allocation around streams.
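For illustration of the last point, allocation could also be made stream-ordered; the ops below are hypothetical, not part of the dialect:

// Hypothetical stream-ordered allocation: the buffer is produced by work
// queued on %stream and may be used by later launches on the same stream
// without extra synchronization.
%buf = gpu.alloc on(%stream) : memref<32xf32>
gpu.launch
    blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
    threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
    on(%stream)
    args(%arg0 = %buf) : memref<32xf32> {
  // ...
  gpu.return
}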
How about chaining?
// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
%chn1 = gpu.launch
blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
deps(%chn0)
// ...
%chn2 = gpu.wait(%chn1)
Kernel launches would be asynchronous?
Chains could encode devices?
Could design allocation around chains?
How about futures?
// ...
%c1 = constant 1 : index
%c32 = constant 32 : index
%fut0, %fut1 = gpu.launch
blocks(%bx, %by, %bz) in (%nbx = %c1, %nby = %c1, %nbz = %c1)
threads(%tx, %ty, %tz) in (%ntx = %c32, %nty = %c1, %ntz = %c1)
args(%arg0, %arg1)
// ...
%val = gpu.await(%fut0)
Kernel launches would be asynchronous?
How could we encode devices?
Could design allocation around futures?
mlir@tensorflow.org