GPU Acceleration
Rabab Alomairy & Evelyne Ringoot
Vector Addition Example
vector_size = 1024
a = rand(1:4, vector_size)
b = rand(1:4, vector_size)
c = zeros(Int, vector_size)

function vadd(c, a, b)
    Threads.@threads for i in eachindex(c)
        c[i] = a[i] + b[i]
    end
    return
end
CPU code
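As a quick check, the CPU version can be run and compared against broadcasting (the @assert line is our addition, not part of the slides):

vadd(c, a, b)        # fills c in place, one chunk of iterations per Julia thread
@assert c == a .+ b  # agrees with Julia's built-in broadcast addition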
using CUDA

da = CuArray(a)
db = CuArray(b)
dc = CUDA.zeros(Int, size(a))

function vadd(c, a, b)
    i = threadIdx().x
    c[i] = a[i] + b[i]
    return
end

@cuda threads=length(a) vadd(dc, da, db)
GPU (CUDA) kernel
Notice: only the local thread index is used.
Any ideas what the limitation is here?
One thread per element
One block = one SM (at most 1024 threads per block on an A100)
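That 1024 is the device's threads-per-block limit, which CUDA.jl can query directly (a minimal sketch; on an A100 this returns 1024):

using CUDA
# Upper bound on the number of threads a single block may contain:
CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)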
vector_size = 1024
da = CuArray(rand(vector_size))
db = CuArray(rand(vector_size))
dc = CUDA.zeros(Float64, vector_size)

function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    c[i] = a[i] + b[i]
    return
end

@cuda threads=length(da) vadd(dc, da, db)
Multiple Blocks (1024 threads per block on an A100)
vector_size = 10240
da = CuArray(rand(vector_size))
db = CuArray(rand(vector_size))
dc = CUDA.zeros(Float64, vector_size)

function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    c[i] = a[i] + b[i]
    return
end

@cuda threads=1024 blocks=cld(length(da), 1024) vadd(dc, da, db)
cld: ceiling division, so enough blocks are launched to cover every element.
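Checking the launch arithmetic (the second case and its guard are our addition; the slide's vector length happens to divide evenly):

cld(10240, 1024)  # == 10 blocks, exactly covering the vector
cld(10000, 1024)  # == 10 blocks with 240 spare threads; the kernel would
                  # then need a bounds check such as: i <= length(c) || return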
Different blocks can do different operations in parallel

i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
if blockIdx().x == 1
    c[i] = a[i] + b[i]
elseif blockIdx().x == 2
    c[i] = a[i] * b[i]
end
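Wrapped into a complete kernel, the idea looks as follows (a sketch; the function name and the two-block launch are our choices, not from the slides):

using CUDA

function vadd_vmul(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if blockIdx().x == 1
        c[i] = a[i] + b[i]   # block 1 sums elements 1:512
    elseif blockIdx().x == 2
        c[i] = a[i] * b[i]   # block 2 multiplies elements 513:1024, concurrently
    end
    return
end

da, db = CuArray(rand(1024)), CuArray(rand(1024))
dc = CUDA.zeros(Float64, 1024)
@cuda threads=512 blocks=2 vadd_vmul(dc, da, db)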
Enhanced GPU Vector Addition
vector_size = 10240
da = CuArray(rand(vector_size))
db = CuArray(rand(vector_size))
dc = CUDA.zeros(Float64, vector_size)

function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    c[i] = a[i] + b[i]
    return
end

@cuda threads=1024 blocks=cld(length(da), 1024) vadd(dc, da, db)
The index line computes the global position of the thread across all blocks of the grid.
[Diagram: the grid consists of gridDim().x = 10 blocks, labelled blockIdx().x = 1, 2, 3, …, 10; within each block, threadIdx().x runs from 1 to 1024.]
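Working one case through the formula: the 5th thread of the 3rd block (with blockDim().x == 1024) handles

i = 5 + (3 - 1) * 1024   # == 2053, i.e. element 2053 of the vector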
Matrix Multiplication Example
Matrix Multiplication Example (CUDA C++ )
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel: each thread computes one element of C
// by accumulating the dot product into Cvalue
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e]
                * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}
CUDA C++ Programming Guide
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=matrix%20multiply
CUDA.jl
using CUDA

function MatrixMultiplication!(A, B, C)
    row = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    col = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    sum = zero(eltype(C))
    if row <= size(A, 1) && col <= size(B, 2)
        for i = 1:size(A, 2)
            @inbounds sum += A[row, i] * B[i, col]
        end
        @inbounds C[row, col] = sum
    end
    return
end

A = CUDA.rand(128, 128)
B = CUDA.rand(128, 128)
C = CUDA.rand(128, 128)
threads = (16, 16)
blocks = (8, 8)
@cuda threads=threads blocks=blocks MatrixMultiplication!(A, B, C)
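To verify the result, copy it back to the host and compare with Julia's built-in matrix multiply (our addition, not on the slide):

CUDA.synchronize()                      # wait for the kernel to finish
@assert Array(C) ≈ Array(A) * Array(B)  # matches the CPU reference result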
AMDGPU.jl
using AMDGPU

function MatrixMultiplication!(A, B, C)
    row = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
    col = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
    sum = zero(eltype(C))
    if row <= size(A, 1) && col <= size(B, 2)
        for i = 1:size(A, 2)
            @inbounds sum += A[row, i] * B[i, col]
        end
        @inbounds C[row, col] = sum
    end
    return
end

A = AMDGPU.rand(128, 128)
B = AMDGPU.rand(128, 128)
C = AMDGPU.rand(128, 128)
threads = (16, 16)
blocks = (8, 8)
@roc groupsize=threads gridsize=blocks MatrixMultiplication!(A, B, C)
oneAPI.jl and Metal.jl
using oneAPI

function MatrixMultiplication!(A, B, C)
    row = get_global_id(0)
    col = get_global_id(1)
    sum = zero(eltype(C))
    if row <= size(A, 1) && col <= size(B, 2)
        for i = 1:size(A, 2)
            @inbounds sum += A[row, i] * B[i, col]
        end
        @inbounds C[row, col] = sum
    end
    return
end
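The slide shows only the kernel; a launch with oneAPI.jl could look roughly like this (a sketch, assuming @oneapi's items/groups keywords, the analogue of @cuda's threads/blocks):

A = oneArray(rand(Float32, 128, 128))
B = oneArray(rand(Float32, 128, 128))
C = oneArray(zeros(Float32, 128, 128))
@oneapi items=(16, 16) groups=(8, 8) MatrixMultiplication!(A, B, C)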
using Metal

function MatrixMultiplication!(A, B, C)
    row, col = thread_position_in_grid_2d()
    sum = zero(eltype(C))
    if row <= size(A, 1) && col <= size(B, 2)
        for i = 1:size(A, 2)
            @inbounds sum += A[row, i] * B[i, col]
        end
        @inbounds C[row, col] = sum
    end
    return
end
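And similarly for Metal.jl (a sketch; Metal GPUs work with Float32, and @metal takes threads/groups keywords):

A = MtlArray(rand(Float32, 128, 128))
B = MtlArray(rand(Float32, 128, 128))
C = MtlArray(zeros(Float32, 128, 128))
@metal threads=(16, 16) groups=(8, 8) MatrixMultiplication!(A, B, C)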
KernelAbstractions.jl
using KernelAbstractions

@kernel function MatrixMultiplication!(A, B, C)
    row, col = @index(Global, NTuple)
    sum = zero(eltype(C))
    if row <= size(A, 1) && col <= size(B, 2)
        for i = 1:size(A, 2)
            @inbounds sum += A[row, i] * B[i, col]
        end
        @inbounds C[row, col] = sum
    end
end
A = KernelAbstractions.rand(Backend, 128, 128)
B = KernelAbstractions.rand(Backend, 128, 128)
C = KernelAbstractions.rand(Backend, 128, 128)
workgroupsize = (16, 16)
kernel! = MatrixMultiplication!(Backend, workgroupsize)
kernel!(A, B, C, ndrange=size(C))
KernelAbstractions.synchronize(Backend)
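Here Backend stands for a concrete device backend; one way to pick it (a sketch listing the standard backend constructors, which one applies depends on the hardware):

using CUDA
Backend = CUDABackend()                    # NVIDIA GPUs
# using AMDGPU; Backend = ROCBackend()     # AMD GPUs
# using oneAPI; Backend = oneAPIBackend()  # Intel GPUs
# using Metal;  Backend = MetalBackend()   # Apple GPUs
# Backend = CPU()                          # portable fallback on the host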
Kernel language — @kernel
KernelAbstractions defines a language valid within @kernel definitions
Kernel language: Indexing
Indexing method: @index(Kind, Type)
Indexing kind: Global (position in the ndrange), Local (position within the workgroup), or Group (the workgroup itself)
Index type: Linear, Cartesian, or NTuple
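For example, inside a @kernel body (a small demo kernel of our own, following the @index conventions used above):

@kernel function index_demo!(out)
    gi = @index(Global, Linear)      # one integer across the whole ndrange
    ci = @index(Global, Cartesian)   # the same position as a CartesianIndex
    li = @index(Local, Linear)       # position within the current workgroup
    out[gi] = li                     # record each item's local index
end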
Shared memory vs Global memory
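KernelAbstractions exposes workgroup-shared memory through @localmem; as a minimal illustration of the shared-vs-global distinction (the reverse_chunks! kernel is our own example, not from the slides):

using KernelAbstractions

# Each workgroup of 64 items stages its chunk of src in fast shared
# (local) memory, then writes the chunk back to dst reversed.
@kernel function reverse_chunks!(dst, src)
    gi = @index(Global, Linear)
    li = @index(Local, Linear)
    tile = @localmem eltype(src) (64,)  # visible to this workgroup only
    tile[li] = src[gi]                  # global memory -> shared memory
    @synchronize                        # wait until the whole group has loaded
    dst[gi] = tile[64 - li + 1]         # read another item's slot from shared memory
end

backend = CPU()  # assumption: swap in CUDABackend(), ROCBackend(), ...
src = KernelAbstractions.zeros(backend, Float32, 256)
dst = similar(src)
reverse_chunks!(backend, 64)(dst, src, ndrange=length(src))
KernelAbstractions.synchronize(backend)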
Julia GPU Notebook: