1 of 17

GPU Acceleration

Rabab Alomairy & Evelyne Ringoot

2 of 17

Vector Addition Example

2

vector_size = 1024

a = rand(1:4, vector_size)

b = rand(1:4, vector_size)

c = zeros(Int, vector_size)

function vadd(c, a, b)

Thread.@threads for i in 1:vector_size

c[i] = a[i] + b[i]

end

return

end

CPU code

da = CuArray(a)

db = CuArray(b)

dc = CUDA.zeros(Int, size(a))

function vadd(c, a, b)

i = threadIdx().x

c[i] = a[i] + b[i]

return

end

@cuda threads=length(a) vadd(dc, da, db)

GPU (CUDA) kernel

Notice: only local index

Any Ideas what is the limitation here?

3 of 17

One Thread

c[5] = a[5] + b[5]

4 of 17

One Block = one SM (= 1024 Threads on A100)

Vector size greater than 1024 is an error

Vector_size=1024

da = CuArray(rand(vector_size))

db = CuArray (rand(vector_size))

dc = CUDA.zeros(Int, size(a))

function vadd(c, a, b)

i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

c[i] = a[i] + b[i]

return

End

@cuda threads=length(da) vadd(dc, da, db)

5 of 17

Mutiple Blocks (= 1024 Threads on A100)

Vector_size=10240

da = CuArray(rand(vector_size))

db = CuArray (rand(vector_size))

dc = CUDA.zeros(Int, size(a))

function vadd(c, a, b)

i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

c[i] = a[i] + b[i]

return

End

@cuda threads=1024 blocks=cld(length(da), 1024) vadd(dc, da, db)

cld, ceiling division

6 of 17

Different Block can Do Different Operations in parallel

i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

if(blockIdx().x == 1)

c[i] = a[i] + b[i]

else if (blockIdx().x == 2)

c[i] = a[i] * b[i]

7 of 17

Enhanced GPU Vector Addition

7

Vector_size=10240

da = CuArray(rand(vector_size))

db = CuArray (rand(vector_size))

dc = CUDA.zeros(Int, size(a))

function vadd(c, a, b)

i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

c[i] = a[i] + b[i]

return

end

@cuda threads=1024 blocks=cld(length(da), 1024) vadd(dc, da, db)

threadIdx().x + (blockIdx().x - 1) * blockDim().x = 3 + (3-1) * 1024 = 2051

Gets the global index of the thread in a multidimensional grid

1	2	3	…	1024

1	2	3	…	1024

1	2	3	…	1024

1	2	3	…	1024

…

blockIdx().x = 1

blockIdx().x = 2

blockIdx().x = 3

blockIdx().x = 10

threadIdx().x

gridDim().x = 10

8 of 17

Matrix Multiplication Example

Matrix multiplication for computing each element of matrix C from matrices A and B can be written:

9 of 17

Matrix Multiplication Example (CUDA C++ )

1. __global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

2.

3. void MatMul(const Matrix A, const Matrix B, Matrix C)

4. {

5. Matrix d_A;

6. d_A.width = A.width; d_A.height = A.height;

7. size_t size = A.width * A.height * sizeof(float);

8. cudaMalloc(&d_A.elements, size);

9. cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

10. Matrix d_B;

11. d_B.width = B.width; d_B.height = B.height;

12. size = B.width * B.height * sizeof(float);

13. cudaMalloc(&d_B.elements, size);

14. cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

15.

16. Matrix d_C;

17. d_C.width = C.width; d_C.height = C.height;

18. size = C.width * C.height * sizeof(float);

19. cudaMalloc(&d_C.elements, size);

20.

21. dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

22. dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

23. MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

24.

25. cudaMemcpy(C.elements, d_C.elements, size,

26. cudaMemcpyDeviceToHost);

27.

28. cudaFree(d_A.elements);

29. cudaFree(d_B.elements);

30. cudaFree(d_C.elements);

31. }

32.

33. __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

34. {

35. float Cvalue = 0;

36. int row = blockIdx.y * blockDim.y + threadIdx.y;

37. int col = blockIdx.x * blockDim.x + threadIdx.x;

38. for (int e = 0; e < A.width; ++e)

39. Cvalue += A.elements[row * A.width + e]

40. * B.elements[e * B.width + col];

41. C.elements[row * C.width + col] = Cvalue;

42. }

CUDA C++ Programming Guide

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=matrix%20multiply

10 of 17

CUDA.jl

using CUDA

function MatrixMultiplication!(A,B,C)

row = (blockIdx().x - 1) * blockDim().x + threadIdx().x

col = (blockIdx().y - 1) * blockDim().y + threadIdx().y

sum = zero(eltype(C))

if row <= size(A, 1) && col < size(B, 2)

for i = 1:size(A, 2)

@inbounds sum += A[row, i] * B[i, col]

end

@inbounds C[row, col] = sum

end

return

end

A = CUDA.rand(128, 128)

B = CUDA.rand(128, 128)

C = CUDA.rand(128, 128)

threads = (16, 16)

blocks = (8, 8)

@cuda threads=threads blocks=blocks MatrixMultiplication!(A, B, C)

11 of 17

AMDGPU.jl

using AMDGPU

function MatrixMultiplication!(A, B,C)

row = (workgroupIdx().x - 1) * workgroupDim(). x + workitemIdx().x

col = (workgroupIdx().y - 1) * workgroupDim(). y + workitemIdx().y

sum = zero (eltype(C))

if row <= size(A, 1) 8& col <= size(B, 2)

for 1 = 1:size(A, 2)

@inbounds sum += A[row, i] * B[i, col]

end

@inbounds C[row, col] = sum

end

return

end

A = AMDGPU.rand(128, 128)

B = AMDGPU.rand(128, 128)

C = AMDGPU.rand(128, 128)

threads = (16, 16)

blocks = (8, 8)

@roc groupsize=threads gridsize=blocks MatrixMultiplication!(A, B, C)

12 of 17

oneAPI.jl and Metal.jl

using oneAPI

function MatrixMultiplication!(A,B,C)

row = get_global_id(0)

col = get_global_id(1)

sum = zero(eltype(C))

if row <= size(A, 1) && COl <= size(B, 2)

for i = 1:size(A, 2)

@inbounds sum += A[row, i] * B[i, col]

end

@inbounds C[row, col] = sum

end

return

end

using Metal

function MatrixMultiplication!(A, B,C)

row, col = thread_position_in_grid_2d()

sum = zero(eltype(C))

if row <= size(A, 1) && col <= size(B, 2)

for i = 1:size(A, 2)

@inbounds sum += A[row, i] * B[i, col]

end

@inbounds C[row, col] = sum

end

return

end

13 of 17

KernelAbstractions.jl

using KernelAbstractions

@kernel function MatrixMultiplication!(A, B, C)

row, col = @index(Global, NTuple)

sum = zero(eltype(C))

if row <= size(A, 1) && col <= size(B, 2)

for i = 1:size(A, 2)

@inbounds sum += A[row, i] * B[i, col]

end

@inbounds C[row, col] = sum

end

A = KernelAbstractions.rand(Backend, 128, 128)

B = KernelAbstractions.rand(Backend, 128, 128)

C = KernelAbstractions.rand(Backend, 128, 128)

workgroupsize = (16, 16)

kernel! = MatrixMultiplication!(Backend, workgroupsize)

kernel!(A, B, C, ndrange=(size(C)))

KernelAbstractions.synchronize(Backend)

14 of 17

Kernel language — @kernel

KernelAbstractions defines a language valid within @kernel definitions

@Const: Declares an argument to not be aliased or written to
@index: Which kernel element are we operating upon?
@localmem: Allocate local (shared) memory
@synchronize: Synchronize warps/wavefronts
@uniform: Declares a variable to be unchanging across lanes
@private: Allocate private memory for a lane
@print: Unified printing of device-side data

15 of 17

Kernel language: Indexing

Indexing Method:

@index(Global, Kind): Global index for accessing input arguments
@index(Group, Kind): Which work-group does the lane belong to?
@index(Local, Kind): Which item inside a work-group is this lane?

Indexing Kind:

@index(Locale, Cartesian): Index as a CartesianIndex
@index(Locale, NTuple): Index as a tuple
@index(Locale, Linear): Index as a scalar

16 of 17

Shared memory vs Global memory

17 of 17

17

Julia GPU Notebook:

https://github.com/mitmath/Parallel-Computing-Spoke/blob/main/notebooks/gpu_introduction.ipynb