I have to solve the following problem:
Implement a kernel that computes a matrix product as a sum of vector outer products. In this version, each block of K threads computes a square K x K tile of the result matrix by applying the matrix outer-product formula. The kernel uses shared memory to store the corresponding column vectors from matrix A and matrix B, and the corresponding tile of the result matrix.
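As I understand it (my own restatement, not the assignment's wording), one K x K tile of C is accumulated as a sum of outer products of a column of the current A tile with a row of the current B tile:

C_{\text{tile}} \mathrel{+}= \sum_{r=0}^{K-1} a_r \, b_r^{T},
\qquad
c_{ij} \mathrel{+}= \sum_{r=0}^{K-1} a_{ir} \, b_{rj},

where a_r is the r-th column of the A tile and b_r^T is the r-th row of the B tile.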
I wrote the following kernel:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <iostream>
#define real float
#define IDX(i, j, n) ((i) * (n) + (j))
#define K 32
const int SIZE_1 = 32;
const int SIZE_2 = 1;
const int N = 3;  // size of the matrices

// With a K x K thread block this kernel produces the correct result.
__global__ void MultiplyGPUv2(real *A, real *B, real *C, int n) {
    // Shared-memory tiles holding one K x K slice of A and one of B.
    __shared__ real sA[K][K];
    __shared__ real sB[K][K];

    int row = blockIdx.y * K + threadIdx.y;
    int col = blockIdx.x * K + threadIdx.x;
    real sum = 0.0;

    // Walk over the K-wide tiles along the shared dimension.
    for (int r = 0; r < (n + K - 1) / K; r++) {
        // Each thread loads one element of the A tile (zero-padded outside the matrix)...
        if (row < n && (r * K + threadIdx.x) < n) {
            sA[threadIdx.y][threadIdx.x] = A[IDX(row, r * K + threadIdx.x, n)];
        } else {
            sA[threadIdx.y][threadIdx.x] = 0.0;
        }
        // ...and one element of the B tile.
        if ((r * K + threadIdx.y) < n && col < n) {
            sB[threadIdx.y][threadIdx.x] = B[IDX(r * K + threadIdx.y, col, n)];
        } else {
            sB[threadIdx.y][threadIdx.x] = 0.0;
        }
        __syncthreads();

        // Accumulate this tile's contribution to C[row][col].
        for (int i = 0; i < K; i++) {
            sum += sA[threadIdx.y][i] * sB[i][threadIdx.x];
        }
        __syncthreads();
    }

    if (row < n && col < n) {
        C[IDX(row, col, n)] = sum;
    }
}
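main() below calls printMatrix, which I did not show; the definition I use is just a row-major dump of a small n x n matrix:

// Helper used by main() to print a small n x n matrix (my own definition).
void printMatrix(const real *M, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            printf("%8.1f ", M[IDX(i, j, n)]);
        }
        printf("\n");
    }
}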
int main()
{
    real *A, *B, *C;        // Host matrices
    real *d_A, *d_B, *d_C;  // Device matrices

    // Allocate memory for host matrices A, B, and C
    A = (real *)malloc(N * N * sizeof(real));
    B = (real *)malloc(N * N * sizeof(real));
    C = (real *)malloc(N * N * sizeof(real));

    // Initialize host matrices A, B, and C
    for (int i = 0; i < N * N; i++) {
        A[i] = i + 1;
        B[i] = i + 1;
        C[i] = 0;
    }

    // Allocate memory for device matrices A, B, and C
    cudaMalloc((void **)&d_A, N * N * sizeof(real));
    cudaMalloc((void **)&d_B, N * N * sizeof(real));
    cudaMalloc((void **)&d_C, N * N * sizeof(real));

    // Copy data from host to device
    cudaMemcpy(d_A, A, N * N * sizeof(real), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * N * sizeof(real), cudaMemcpyHostToDevice);

    // Set the grid and block dimensions
    dim3 block_size(K, K, 1);
    dim3 grid_size((N + K - 1) / K, (N + K - 1) / K);
    //dim3 block_size(SIZE_1, SIZE_2, 1);
    //dim3 grid_size((N + SIZE_1 - 1) / SIZE_1, (N + SIZE_2 - 1) / SIZE_2);

    // Perform matrix multiplication on device
    MultiplyGPUv2<<<grid_size, block_size>>>(d_A, d_B, d_C, N);

    // Copy results from device to host
    cudaMemcpy(C, d_C, N * N * sizeof(real), cudaMemcpyDeviceToHost);
    if (N == 3) printMatrix(C, N);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C);
    return 0;
}
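For small N I also sanity-check the device result on the host with a plain triple loop (my own helper; it is not wired into the listing above):

// Reference multiplication on the host, used only to check small cases.
void MultiplyCPU(const real *A, const real *B, real *C, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            real sum = 0.0;
            for (int k = 0; k < n; k++) {
                sum += A[IDX(i, k, n)] * B[IDX(k, j, n)];
            }
            C[IDX(i, j, n)] = sum;
        }
    }
}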
Here is my confusion.
I am told to vary SIZE_1 x SIZE_2 = 32x1, 64x1, 128x1, … and record the execution time of the kernel; the problem also mentions another value K. (To record the time I plan to wrap the kernel launch in cudaEvent calls, as sketched below.) If I launch with block_size = 32x1 and K = 32, the listing above does not produce the correct result for the multiplication.
Is there a formula relating K to SIZE_1 x SIZE_2? And how do I accommodate N in that relationship?
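For completeness, this is the timing I intend to use (standard cudaEvent API around the launch; the surrounding setup is only a sketch, not part of the listing above):

// Sketch of cudaEvent-based timing around the kernel launch.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
MultiplyGPUv2<<<grid_size, block_size>>>(d_A, d_B, d_C, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("kernel time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);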