dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);
dim3 numBlocks(N / threadsPerBlock.x, P / threadsPerBlock.y);
simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]
…
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<N) && (col<M)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + NN*i] * B[i + MM*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}
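// Storage-order note: nothing in CUDA itself fixes a layout; it depends on how the host
// fills d_A and d_B. A plain C array declared A[N][M] is row-major, A[i][j] == A[i*M + j];
// the A[i + N*j] formula used above is the column-major (Fortran/cuBLAS) convention.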
dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);
dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (P + TILE_WIDTH - 1) / TILE_WIDTH);
simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]
…
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + NN*i] * B[i + MM*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + MM*i] * B[i + PP*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}[/codebox]
and the result is
27 36
0 0
0 49
66 0
0 0
but the right result is
22 28
49 64
75 98
103 136
130 172
I am sure that all the other functions are fine… so the problem is in the GPU kernel.
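For reference, a plain CPU triple loop for C = A*B with row-major storage (an assumption on my part; X[i][j] stored as X[i*cols + j], and the cpuMultiply name is just illustrative) would be:
[codebox]// CPU reference, row-major: X[i][j] == X[i*cols + j]
void cpuMultiply(const float* A, const float* B, float* C, int N, int M, int P) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < P; col++) {
            float sum = 0.0f;
            for (int i = 0; i < M; i++) {
                sum += A[row*M + i] * B[i*P + col];  // A[row][i] * B[i][col]
            }
            C[row*P + col] = sum;                    // C[row][col]
        }
    }
}[/codebox]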
I found one error in the matrix indices :(, but it still doesn't work… the code is the same but with
[codebox]__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
float sum = 0.0f;
int i;
for (i = 0; i < MM; i++) {
sum += A[row*MM + i] * B[i*PP + col]; // sum += A[row][i] * B[i][col]
}
C[row*PP + col] = sum;
}