CUDA Matrix Multiplication: One thread computes multiple elements

Hi experts,

I am new to CUDA programming. I am trying to make a program that one thread can compute a multiple elements of product matrix. The program uses multiple thread blocks, and each thread will be assigned
to a tile of tile_width x tile_width entries. I have compiled but it gets wrong result. I get no problem with multiple thread blocks, one thread compute one element of matrix product.

Here is my kernel:

__global__ void gpu_matrixMul(int *a, int *b, int *c, int Width, int tile_width){

int start_row = blockDim.y*blockIdx.y + threadIdx.y*tile_width;
int end_row = start_row + tile_width;
int start_col = blockDim.x*blockIdx.x + threadIdx.x*tile_width;
int end_col = start_col + tile_width;

for (int row = start_row; row < end_row; row++) {
     for(int col = start_col; col < end_col; col++) {
         float sum = 0;
         for (int k = 0; k < Width; k++) {
              sum += a[row * Width + k]*b[k * Width + col];
         }
         d_p[row*Width+col] = P_val;
         }
     }
}

Please help to advise if possible.

It would help if you show a complete code.

If each thread in your 2D thread array is responsible for a tile_width*tile_width portion of the matrix, then I don’t think these calculations are correct:

int start_row = blockDim.y*blockIdx.y + threadIdx.y*tile_width;
...
int start_col = blockDim.x*blockIdx.x + threadIdx.x*tile_width;

I think they should be like this:

int start_row = (blockDim.y*blockIdx.y + threadIdx.y)*tile_width;
...
int start_col = (blockDim.x*blockIdx.x + threadIdx.x)*tile_width;

Also, this line in your kernel doesn’t make sense:

d_p[row*Width+col] = P_val;

d_p and P_val aren’t defined anywhere in your kernel. So I don’t see how you could actually be running this code. Probably you meant something like this:

c[row*Width+col] = sum;

Nevertheless I’m confused by this statement:

“I have compiled but it gets wrong result.”

Since I don’t see how you could have compiled this code you have shown.

Anyway, with the above changes, I was able to compute a sensible result in a piece of test code.

If you’re still having trouble, make sure you are using proper cuda error checking (hint: google “proper cuda error checking”) and if you still want help please post a complete code, that someone else could copy, paste and compile, without having to add anything or change anything.

Hi,

Thank you for your attention. Here is my complete code. Please take a look.
In this program, I want to calculate multiple elements of a matrix by using
one thread. Please help to advise. Thanks a lot!

/*
-----1 Thread compute multiple elements of matrix product-----
*/

#include <stdio.h>
#include <conio.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#pragma comment(lib, "cudart")

//Function executed on host
void cpu_matrixMul(int *a, int *b, int *c, int N){

	int row, col, k, sum;
	
	for (row=0; row<N; row++)
		for (col=0; col<N; col++){
			sum = 0;
			for (k=0; k<N; k++)
			sum += a[row*N+k]*b[k*N+col];	
			c[row*N+col]=sum;
		}
}

//GPU kernel, each thread computes an area of data size TWxTW 
//using one thread block

__global__ void gpu_matrixMul1(int *a, int *b, int *c, int N, int TW){
	
	int start_row = threadIdx.y*TW;
	int end_row = start_row+TW;
	int start_col = threadIdx.x;
	int end_col = start_col+TW;
	int k, sum = 0;
	int row, col;

	if ((row < N) && (col <N)){	
		for (row = start_row; row < end_row; row++){
			for(col = start_col; col < end_col; col++){
				for(k = 0; k< N; k++){
					sum += a[row*N+k]*b[k*N+col];
					c[row*N+col] = sum;
					}
				}
			}
		}
	}

//GPU kernel, each thread computes an area of data size TWxTW 
//using multiple thread blocks

__global__ void gpu_matrixMul2(int *a, int *b, int *c, int N, int TW){
	int start_row = blockDim.y*blockIdx.y+threadIdx.y*TW;
	int end_row = start_row + TW;
	int start_col = blockDim.x*blockIdx.x+threadIdx.x*TW;
	int end_col = start_col + TW;

	int k, sum = 0;
	for (int row = start_row; row < end_row; row++){
		for (int col = start_col; col <end_col; col++){
			for (k = 0; k < N; k++){
				sum += a[row*N+k]*b[k*N+col];
				c[row*N+col] = sum;
			}
		}
	}
}

int main (int argc, char *argv[]){
	
							/Declare variables
	char key;
	
	int Grid_Dim = 1;				//Grid structure
	int Block_Dim = 1;				//Block structure

	int N=10;						//Size of matrix in one side
	int TW=2;						//size of data area computed by one thread
		
	int *a, *b, *c, *d;
	int *dev_a, *dev_b, *dev_c;
	int size;					

									
	

//Input data

do{
	printf("Input N, current N is %d: ", N);
	scanf("%d", &N);
	
	printf("Input TW, current TW %d: ", TW);
	scanf("%d", &N);

	printf("\nInput number of threads in x/y dimension in a block, current number %d: ", Block_Dim);
	scanf("%d", &Block_Dim);

	printf("\nInput number if blocks in x/y dimension in a grid, current number %d: ", Grid_Dim);
	scanf("%d", &Grid_Dim);

	dim3 Grid(Grid_Dim, Grid_Dim);			//grid structure
	dim3 Block(Block_Dim, Block_Dim);		//block structure

	size = N*N*sizeof(int);				//size of matrix
	
	a=(int*)malloc(size);					
	b=(int*)malloc(size);	
	c=(int*)malloc(size);					
	d=(int*)malloc(size);					

//data sample
for(int i= 0; i<N; i++)
	for(int j=0; j<N; j++){
		a[i*N+j]=j;
		b[i*N+j]=j*1;
	}

//Print sample data
printf("\nMatrix A and B:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", a[i*N+j]);				
			printf("\n");
	}
	
//Compute on GPU

cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

gpu_matrixMul2<<<Grid, Block>>>(dev_a, dev_b, dev_c, N, TW);	

cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);			

//Compute on CPU
								

cpu_matrixMul(a, b, d, N);	

//Compare results
	for(int i=0; i<N*N; i++){
	if(c[i] = d[i])
		printf("\nCORRECT!!! CPU and  GPU create same anwser\n");
	else		
		printf("\nERROR!!! CPU and GPU create different anwser\n");
		break;
	}

	printf("\nMatrix result from GPU:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", c[i*N+j]);				
			printf("\n");
	}

	printf("\nMatrix result from CPU:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", d[i*N+j]);				
			printf("\n");
	}
	
	printf("\nType n to start a new computation\n");
	scanf("%c", &key);
	scanf("%c", &key);

	}while (key=='n');				//loop of complete program
	
//Free the memory
free(a);
free(b);
free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

cudaEventDestroy(start);
cudaEventDestroy(stop);

return 0;
getch();

}

So apparently you have not read my answer.

  1. You didn’t apply the parenthesis fix I suggested.
  2. You did not add proper cuda error checking.

The following code is based on your original post and works correctly. You may wish to study it. You cannot input arbitrary grid and block dimensions with the kernel code you have created.

#include <stdio.h>
#define MWIDTH 4096
#define MTILE 16
#define BWIDTH 16

__global__ void gpu_matrixMul(int *a, int *b, int *c, int Width, int tile_width){

  int start_row = (blockDim.y*blockIdx.y + threadIdx.y)*tile_width;
  int end_row = start_row + tile_width;
  int start_col = (blockDim.x*blockIdx.x + threadIdx.x)*tile_width;
  int end_col = start_col + tile_width;

  for (int row = start_row; row < end_row; row++) {
    for(int col = start_col; col < end_col; col++) {
      float sum = 0;
      for (int k = 0; k < Width; k++) {
        sum += a[row * Width + k]*b[k * Width + col];
      }
      c[row*Width+col] = sum;
    }
  }
}



int main(){

  int *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  h_a = (int *)malloc(MWIDTH*MWIDTH*sizeof(int));
  h_b = (int *)malloc(MWIDTH*MWIDTH*sizeof(int));
  h_c = (int *)malloc(MWIDTH*MWIDTH*sizeof(int));
  cudaMalloc(&d_a, MWIDTH*MWIDTH*sizeof(int));
  cudaMalloc(&d_b, MWIDTH*MWIDTH*sizeof(int));
  cudaMalloc(&d_c, MWIDTH*MWIDTH*sizeof(int));

  for (int i = 0; i < MWIDTH*MWIDTH; i++) {
    h_a[i] = 1;
    h_b[i] = 1;
    h_c[i] = 0;}

  cudaMemcpy(d_a, h_a, MWIDTH*MWIDTH*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, MWIDTH*MWIDTH*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_c, 0, MWIDTH*MWIDTH*sizeof(int));

  gpu_matrixMul<<<dim3((MWIDTH/(MTILE*BWIDTH)), (MWIDTH/(MTILE*BWIDTH))), dim3(BWIDTH,BWIDTH)>>>(d_a, d_b, d_c, MWIDTH, MTILE);
  cudaMemcpy(h_c, d_c, MWIDTH*MWIDTH*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i=0; i < MWIDTH*MWIDTH; i++)
    if (h_c[i] != MWIDTH) {printf("Mismatch at offset %d, was: %d, should be: %d\n", i, h_c[i], MWIDTH); return 1;}
  printf("Success!\n");
  return 0;
}

Hi,

Thank you very much for your help!

I have applied the suggested code and see that in this problem, it works correctly in case the size of data area (tile_width) covered by one thread must be equal between threads. So, I have to define (grid, block) structure to fit with the data.

So, how to handle in cases MWIDTH/(MTILE*BWIDTH) in not an integer?
Of course, in all cases we can use one thread to compute all elements like CPU does. For example, I have matrices A and B are 16x16. If I use 4 threads (BWIDTH=2), then each thread computes a matrix 8x8 (tile_width = 8). In other case, still use 4 threads, assign tile_width = 10, then the result goes wrong.

Besides, I also have some related questions, please help to clarify.
2) How the structure of grid and block affect the performance of program? How to choose the best structure of grid and block to have highest effect?
3) I have tried with some data samples (by increase significantly the size of matrix) the compare the time calculate by GPU and CPU. I see that the time computed by GPU is larger than by CPU. And it increases more times when the size of matrix increased. So, can you suggest if I’m missing sth here.
Here is my code to measure time in GPU and CPU.

//measure time computed in GPU
cudaEventCreate(&start);						
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

gpu_matrixMul2<<<Grid, Block>>>(d_a, d_b, d_c, MWidth, tile_width);	
cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);			


cudaEventRecord(stop, 0);				
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms_gpu, start, stop);
//measure time computed in CPU
cudaEventRecord(start, 0);				

cpu_matrixMul(h_a, h_b, h_d, MWidth);			

cudaEventRecord(stop, 0);				
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms_cpu, start, stop);

Thanks and Best Regards!