CUDA Matrix Multiplication: One thread computes multiple elements

ducdq · December 27, 2014, 8:34am

Hi experts,

I am new to CUDA programming. I am trying to write a program in which one thread computes multiple elements of the product matrix. The program uses multiple thread blocks, and each thread is assigned
to a tile of tile_width x tile_width entries. The code compiles, but it produces the wrong result. I have no problem with the version that uses multiple thread blocks where each thread computes one element of the matrix product.

Here is my kernel:

// Kernel in which each thread loops over a tile_width x tile_width range of
// (row, col) positions and, for each position, accumulates the dot product of
// row `row` of a with column `col` of b. a and b are indexed as
// Width x Width row-major arrays.
__global__ void gpu_matrixMul(int *a, int *b, int *c, int Width, int tile_width){

// First row/column of this thread's range. Only threadIdx is multiplied by
// tile_width here; the block offset blockDim*blockIdx is added unscaled.
int start_row = blockDim.y*blockIdx.y + threadIdx.y*tile_width;
int end_row = start_row + tile_width;
int start_col = blockDim.x*blockIdx.x + threadIdx.x*tile_width;
int end_col = start_col + tile_width;

// No comparison against Width is made on row or col before indexing.
for (int row = start_row; row < end_row; row++) {
     for(int col = start_col; col < end_col; col++) {
         // NOTE(review): the accumulator is float while a and b hold int.
         float sum = 0;
         for (int k = 0; k < Width; k++) {
              sum += a[row * Width + k]*b[k * Width + col];
         }
         // NOTE(review): d_p and P_val are not declared in this kernel; the
         // accumulated value lives in `sum` and the output parameter is `c`.
         d_p[row*Width+col] = P_val;
         }
     }
}

Please help to advise if possible.

Robert_Crovella · December 27, 2014, 5:01pm

It would help if you show a complete code.

If each thread in your 2D thread array is responsible for a tile_width*tile_width portion of the matrix, then I don’t think these calculations are correct:

int start_row = blockDim.y*blockIdx.y + threadIdx.y*tile_width;
...
int start_col = blockDim.x*blockIdx.x + threadIdx.x*tile_width;

I think they should be like this:

int start_row = (blockDim.y*blockIdx.y + threadIdx.y)*tile_width;
...
int start_col = (blockDim.x*blockIdx.x + threadIdx.x)*tile_width;

Also, this line in your kernel doesn’t make sense:

d_p[row*Width+col] = P_val;

d_p and P_val aren’t defined anywhere in your kernel. So I don’t see how you could actually be running this code. Probably you meant something like this:

c[row*Width+col] = sum;

Nevertheless I’m confused by this statement:

“I have compiled but it gets wrong result.”

Since I don’t see how you could have compiled this code you have shown.

Anyway, with the above changes, I was able to compute a sensible result in a piece of test code.

If you’re still having trouble, make sure you are using proper cuda error checking (hint: google “proper cuda error checking”) and if you still want help please post a complete code, that someone else could copy, paste and compile, without having to add anything or change anything.

ducdq · December 27, 2014, 5:48pm

Hi,

Thank you for your attention. Here is my complete code. Please take a look.
In this program, I want to calculate multiple elements of a matrix by using
one thread. Please help to advise. Thanks a lot!

/*
-----1 Thread compute multiple elements of matrix product-----
*/

#include <stdio.h>
#include <conio.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#pragma comment(lib, "cudart")

//Function executed on host
void cpu_matrixMul(int *a, int *b, int *c, int N){
	// Sequential host reference: c = a * b for N x N row-major int matrices.
	for (int i = 0; i < N; ++i){
		const int *a_row = a + i*N;	// row i of a
		int *c_row = c + i*N;		// row i of c
		for (int j = 0; j < N; ++j){
			int acc = 0;
			int k = 0;
			// Dot product of row i of a with column j of b.
			while (k < N){
				acc += a_row[k] * b[k*N + j];
				++k;
			}
			c_row[j] = acc;
		}
	}
}

// GPU kernel for a single thread block: the thread at (threadIdx.x, threadIdx.y)
// computes the TW x TW tile of c = a * b whose top-left element is at
// row threadIdx.y*TW, column threadIdx.x*TW.
//
// a, b, c : N x N row-major int matrices in device memory.
// N       : matrix width/height.
// TW      : tile edge length handled by one thread.
//
// Block indices are not used, so the block must span the matrix on its own:
// blockDim.x*TW >= N and blockDim.y*TW >= N. Tiles that extend past the matrix
// edge are clamped, so N does not have to be a multiple of TW.
__global__ void gpu_matrixMul1(int *a, int *b, int *c, int N, int TW){

	// Top-left corner of this thread's tile; both coordinates are scaled by
	// TW so that neighbouring threads own disjoint tiles.
	int start_row = threadIdx.y*TW;
	int start_col = threadIdx.x*TW;

	// Clamp the tile to the matrix so edge threads never index out of bounds.
	int end_row = (start_row+TW < N) ? start_row+TW : N;
	int end_col = (start_col+TW < N) ? start_col+TW : N;

	for (int row = start_row; row < end_row; row++){
		for (int col = start_col; col < end_col; col++){
			// Fresh accumulator for every output element.
			int sum = 0;
			for (int k = 0; k < N; k++){
				sum += a[row*N+k]*b[k*N+col];
			}
			// Single store once the dot product is complete.
			c[row*N+col] = sum;
		}
	}
}

// GPU kernel launched over multiple thread blocks; each thread loops over a
// TW x TW range of (row, col) positions of the N x N row-major matrices.

__global__ void gpu_matrixMul2(int *a, int *b, int *c, int N, int TW){
	// Start of this thread's range. Only threadIdx is multiplied by TW; the
	// block offset blockDim*blockIdx is added unscaled.
	// NOTE(review): for every thread to own a distinct TW x TW tile, the
	// whole global index (blockDim*blockIdx + threadIdx) would need the
	// factor TW.
	int start_row = blockDim.y*blockIdx.y+threadIdx.y*TW;
	int end_row = start_row + TW;
	int start_col = blockDim.x*blockIdx.x+threadIdx.x*TW;
	int end_col = start_col + TW;

	// NOTE(review): sum is initialised once here and is not reset inside the
	// row/col loops, so it keeps accumulating across output elements.
	int k, sum = 0;
	// No comparison against N is made on row or col before indexing.
	for (int row = start_row; row < end_row; row++){
		for (int col = start_col; col <end_col; col++){
			for (k = 0; k < N; k++){
				sum += a[row*N+k]*b[k*N+col];
				// The running value is stored on every k iteration.
				c[row*N+col] = sum;
			}
		}
	}
}

// Interactive driver: reads N, TW and the block/grid dimensions from stdin,
// fills two N x N matrices, runs gpu_matrixMul2 and cpu_matrixMul on them,
// prints both results, and repeats while the user types 'n'.
int main (int argc, char *argv[]){
	
	// NOTE(review): the next line starts with a single '/', so as written it
	// is not a comment.
							/Declare variables
	char key;
	
	int Grid_Dim = 1;				//Grid structure
	int Block_Dim = 1;				//Block structure

	int N=10;						//Size of matrix in one side
	int TW=2;						//size of data area computed by one thread
		
	int *a, *b, *c, *d;				// host matrices: inputs a, b; GPU result c; CPU result d
	int *dev_a, *dev_b, *dev_c;		// device copies of a, b, c
	int size;					

									
	

//Input data

do{
	printf("Input N, current N is %d: ", N);
	scanf("%d", &N);
	
	printf("Input TW, current TW %d: ", TW);
	// NOTE(review): this reads into N, not TW, so N is overwritten and TW
	// keeps its initial value.
	scanf("%d", &N);

	printf("\nInput number of threads in x/y dimension in a block, current number %d: ", Block_Dim);
	scanf("%d", &Block_Dim);

	printf("\nInput number if blocks in x/y dimension in a grid, current number %d: ", Grid_Dim);
	scanf("%d", &Grid_Dim);

	dim3 Grid(Grid_Dim, Grid_Dim);			//grid structure
	dim3 Block(Block_Dim, Block_Dim);		//block structure

	size = N*N*sizeof(int);				//size of matrix
	
	// Host buffers are allocated on every pass of the do-while loop; the
	// matching free() calls appear only once, after the loop.
	a=(int*)malloc(size);					
	b=(int*)malloc(size);	
	c=(int*)malloc(size);					
	d=(int*)malloc(size);					

//data sample
// Every row of a and of b is 0, 1, ..., N-1.
for(int i= 0; i<N; i++)
	for(int j=0; j<N; j++){
		a[i*N+j]=j;
		b[i*N+j]=j*1;
	}

//Print sample data
// Only a is printed.
printf("\nMatrix A and B:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", a[i*N+j]);				
			printf("\n");
	}
	
//Compute on GPU
// None of the CUDA calls below have their return value checked.

cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
// c has not been written on the host at this point.
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

// NOTE(review): nothing here checks that Grid_Dim, Block_Dim and TW together
// match N before the kernel indexes the matrices.
gpu_matrixMul2<<<Grid, Block>>>(dev_a, dev_b, dev_c, N, TW);	

cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);			

//Compute on CPU
								

cpu_matrixMul(a, b, d, N);	

//Compare results
	for(int i=0; i<N*N; i++){
	// NOTE(review): '=' assigns d[i] to c[i] and then tests the assigned
	// value; it does not compare the two results.
	if(c[i] = d[i])
		printf("\nCORRECT!!! CPU and  GPU create same anwser\n");
	else		
		printf("\nERROR!!! CPU and GPU create different anwser\n");
		// NOTE(review): without braces this break is not part of the else
		// branch, so the loop always stops after the first element.
		break;
	}

	printf("\nMatrix result from GPU:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", c[i*N+j]);				
			printf("\n");
	}

	printf("\nMatrix result from CPU:\n");
	for (int i=0; i<N; i++){
		for(int j=0; j<N; j++)
			printf("%d ", d[i*N+j]);				
			printf("\n");
	}
	
	printf("\nType n to start a new computation\n");
	// Two single-character reads; presumably the first consumes the newline
	// left in stdin by the earlier numeric input -- TODO confirm.
	scanf("%c", &key);
	scanf("%c", &key);

	}while (key=='n');				//loop of complete program
	
//Free the memory
// d is allocated above but is not freed here.
free(a);
free(b);
free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

// NOTE(review): start and stop are not declared anywhere in this function.
cudaEventDestroy(start);
cudaEventDestroy(stop);

return 0;
// Unreachable: this call comes after the return statement.
getch();

}

Robert_Crovella · December 27, 2014, 7:40pm

So apparently you have not read my answer.

You didn’t apply the parenthesis fix I suggested.
You did not add proper cuda error checking.

The following code is based on your original post and works correctly. You may wish to study it. You cannot input arbitrary grid and block dimensions with the kernel code you have created.

#include <stdio.h>
#define MWIDTH 4096
#define MTILE 16
#define BWIDTH 16

// Each thread computes one tile_width x tile_width tile of c = a * b.
//
// a, b, c    : Width x Width row-major int matrices in device memory.
// Width      : matrix width/height.
// tile_width : edge length of the tile owned by one thread.
//
// Launch layout: 2D grid of 2D blocks. The thread with global index
//   gx = blockDim.x*blockIdx.x + threadIdx.x
//   gy = blockDim.y*blockIdx.y + threadIdx.y
// owns rows [gy*tile_width, (gy+1)*tile_width) and columns
// [gx*tile_width, (gx+1)*tile_width).
//
// Precondition: the kernel performs no bounds check, so the launch must tile
// the matrix exactly: gridDim.x*blockDim.x*tile_width == Width, and the same
// in y.
__global__ void gpu_matrixMul(int *a, int *b, int *c, int Width, int tile_width){

  // Scale the whole global thread index by tile_width so that tiles owned by
  // threads in different blocks do not overlap.
  int start_row = (blockDim.y*blockIdx.y + threadIdx.y)*tile_width;
  int end_row = start_row + tile_width;
  int start_col = (blockDim.x*blockIdx.x + threadIdx.x)*tile_width;
  int end_col = start_col + tile_width;

  for (int row = start_row; row < end_row; row++) {
    for(int col = start_col; col < end_col; col++) {
      // Integer accumulator: inputs and output are int, and a float
      // accumulator cannot represent every int exactly above 2^24.
      int sum = 0;
      for (int k = 0; k < Width; k++) {
        sum += a[row * Width + k]*b[k * Width + col];
      }
      c[row*Width+col] = sum;
    }
  }
}



// The launch in main() uses MWIDTH/(MTILE*BWIDTH) blocks per grid dimension and
// the kernel has no bounds check, so the matrix width must be an exact
// multiple of MTILE*BWIDTH.
#if (MWIDTH % (MTILE*BWIDTH)) != 0
#error "MWIDTH must be a multiple of MTILE*BWIDTH"
#endif

// Evaluates a CUDA runtime call made inside main(); on failure prints the
// error string with its source location and returns 1 from main().
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Multiplies two MWIDTH x MWIDTH all-ones matrices on the GPU and verifies
// that every element of the product equals MWIDTH.
// Returns 0 on success, 1 on any allocation/CUDA error or result mismatch.
int main(){

  const size_t bytes = (size_t)MWIDTH*MWIDTH*sizeof(int);
  int *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;

  h_a = (int *)malloc(bytes);
  h_b = (int *)malloc(bytes);
  h_c = (int *)malloc(bytes);
  if (h_a == NULL || h_b == NULL || h_c == NULL) {
    fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
    free(h_a); free(h_b); free(h_c);
    return 1;
  }

  CUDA_CHECK(cudaMalloc(&d_a, bytes));
  CUDA_CHECK(cudaMalloc(&d_b, bytes));
  CUDA_CHECK(cudaMalloc(&d_c, bytes));

  for (int i = 0; i < MWIDTH*MWIDTH; i++) {
    h_a[i] = 1;
    h_b[i] = 1;
    h_c[i] = 0;}

  CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_c, 0, bytes));

  // One thread per MTILE x MTILE tile, BWIDTH x BWIDTH threads per block.
  dim3 grid(MWIDTH/(MTILE*BWIDTH), MWIDTH/(MTILE*BWIDTH));
  dim3 block(BWIDTH, BWIDTH);
  gpu_matrixMul<<<grid, block>>>(d_a, d_b, d_c, MWIDTH, MTILE);
  // A kernel launch returns no status itself; launch-configuration errors are
  // reported by cudaGetLastError().
  CUDA_CHECK(cudaGetLastError());
  // The blocking copy waits for the kernel, so execution errors surface here.
  CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

  int status = 0;
  for (int i=0; i < MWIDTH*MWIDTH; i++)
    if (h_c[i] != MWIDTH) {
      printf("Mismatch at offset %d, was: %d, should be: %d\n", i, h_c[i], MWIDTH);
      status = 1;
      break;
    }
  if (status == 0) printf("Success!\n");

  CUDA_CHECK(cudaFree(d_a));
  CUDA_CHECK(cudaFree(d_b));
  CUDA_CHECK(cudaFree(d_c));
  free(h_a);
  free(h_b);
  free(h_c);
  return status;
}

ducdq · December 28, 2014, 7:42pm

Hi,

Thank you very much for your help!

I have applied the suggested code and see that, for this problem, it works correctly only when the data area covered by one thread (tile_width) is the same size for every thread. So I have to define the (grid, block) structure to fit the data.

1) So, how should I handle cases where MWIDTH/(MTILE*BWIDTH) is not an integer?
Of course, in all cases we could use one thread to compute all elements, as the CPU does. For example, suppose matrices A and B are 16x16. If I use 4 threads (BWIDTH=2), then each thread computes an 8x8 tile (tile_width = 8). If I still use 4 threads but set tile_width = 10, the result is wrong.

Besides, I also have some related questions, please help to clarify.
2) How does the structure of the grid and blocks affect the performance of the program? How do I choose the grid and block structure that gives the best performance?
3) I have tried some data samples (significantly increasing the size of the matrix) to compare the computation time on the GPU and the CPU. I see that the time taken by the GPU is larger than the time taken by the CPU, and it grows even more as the size of the matrix increases. Can you suggest whether I am missing something here?
Here is my code to measure time in GPU and CPU.

//measure time computed in GPU
// The start/stop events bracket both the kernel launch and the device-to-host
// copy, so elapsed_time_ms_gpu covers the two together.
cudaEventCreate(&start);						
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

gpu_matrixMul2<<<Grid, Block>>>(d_a, d_b, d_c, MWidth, tile_width);	
cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);			


cudaEventRecord(stop, 0);				
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms_gpu, start, stop);

//measure time computed in CPU
// The same pair of CUDA events is recorded around the host function call.
cudaEventRecord(start, 0);				

cpu_matrixMul(h_a, h_b, h_d, MWidth);			

cudaEventRecord(stop, 0);				
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms_cpu, start, stop);

Thanks and Best Regards!

Topic		Replies	Views
Timing comparison(ms) in calculation of the sum of matrix rows CUDA Programming and Performance cuda , kernel	1	475	October 26, 2022
Matrix Multiplication Garbage value :( CUDA Programming and Performance	10	3408	July 25, 2009
32 x 32 Matrix Multiplication CUDA Programming and Performance	2	2871	March 5, 2010
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13334	February 17, 2010
Matrix Multiplication In CUDA CUDA Programming and Performance	6	2540	May 11, 2015
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	10006	January 18, 2012
Efficient use of shared memory CUDA Programming and Performance	29	4509	December 2, 2019
Matrix multiplication CUDA Programming and Performance	7	2155	July 2, 2010
matrix multiply reduction CUDA Programming and Performance	41	35554	January 15, 2011
CUDA Matrix Multiplication Issues threads and blocks problem CUDA Programming and Performance	2	3631	March 1, 2009

CUDA Matrix Multiplication: One thread computes multiple elements

Related topics