Matrix multiplication from CUDA programming guide

jafar · November 23, 2009, 7:12pm

Hi,

I’m learning cuda with the “NVIDIA CUDA programming guide”.

I implemented the “Matrix Multipliation without Shared Memory” (pg 20-22) but it doesn’t work very well.

Here’s my code :

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <time.h> 

// Matrices are stored in row-major order:

// M(row, col) = *(M.elements + row * M.width + col)

typedef struct {

	int width;

	int height;

	float *elements;

} Matrix;

// Thread block size

#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code

// Matrix dimensions are assumed to be multiples of BLOCK_SIZE

void matrixMulGPU(const Matrix A, const Matrix B, Matrix C)

{

	// Load A and B to device memory

	Matrix d_A;

	d_A.width = A.width; d_A.height = A.height;

	size_t size = A.width * A.height * sizeof(float);

	cudaMalloc((void**)&d_A.elements, size);

	cudaMemcpy(d_A.elements, A.elements, size,

			   cudaMemcpyHostToDevice);

	

	Matrix d_B;

	d_B.width = B.width; d_B.height = B.height;

	size = B.width * B.height * sizeof(float);

	cudaMalloc((void**)&d_B.elements, size);

	cudaMemcpy(d_B.elements, B.elements, size,

			   cudaMemcpyHostToDevice);

	

	// Allocate C in device memory

	Matrix d_C;

	d_C.width = C.width; d_C.height = C.height;

	size = C.width * C.height * sizeof(float);

	cudaMalloc((void**)&d_C.elements, size);

	

	// Invoke kernel

	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

	dim3 dimGrid(B.width / dimBlock.x + 1, A.height / dimBlock.y + 1);

	MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

	

	// Read C from device memory

	cudaMemcpy(C.elements, d_C.elements, size,

			   cudaMemcpyDeviceToHost);

	

	// Free device memory

	cudaFree(d_A.elements);

	cudaFree(d_B.elements);

	cudaFree(d_C.elements);

	

	cudaThreadExit();

}

// Matrix multiplication kernel called by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

	// Each thread computes one element of C

	// by accumulating results into Cvalue

	float Cvalue = 0;

	int row = blockIdx.y * blockDim.y + threadIdx.y;

	int col = blockIdx.x * blockDim.x + threadIdx.x;

	for (int e = 0; e < A.width; ++e) {

			Cvalue += A.elements[row * A.width + e] 

					* B.elements[e * B.width + col];

	}

	__syncthreads();

	C.elements[row * C.width + col] = Cvalue;

}

float floatRand(float MaxVal) {

	return ( (float)rand() / (float)RAND_MAX ) * MaxVal;

}

void timedate() {

	time_t temps_act; 

	time(&temps_act); 

	printf("%s\n", ctime(&temps_act)); 

}

void matrixMulCPU(Matrix M, Matrix N, Matrix P) {

	

	int i, j, k;

	for(i=0; i<M.height; i++) {

		for(j=0; j<N.width; j++) {

			for(k=0; k<M.width; k++) {

				P.elements[i*N.width + j] += M.elements[i*M.width + k] 

										   * N.elements[k*N.width + j];

			}

		}

	}

}

int main() {

	

	int size = 10;

	int i, j;

	int print = true;

	Matrix A;

	Matrix B;

	Matrix C;

	Matrix D;

	

	

	A.width = size;

	A.height = size;

	A.elements = (float*) malloc(A.width*A.height*sizeof(float));

	

	B.width = size;

	B.height = size;

	B.elements = (float*) malloc(B.width*B.height*sizeof(float));

	

	C.width = size;

	C.height = size;

	C.elements = (float*) malloc(C.width*C.height*sizeof(float));

	

	D.width = size;

	D.height = size;

	D.elements = (float*) malloc(D.width*D.height*sizeof(float));

	

	

	printf("matrix generation\n");

	

	srand ( time(NULL) );

	for(i=0; i<size*size; i++) {

		A.elements[i] = 2; //floatRand(2);

		B.elements[i] = 2; //floatRand(2);

//		A.elements[i] = 3;

//		B.elements[i] = 3;

		C.elements[i] = 0;

		D.elements[i] = 0;

	}

	

	if(print) {

		printf("A =\n");

		for(i=0; i<A.height; i++) {

			for(j=0; j<A.width; j++) {

				printf("%f ",A.elements[i*size + j]);

			}

		   printf("\n");

		}

		printf("\n\n");

		printf("B =\n");

		for(i=0; i<B.height; i++) {

			for(j=0; j<B.width; j++) {

				printf("%f ",B.elements[i*size + j]);

			}

		   printf("\n");

		}

		

	}

	

	

	timedate();

	printf("GPU matrix multiplication\n");

	matrixMulGPU(A, B, C);

	timedate();

	

	

	timedate();

	printf("CPU matrix multiplication\n");

	matrixMulCPU(A, B, D);

	timedate();

	

	

	//print result

	if(print) {

		printf("C =\n");

		for(i=0; i<C.height; i++) {

			for(j=0; j<C.width; j++) {

				printf("%f ",C.elements[i*size + j]);

			}

		   printf("\n");

		}

		printf("\n\n");

		printf("D =\n");

		for(i=0; i<D.height; i++) {

			for(j=0; j<D.width; j++) {

				printf("%f ",D.elements[i*size + j]);

			}

		   printf("\n");

		}

		

	}

	

}

and here’s the result I have with my Geforce 9650M GT :

matrix generation

A =

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

B =

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

Mon Nov 23 20:02:55 2009

GPU matrix multiplication

Mon Nov 23 20:02:55 2009

Mon Nov 23 20:02:55 2009

CPU matrix multiplication

Mon Nov 23 20:02:55 2009

C =

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

D =

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000

Some strange values or NaN appear on the gpu. I should missed something… help! :huh:

Thanks in advance External Image

ps: I’m running ubuntu 9.10 with forceware 190.42 (from https://launchpad.net/~nvidia-vdpau/+archive/ppa)

Topic		Replies	Views
Matrix Multiplication Inconsistency Different values output in every run of the matixMul program CUDA Programming and Performance	29	8750	December 16, 2009
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13323	February 17, 2010
Why different shape matrix multiplication have different performance? CUDA Programming and Performance	2	743	August 26, 2018
32 x 32 Matrix Multiplication CUDA Programming and Performance	2	2853	March 5, 2010
matrix multiplication--wrong answer CUDA Programming and Performance	6	3788	August 20, 2009
Matrix Multiplication In CUDA CUDA Programming and Performance	6	2525	May 11, 2015
CUDA Matrix Multiplication Issues threads and blocks problem CUDA Programming and Performance	2	3629	March 1, 2009
Matrix multiplication CUDA Programming and Performance	7	2150	July 2, 2010
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9993	January 18, 2012
nVidia CUDA Programming Guide and shared memory CUDA Programming and Performance	0	1462	January 12, 2010

Matrix multiplication from CUDA programming guide

Related topics