Matrix multiplication from CUDA programming guide

Hi,

I’m learning cuda with the “NVIDIA CUDA programming guide”.

I implemented the “Matrix Multipliation without Shared Memory” (pg 20-22) but it doesn’t work very well.

Here’s my code :

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <time.h> 

// Matrices are stored in row-major order:

// M(row, col) = *(M.elements + row * M.width + col)

typedef struct {

	int width;

	int height;

	float *elements;

} Matrix;

// Thread block size

#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code

// Matrix dimensions are assumed to be multiples of BLOCK_SIZE

void matrixMulGPU(const Matrix A, const Matrix B, Matrix C)

{

	// Load A and B to device memory

	Matrix d_A;

	d_A.width = A.width; d_A.height = A.height;

	size_t size = A.width * A.height * sizeof(float);

	cudaMalloc((void**)&d_A.elements, size);

	cudaMemcpy(d_A.elements, A.elements, size,

			   cudaMemcpyHostToDevice);

	

	Matrix d_B;

	d_B.width = B.width; d_B.height = B.height;

	size = B.width * B.height * sizeof(float);

	cudaMalloc((void**)&d_B.elements, size);

	cudaMemcpy(d_B.elements, B.elements, size,

			   cudaMemcpyHostToDevice);

	

	// Allocate C in device memory

	Matrix d_C;

	d_C.width = C.width; d_C.height = C.height;

	size = C.width * C.height * sizeof(float);

	cudaMalloc((void**)&d_C.elements, size);

	

	// Invoke kernel

	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

	dim3 dimGrid(B.width / dimBlock.x + 1, A.height / dimBlock.y + 1);

	MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

	

	// Read C from device memory

	cudaMemcpy(C.elements, d_C.elements, size,

			   cudaMemcpyDeviceToHost);

	

	// Free device memory

	cudaFree(d_A.elements);

	cudaFree(d_B.elements);

	cudaFree(d_C.elements);

	

	cudaThreadExit();

}

// Matrix multiplication kernel called by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

	// Each thread computes one element of C

	// by accumulating results into Cvalue

	float Cvalue = 0;

	int row = blockIdx.y * blockDim.y + threadIdx.y;

	int col = blockIdx.x * blockDim.x + threadIdx.x;

	for (int e = 0; e < A.width; ++e) {

			Cvalue += A.elements[row * A.width + e] 

					* B.elements[e * B.width + col];

	}

	__syncthreads();

	C.elements[row * C.width + col] = Cvalue;

}

float floatRand(float MaxVal) {

	return ( (float)rand() / (float)RAND_MAX ) * MaxVal;

}

void timedate() {

	time_t temps_act; 

	time(&temps_act); 

	printf("%s\n", ctime(&temps_act)); 

}

void matrixMulCPU(Matrix M, Matrix N, Matrix P) {

	

	int i, j, k;

	for(i=0; i<M.height; i++) {

		for(j=0; j<N.width; j++) {

			for(k=0; k<M.width; k++) {

				P.elements[i*N.width + j] += M.elements[i*M.width + k] 

										   * N.elements[k*N.width + j];

			}

		}

	}

}

int main() {

	

	int size = 10;

	int i, j;

	int print = true;

	Matrix A;

	Matrix B;

	Matrix C;

	Matrix D;

	

	

	A.width = size;

	A.height = size;

	A.elements = (float*) malloc(A.width*A.height*sizeof(float));

	

	B.width = size;

	B.height = size;

	B.elements = (float*) malloc(B.width*B.height*sizeof(float));

	

	C.width = size;

	C.height = size;

	C.elements = (float*) malloc(C.width*C.height*sizeof(float));

	

	D.width = size;

	D.height = size;

	D.elements = (float*) malloc(D.width*D.height*sizeof(float));

	

	

	printf("matrix generation\n");

	

	srand ( time(NULL) );

	for(i=0; i<size*size; i++) {

		A.elements[i] = 2; //floatRand(2);

		B.elements[i] = 2; //floatRand(2);

//		A.elements[i] = 3;

//		B.elements[i] = 3;

		C.elements[i] = 0;

		D.elements[i] = 0;

	}

	

	if(print) {

		printf("A =\n");

		for(i=0; i<A.height; i++) {

			for(j=0; j<A.width; j++) {

				printf("%f ",A.elements[i*size + j]);

			}

		   printf("\n");

		}

		printf("\n\n");

		printf("B =\n");

		for(i=0; i<B.height; i++) {

			for(j=0; j<B.width; j++) {

				printf("%f ",B.elements[i*size + j]);

			}

		   printf("\n");

		}

		

	}

	

	

	timedate();

	printf("GPU matrix multiplication\n");

	matrixMulGPU(A, B, C);

	timedate();

	

	

	timedate();

	printf("CPU matrix multiplication\n");

	matrixMulCPU(A, B, D);

	timedate();

	

	

	//print result

	if(print) {

		printf("C =\n");

		for(i=0; i<C.height; i++) {

			for(j=0; j<C.width; j++) {

				printf("%f ",C.elements[i*size + j]);

			}

		   printf("\n");

		}

		printf("\n\n");

		printf("D =\n");

		for(i=0; i<D.height; i++) {

			for(j=0; j<D.width; j++) {

				printf("%f ",D.elements[i*size + j]);

			}

		   printf("\n");

		}

		

	}

	

}

and here’s the result I have with my Geforce 9650M GT :

matrix generation

A =

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

B =

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 

Mon Nov 23 20:02:55 2009

GPU matrix multiplication

Mon Nov 23 20:02:55 2009

Mon Nov 23 20:02:55 2009

CPU matrix multiplication

Mon Nov 23 20:02:55 2009

C =

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

D =

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 

40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000

Some strange values or NaN appear on the gpu. I should missed something… help! :huh:

Thanks in advance :thanks:

ps: I’m running ubuntu 9.10 with forceware 190.42 (from https://launchpad.net/~nvidia-vdpau/+archive/ppa)