Hi,
I’m learning cuda with the “NVIDIA CUDA programming guide”.
I implemented the “Matrix Multipliation without Shared Memory” (pg 20-22) but it doesn’t work very well.
Here’s my code :
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <time.h>
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct {
int width;
int height;
float *elements;
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void matrixMulGPU(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc((void**)&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc((void**)&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc((void**)&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x + 1, A.height / dimBlock.y + 1);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
cudaThreadExit();
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Each thread computes one element of C
// by accumulating results into Cvalue
float Cvalue = 0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
for (int e = 0; e < A.width; ++e) {
Cvalue += A.elements[row * A.width + e]
* B.elements[e * B.width + col];
}
__syncthreads();
C.elements[row * C.width + col] = Cvalue;
}
float floatRand(float MaxVal) {
return ( (float)rand() / (float)RAND_MAX ) * MaxVal;
}
void timedate() {
time_t temps_act;
time(&temps_act);
printf("%s\n", ctime(&temps_act));
}
void matrixMulCPU(Matrix M, Matrix N, Matrix P) {
int i, j, k;
for(i=0; i<M.height; i++) {
for(j=0; j<N.width; j++) {
for(k=0; k<M.width; k++) {
P.elements[i*N.width + j] += M.elements[i*M.width + k]
* N.elements[k*N.width + j];
}
}
}
}
int main() {
int size = 10;
int i, j;
int print = true;
Matrix A;
Matrix B;
Matrix C;
Matrix D;
A.width = size;
A.height = size;
A.elements = (float*) malloc(A.width*A.height*sizeof(float));
B.width = size;
B.height = size;
B.elements = (float*) malloc(B.width*B.height*sizeof(float));
C.width = size;
C.height = size;
C.elements = (float*) malloc(C.width*C.height*sizeof(float));
D.width = size;
D.height = size;
D.elements = (float*) malloc(D.width*D.height*sizeof(float));
printf("matrix generation\n");
srand ( time(NULL) );
for(i=0; i<size*size; i++) {
A.elements[i] = 2; //floatRand(2);
B.elements[i] = 2; //floatRand(2);
// A.elements[i] = 3;
// B.elements[i] = 3;
C.elements[i] = 0;
D.elements[i] = 0;
}
if(print) {
printf("A =\n");
for(i=0; i<A.height; i++) {
for(j=0; j<A.width; j++) {
printf("%f ",A.elements[i*size + j]);
}
printf("\n");
}
printf("\n\n");
printf("B =\n");
for(i=0; i<B.height; i++) {
for(j=0; j<B.width; j++) {
printf("%f ",B.elements[i*size + j]);
}
printf("\n");
}
}
timedate();
printf("GPU matrix multiplication\n");
matrixMulGPU(A, B, C);
timedate();
timedate();
printf("CPU matrix multiplication\n");
matrixMulCPU(A, B, D);
timedate();
//print result
if(print) {
printf("C =\n");
for(i=0; i<C.height; i++) {
for(j=0; j<C.width; j++) {
printf("%f ",C.elements[i*size + j]);
}
printf("\n");
}
printf("\n\n");
printf("D =\n");
for(i=0; i<D.height; i++) {
for(j=0; j<D.width; j++) {
printf("%f ",D.elements[i*size + j]);
}
printf("\n");
}
}
}
and here’s the result I have with my Geforce 9650M GT :
matrix generation
A =
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
B =
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000 2.000000
Mon Nov 23 20:02:55 2009
GPU matrix multiplication
Mon Nov 23 20:02:55 2009
Mon Nov 23 20:02:55 2009
CPU matrix multiplication
Mon Nov 23 20:02:55 2009
C =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
nan -20769186196199271228741710417756160.000000 32.000000 nan nan nan 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
D =
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000
Some strange values or NaN appear on the gpu. I should missed something… help! :huh:
Thanks in advance External Image
ps: I’m running ubuntu 9.10 with forceware 190.42 (from https://launchpad.net/~nvidia-vdpau/+archive/ppa)