Hi, everyone! I’m just a CUDA novice.

I’m reading the book “Programming Massively Parallel Processors, A Hands-on Approach”.

I followed the example of the matrix multiplication with multiple thread blocks in Chapter 4, and made minor modifications for exercises. I found that it works for smaller matrices, say 1280x1280. However, it doesn’t work for large matrices, say 2400x2400. Could anyone help me to figure out the problem? Thank you in advance!

output:

xwu@xth3:~> uname -a

Linux xth3 2.6.34.7-0.7-desktop #1 SMP PREEMPT 2010-12-13 11:13:53 +0100 x86_64 x86_64 x86_64 GNU/Linux

xwu@xth3:~> /sbin/lspci | grep VGA

02:00.0 VGA compatible controller: nVidia Corporation G96 [Quadro FX 580] (rev a1)

xwu@xth3:~> nvcc test.cu

xwu@xth3:~> ./a.out 1280

```
1280 serial 20.73 sec
```

cuda 4.54 sec

xwu@xth3:~> ./a.out 2400

```
2400 serial 150.90 sec
```

cuda 9.63 sec

```
0 p 5.99841 gp 0.00000
```

a.out: test.cu:103: int main(int, char**): Assertion `p[i]==gp[i]’ failed.

Aborted

xwu@xth3:~>

```
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#define TILE_WIDTH 16
// serial implementation
void MatMul(float *m, float *n, float *p, int width) {
int i, j, k;
float a, b, sum;
for (i=0; i<width; i++) {
for (j=0; j<width; j++) {
sum=0.0;
for (k=0; k<width; k++) {
a=m[i*width+k];
b=n[k*width+j];
sum+=a*b;
}
p[i*width+j]=sum;
}
}
}
// cuda gpu implementation kernel
__global__ void gMatMulKernel (float *m_d, float *n_d, float *gp_d, int width) {
int k, row, col;
float sum;
row=blockIdx.y*TILE_WIDTH+threadIdx.y;
col=blockIdx.x*TILE_WIDTH+threadIdx.x;
sum=0.0;
for (k=0; k<width; k++)
sum+=m_d[row*width+k]*n_d[k*width+col];
gp_d[row*width+col]=sum;
}
// cuda gpu implementation
void gMatMul(float *m, float *n, float *gp, int width) {
int size;
float *m_d, *n_d, *gp_d;
dim3 dimGrid(width/TILE_WIDTH,width/TILE_WIDTH,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
size=width*width*sizeof(float);
cudaMalloc((void **) &m_d, size);
cudaMemcpy(m_d, m, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &n_d, size);
cudaMemcpy(n_d, n, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &gp_d, size);
gMatMulKernel<<<dimGrid, dimBlock>>>(m_d, n_d, gp_d, width);
cudaMemcpy(gp, gp_d, size, cudaMemcpyDeviceToHost);
cudaFree(m_d);
cudaFree(n_d);
cudaFree(gp_d);
}
int main (int argc, char *argv[]) {
int i, width, size;
float *m, *n, *p, *gp;
clock_t t1, t2;
sscanf(argv[1], "%d", &width);
if (width%TILE_WIDTH != 0) return 0;
size=width*width;
m = (float *) malloc(size*sizeof(float));
n = (float *) malloc(size*sizeof(float));
p = (float *) malloc(size*sizeof(float));
gp = (float *) malloc(size*sizeof(float));
srand (time(NULL));
for (i=0; i<size; i++) {
m[i] = (float) rand() / (float) RAND_MAX;
m[i] = m[i]/10.0;
n[i] = (float) rand() / (float) RAND_MAX;
n[i] = n[i]/10.0;
}
t1=clock();
MatMul(m,n,p,width);
t2=clock();
printf("%10d serial %10.2lf sec\n", width, (t2-t1)/(double) CLOCKS_PER_SEC);
t1=clock();
gMatMul(m,n,gp,width);
t2=clock();
printf("cuda %10.2lf sec\n", (t2-t1)/(double) CLOCKS_PER_SEC);
for (i=0; i<size; i++) {
if (fabs(p[i]-gp[i])>0.001) {
printf("%10d\tp %10.5f\tgp %10.5f\n", i, p[i], gp[i]);
assert(p[i]==gp[i]);
}
}
free(m);
free(n);
free(p);
free(gp);
return 0;
}
```