Hi–
I’m trying to learn CUDA and my simple ‘hello world’ / ‘hello cuda’ program isn’t working.
I’m just trying to multiply two square matrices together, with the work spread across several thread blocks.
According to the output, the result matrix that comes back is all zeros (it should have non-zero values everywhere).
C:\CUDA\Projects\MatrixMultiply\x64\Release>MatrixMultiply.exe
CUDA initialized.
M =
1.000000 1.000000 1.000000 1.000000
2.000000 2.000000 2.000000 2.000000
3.000000 3.000000 3.000000 3.000000
4.000000 4.000000 4.000000 4.000000
N =
2.000000 2.000000 2.000000 2.000000
3.000000 3.000000 3.000000 3.000000
4.000000 4.000000 4.000000 4.000000
5.000000 5.000000 5.000000 5.000000
hostresult =
0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
Press ENTER to exit…
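(As a sanity check: the top-left entry of hostresult should be 1*2 + 1*3 + 1*4 + 1*5 = 14, so the zeros are definitely wrong.)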
My kernel:
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int width, int tile_width)
{
    // Each thread computes one element of Pd.
    int row = blockIdx.y * tile_width + threadIdx.y;
    int col = blockIdx.x * tile_width + threadIdx.x;
    float Pvalue = 0;
    int k;
    for (k = 0; k < width; ++k)
        Pvalue += Md[row * width + k] * Nd[k * width + col];
    Pd[row * width + col] = Pvalue;
}
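For reference, this is the plain CPU loop I expect the kernel to reproduce (just a sanity-check sketch I wrote for this post, not code from the attachment):

void MatrixMulOnHost(const float* M, const float* N, float* P, int width)
{
    // Straightforward triple loop: P = M * N for square width x width matrices.
    for (int row = 0; row < width; ++row)
        for (int col = 0; col < width; ++col)
        {
            float sum = 0;
            for (int k = 0; k < width; ++k)
                sum += M[row * width + k] * N[k * width + col];
            P[row * width + col] = sum;
        }
}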
And this function launches the kernel:
void MatrixMulOnDevice(float* M, float* N, float* P, int width, int tile_width)
{
    int size = width * width * sizeof(float);
    float* Md; float* Nd; float* Pd;

    //allocate Md, Nd on device & copy host-generated values
    cudaMalloc((void**)&Md, size);
    cudaMemcpy((void**)&Md, M, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&Nd, size);
    cudaMemcpy((void**)&Nd, N, size, cudaMemcpyHostToDevice);

    //allocate Pd on device
    cudaMalloc((void**)&Pd, size);

    //declaring 2 threads PER BLOCK
    dim3 dimBlock(2, 2);
    //declaring 2 tiles (1 tile = 1 block) PER GRID
    dim3 dimGrid(tile_width, tile_width);

    MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, width, tile_width);

    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}
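In case it helps, the relevant part of my main() boils down to this (a simplified sketch of the attached file; variable names may differ slightly):

int width = 4;       // 4x4 matrices, matching the output above
int tile_width = 2;  // each block is 2x2 threads

float M[16], N[16], P[16];
for (int row = 0; row < width; ++row)
    for (int col = 0; col < width; ++col)
    {
        M[row * width + col] = (float)(row + 1);  // rows of 1s, 2s, 3s, 4s
        N[row * width + col] = (float)(row + 2);  // rows of 2s, 3s, 4s, 5s
        P[row * width + col] = 0.0f;
    }

MatrixMulOnDevice(M, N, P, width, tile_width);
// ...then M, N and P ("hostresult") are printed as shown above.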
Full code is also attached. Any help is greatly appreciated.
Thanks
-Alan