Im new to CUDA programming so I’ve been trying out Matrix Multiplication with various Block and Grid dimensions and mapping the Matrix elements to threads in different ways.

I have written this kernel to map each thread to one element of the resultant matrix and assigned each thread in the Z direction to carry out one multiplication and finally add the result to P.

The result is correct if I use a 2D grid of threads (i.e blockDim.z=1). If I increase blockDim.z > 1 The results are incorrect. I have verified that the multiplication carried out by each thread is correct in every stage and I believe something goes haywire when the threads add their final results to the destination array.

Is there something obvious that I am missing here?

Also, cudaMemset gives me an invalid argument error but that doesnt seem to affect the result.

```
#include<iostream>
using namespace std;
__global__
void matrix(float *M, float *N, float *P, int X, int Y, int Z)
{
int T = 0;
int iteration = 0;
int iterz = 0;
for (int i = (threadIdx.x*blockDim.y + threadIdx.y);i < X*Y;i += blockDim.x*blockDim.y)
{
iteration++;
for (int k = threadIdx.z;k < Z;k += blockDim.z)
{
iterz++;
P[i] += M[(i / Y)*Z + k] * N[k*Y + (i%Y)];
T += M[(i / Y)*Z + k] * N[k*Y + (i%Y)];
if(i == 0)
printf(" %d, %d, %d = %d\n", threadIdx.x, threadIdx.y, threadIdx.z, T);
}
}
//printf("%d, %d, %d, Iteration is %d %d =%d\n", threadIdx.x, threadIdx.y, threadIdx.z, iteration, iterz,T);
}
void printarray(float *M, int X, int Y)
{
for (int i = 0;i < X;i++)
{
for (int j = 0;j <Y;j++)
cout << M[i*Y + j] << " ";
cout << endl;
}
cout << endl;
}
void initarray(float*M, int X, int Y)
{
for (int i = 0;i < X*Y;i++)
M[i] =(float) i + 1;
}
int main()
{
int Mx = 5, My = 5, Nx = 5, Ny = 5;
float *M, *N, *P;
cudaMallocManaged(&M, Mx*My * sizeof(float));
cudaMallocManaged(&N, Nx*Ny * sizeof(float));
cudaMallocManaged(&P, Mx*Ny*My * sizeof(float));
cout << cudaGetErrorString(cudaGetLastError()) << endl;
cudaMemset(&P, 0, Mx*Ny*My * sizeof(float));
cout << cudaGetErrorString(cudaGetLastError()) << endl;
initarray(M, Mx, My);
initarray(N, Nx, Ny);
printarray(M, Mx, My);
printarray(N, Nx, Ny);
if (My == Nx)
{
//dim3 Griddim(Mx, Ny, 1);
dim3 ThreadBlock(Mx - 1, Ny - 1, 5);
matrix << <1, ThreadBlock >> > (M, N, P, Mx, Ny, My); // M X N
cudaDeviceSynchronize();
printarray(P, Mx, Ny);
}
else
cout << "\nDimension Mismatch\n";
cudaFree(M);
cudaFree(N);
cudaFree(P);
return 0;
}
```