I have wrote a matrix multiplication program for CUDA in matlab. I set my block size to 16 times 16, since the programming guide said that this value is somewhat arbitrary. I set the grid dimension as : dimGrid((B.width+dimBlock.x-1)/dimBlock.x,(A.height+dimBlock.y-1)/dimBlock.y); where B.width and A.height are the dimension of the resultant matrix. The matrix are stored in column major format.
However, there’s some strange behavior when I run my code. The code runs fine for only certain dimension of matrix, and those which work doesn’t seem like a multiple of certain number to me. Also, the code give some strange result (like, some of the entries fill with zeros) for quite a number of matrix. Whenever this happen, my monitor screen will flicker a little.
I am wondering if my thread block size is correct? Or does the problem lies in my grid size setting? If so, how may i fix it?
Below are my course code
[codebox]#include “mex.h”
#include “cuda.h”
typedef struct{
int width;
int height;
float* elements;
}Matrix;
#define BLOCK_SIZE 16
global void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
float Cvalue=0;
int row=blockIdx.y*blockDim.y+threadIdx.y;
int col=blockIdx.x*blockDim.x+threadIdx.x;
for(int e=0; e<A.width; ++e)
{
if(row<A.height&&col<B.width)
{
Cvalue += A.elements[e*A.height+row]B.elements[colB.height + e];
}
}
C.elements[col*C.height +row ]=Cvalue;
}
void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)
{
Matrix A,B,C;
Matrix dA,dB,dC;
int dim[2];
A.width=mxGetN(prhs[0]);
A.height=mxGetM(prhs[0]);
A.elements=(float*)mxGetData(prhs[0]);
B.width=mxGetN(prhs[1]);
B.height=mxGetM(prhs[1]);
B.elements=(float*)mxGetData(prhs[1]);
C.width=B.width;
C.height=A.height;
dim[0]=C.height;
dim[1]=C.width;
plhs[0]=mxCreateNumericArray(2,dim,mxSINGLE_CLASS,mxREAL);
C.elements=(float*)mxGetData(plhs[0]);
dA.width=A.width;
dA.height=A.height;
dB.width=B.width;
dB.height=B.height;
dC.width=C.width;
dC.height=C.height;
size_t size=A.widthA.heightsizeof(float);
cudaMalloc((void**)&dA.elements,size);
cudaMemcpy(dA.elements,A.elements,size,cudaMemcpyHostToDevic
e);
size=B.widthB.heightsizeof(float);
cudaMalloc((void**)&dB.elements,size);
cudaMemcpy(dB.elements,B.elements,size,cudaMemcpyHostToDevic
e);
size=C.widthC.heightsizeof(float);
cudaMalloc((void**)&dC.elements,size);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid((B.width+dimBlock.x-1)/dimBlock.x,(A.height+dimBlock.y-1)/dimBlock.y);
MatMulKernel<<<dimGrid,dimBlock>>>(dA,dB,dC);
cudaMemcpy(C.elements,dC.elements,size,cudaMemcpyDeviceToHos
t);
cudaFree(dA.elements);
cudaFree(dB.elements);
cudaFree(dC.elements);
}
[/codebox]