Hi,
I am using the compilation utility for running the CUDA kernel from Matlab.
I am implementing the matrix multiplication for brushing up my CUDA skills.
Here is the code that I am using.
#include "cuda.h"
#include "mex.h"
__global__ void mmkernel(float *a, float *b, float *c, int N, int M, int K)
{
int i = blockIdx.x * 64 + threadIdx.x;
int j = blockIdx.y;
int tx = threadIdx.x;
__shared__ float cb[32];
float sum0 = 0.0f, sum1=0.0f;
for (int ks = 0; ks < M; ks+= 32) {
cb[tx] = b[ks+tx+M*j];
__syncthreads();
for (int k = ks; k< ks+32; k++) {
sum0 += a[i+N*k] * cb[k-ks];
sum1 += a[i+32+N*k] * cb[k-ks];
}
__syncthreads();
}
c [i+N*j] = sum0;
c[i+32+N*j] = sum1;
}
void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
{
int i,j,m,n,o,p;
double *data1,*data2,*m_ans;
float *h1,*h2,*h3;
float *d_data1,*d_data2,*d_ans;
m = mxGetM(prhs[0]);
n = mxGetN(prhs[0]);
o = mxGetM(prhs[1]);
p = mxGetN(prhs[1]);
plhs[0] = mxCreateDoubleMatrix(m,p,mxREAL);
data1 = mxGetPr(prhs[0]);
data2 = mxGetPr(prhs[1]);
h1 = (float*) mxMalloc(sizeof(float)*m*n);
h2 = (float*) mxMalloc(sizeof(float)*o*p);
for(i=0;i<m*n;i++)
{
h1[i] = float(data1[i]);
}
for(j=0;j< o*p;j++)
{
h2[i] = float(data2[j]);
}
h3 = (float*)mxMalloc(sizeof(float)*m*p);
cudaMalloc((void**)&d_data1,sizeof(float)*m*n);
cudaMalloc((void**)&d_data2,sizeof(float)*o*p);
cudaMalloc((void**)&d_ans,sizeof(float)*m*p);
cudaMemcpy(d_data1,h1,sizeof(float)*m*n,cudaMemcpyHostToDevice);
cudaMemcpy(d_data2,h2,sizeof(float)*o*p,cudaMemcpyHostToDevice);
dim3 dimBlock(32);
dim3 dimGrid(n/64,n);
mmkernel<<<dimGrid,dimBlock>>>(d_data1,d_data2,d_ans,m,n,p);
cudaMemcpy(h3,d_ans,sizeof(float)*m*p,cudaMemcpyDeviceToHost);
m_ans = mxGetPr(plhs[0]);
for(j = 0;j<m*p;j++)
{
m_ans[j] = (double)h3[j];
}
mxFree(h1);
mxFree(h2);
mxFree(h3);
cudaFree(d_data1);
cudaFree(d_data2);
cudaFree(d_ans);
}
After passing the two matrices as
f = ones(512,512) and g = ones(512,512)
to h = matmul(f,g) which is the resultant CUDA kernel for mex, I am getting all values of h as 0.
Can anyone help me by providing me hints, as to where am I going wrong?
Thanks in advance.
Cheers.