Hello Everyone
i am new to Cuda and currently using Nvidia programming guide and David Krik lectures to learn Cuda. On basis of that i have done everything correctly but still getting errors on execution of program. Another important thing is that all the errors are in Kernel Function only. The program is as follows-
global_ void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width)
{
shared float Mds[2][2];
` shared float Nds[2][2];
float bx = blockIdx.x;
float by=blockIdx.y;
float tx = threadIdx.x;
float ty = threadIdx.y;
// Identify the row and column of the Pd element to work on
float Row = by * 2 + ty;
float Col = bx * 2 + tx;
float Pvalue = 0;
// Loop over the Md and Nd tiles required to compute the Pd element
for (int m = 0; m <(Width/2); ++m)
{
// Coolaborative loading of Md and Nd tiles into shared memory
Mds[ty][tx] = Md[RowWidth+(m2+tx)];
Nds[tx][ty] = Nd[Col+(m*2+ty)*Width];
__syncthreads();
for (int k = 0; k < 2; ++k)
{
Pvalue += Mds[tx][k] * Nds[k][ty];
}
Pd[Row*Width+Col] = Pvalue;
__syncthreads();
}
}
void main()
{
float *Md_h,*Md_d,*Nd_h,*Nd_d,*Pd_h,*Pd_d;
const N=1000;
int i, Width;
size_t size=Nsizeof(float);
Md_h=(float)malloc(size);
cudaMalloc((void**)&Md_d,size);
Nd_h=(float*)malloc(size);
cudaMalloc((void**)&Nd_d,size);
Pd_h=(float*)malloc(size);
cudaMalloc((void**)&Pd_d,size);
printf(“enter the Width of matrix”);
scanf(“%d”,&Width);
printf(“enter the element of 1st matrix”);
for(i=0 ;i<(Width*Width);i++)
{
scanf(“%d”,&Md_h[i]);
}
cudaMemcpy(Md_d,Md_h,size,cudaMemcpyHostToDevice);
printf(“enter the element of 2nd matrix”);
for(i=0;i<(Width*Width);++i)
{
scanf(“%d”,&Nd_h[i]);
}
cudaMemcpy(Nd_d,Nd_h,size,cudaMemcpyHostToDevice);
printf(“maltiplied matrix is:-\n”);
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
dim3 dimBlock(Width,Width,1);
dim3 dimGrid(1,1);
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md_d,Nd_d,Pd_d,Width);
CUT_SAFE_CALL( cutStopTimer( timer));
printf(“Processing time: %f (ms)\n”, cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
cudaMemcpy(Pd_h,Pd_d,size,cudaMemcpyDeviceToHost);
for(i=0;i<(Width*Width);++i)
{
printf(“%d”,Pd_h[i]);
}
getch();
free(Md_h);
cudaFree(Md_d);
free(Nd_h);
cudaFree(Nd_d);
free(Pd_h);
cudaFree(Pd_d);
}
on execution it is showing errors in shared memory declaration. The errors are
1- Unrecognized token.
2- Nds is undefined.
3- function must have integral of enum type.
There seems some problem in declaration of shared memory. Can anyone please help me with the declaration of this.
Thanks a Lot In Advance.