Hi,
I am a little bit new to CUDA.I have written the following program for matrix multiplication using shared memory. The algorithm for this is given here on page number 9 :
However, I am getting following errors :
1- expression must have constant value, n these lines
i shared float as[tile_width][tile_width];
shared float bs[tile_width][tile_width];
2-expression must have integral or enum type n these lines
(
as[j][i] = a[Rowwidth+(mtile_width+i)];
bs[j][i] = b[(mtile_width+j)+widthCol];
c[Row*width+Col]=sum;
)
The complete program is given below. I would be thankful to you for any useful pointers in this regard.
Thanks,
Ankush
__global__ void matrixmul(int *a,int *b, int *c, int width,int tile_width)
{
__shared__ float as[tile_width][tile_width];
__shared__ float bs[tile_width][tile_width];
float k;
int i=threadIdx.x; nt j=threadIdx.y;
int p=blockIdx.x; int q=blockIdx.y;
float Row= (q * tile_width) +j;
float Col= (p * tile_width) +i;
float sum=0;
for( float m = 0; m<width/tile_width;++m)
{
as[j][i] = a[Row*width+(m*tile_width+i)];
bs[j][i] = b[(m*tile_width+j)+width*Col];
for(float k=0; k< tile_width; ++k)
{
sum + = as[j*width+k] * bs[k*width+i];
c[Row*width+Col]=sum;
}
}
}
int main()
{
int i,j,m,n,sum;
int Row,k,Col,tile_width,width;
int *a_h,*b_h,*a_d,*b_d,*c_h,*c_d;
const int N=1000;
size_t size= N*sizeof(int);
printf("enter the tile_width");
scanf("%d",&tile_width);
printf("enter the width");
scanf("%d",&width);
//Memory allocation on host and device, a_h, a_d
a_h=(int*)malloc(size);
cudaMalloc((void**)&a_d,size);
//Memory allocation on host and device, b_h, b_d
b_h=(int*)malloc(size);
cudaMalloc((void**)&b_d,size);
//Memory allocation on host and device, c_h, c_d
c_h=(int*)malloc(size);
cudaMalloc((void**)&c_d,size);
//User inputs (row=columns)
printf("enter the row & coloum of the 1st matrix m ");
scanf("%d%d",&Row,&k); //row = columns
printf("enter the element of 1st matrix m");
for(i=0;i<(Row*k);i++)
{
scanf("%d",&a_h[i]);
}
for(i=0;i<(Row*k);i++)
printf("\t%d",a_h[i]);
{
printf("\n");
}
//copying data (a_h) from Host to Device in a_d
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
printf("enter the row & coloum of 2nd matrix n");
scanf("%d%d",&k,&Col); //row = columns
printf("enter the element of 2nd matrix n");
for(j=0;j<(k*Col);j++)
{
scanf("%d",&b_h[j]);
}
for(j=0;j<(k*Col);j++)
printf("\t%d",b_h[j]);
{
printf("\n");
}
//copying data (b_h) from Host to Device in b_d
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
//Kernal call
dim3 dimBlock(4,4,1);
dim3 dimGrid(1,1,1);
matrixmul<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,width,tile_width);
//copying data (c_d) from Device to Host in c_h
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
//printing the results
for(i=0;i<(Row*Col);i++)
{
printf("%d",c_h[i]);
printf("\n");
}
getch();
free(a_h);
cudaFree(a_d);
free(b_h);
cudaFree(b_d);
free(c_h);
cudaFree(c_d);
}