Hello everyone,
I am working on optimizing a CUDA program, and I started with a matrix-multiplication kernel. The threading scheme I used for parallelization is blockSize(1, 1) and gridSize(N, N). I am using surface memory as a memory optimization (shared memory is not usable with this threading scheme, since each block has only one thread). When I compare the execution time before and after the change, I found that the surface-memory version takes about twice as long (I have tried different threading schemes, but the problem remains the same). From what I have read so far, global memory is slower than surface memory, so using surface memory should take less time. Below is the matrix-multiplication program with surface memory in use. Can somebody tell me what the problem is?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
//#define N 3
// Legacy 2D surface references, bound to cudaArrays in main().
// NOTE(review): surface references are deprecated (removed in CUDA 12);
// new code should use surface objects (cudaSurfaceObject_t) instead.
surface<void, 2> a_surf;  // input matrix A (int elements)
surface<void, 2> b_surf;  // input matrix B (int elements)
surface<void, 2> c_surf;  // output matrix written by the mul kernel
// Abort the program with a readable diagnostic if a CUDA runtime call failed.
//   call - status code returned by the CUDA API call being checked
//   line - source line of the call site (pass __LINE__)
// On failure prints the line, the numeric error code, and the runtime's
// human-readable message to stderr, then exits with a nonzero status.
void CUDA_SAFE_CALL(cudaError_t call, int line)
{
    // A plain success test is clearer than the original one-case switch.
    if (call != cudaSuccess) {
        // Original format string was garbled ("line :%i.%d' '"); emit a
        // coherent message, and to stderr rather than stdout.
        fprintf(stderr, "ERROR at line %d: (%d) %s\n",
                line, (int)call, cudaGetErrorString(call));
        exit(-1);
    }
}
// Matrix-multiply kernel reading A and B and writing C through the file-scope
// surface references. Expected launch: a 2D grid supplying at least N x N
// threads; each in-range thread produces one output element.
//   N - dimension of the square matrices
__global__ void mul(int N)
{
// a, b: elements fetched this iteration; temp: dot-product accumulator;
// c: value staged for the surface write.
int a,b,c,temp;
int i;
// Global 2D thread coordinates (x = column index, y = row index here).
unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
if (x < N && y < N) {
temp=0;
for(i= 0;i < N;i++)
{
// surf2Dread/surf2Dwrite take the x coordinate in BYTES — hence the *4
// scaling for 4-byte int elements (the y coordinate is in rows).
// These reads fetch A[i][x] (row i, col x) and B[y][i] (row y, col i),
// so the accumulated sum is (B*A)[y][x] in row-major terms, not (A*B).
// NOTE(review): harmless in main() because A == B there, but confirm the
// intended operand order for general inputs.
surf2Dread(&a, a_surf, (x) * 4, i);
surf2Dread(&b, b_surf, (i) * 4, y);
temp +=a*b;
}
c=temp;
// Write to output surface
surf2Dwrite(c,c_surf, x * 4, y);
}
}
// Host driver: fills two N x N integer matrices, multiplies them on the GPU
// via surface memory, and reports the kernel execution time.
int main()
{
    const int N = 100;              // const so a/b/c are plain arrays, not C++-nonstandard VLAs
    int a[N][N], b[N][N], c[N][N];  // ~40 KB each on the stack — fine at N=100
    int i, j;
    int temp;
    clock_t t1, t2;
    cudaArray *da, *db, *dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int>();
    // PERF NOTE(review): 1 thread per block occupies a single lane of every
    // 32-lane warp, which is the dominant cause of the observed slowness —
    // prefer e.g. dimBlock(16, 16) with a ceil-divided grid.
    dim3 dimBlock(1, 1);
    dim3 dimGrid(N, N);

    // Both inputs get the values 1..N*N in row-major order (A == B).
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i][j] = ++temp;
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i][j] = ++temp;

    CUDA_SAFE_CALL(cudaMallocArray(&da, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&db, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&dc, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);

    int s = N * N * sizeof(int);
    // NOTE(review): cudaMemcpyToArray/cudaMemcpyFromArray are deprecated;
    // cudaMemcpy2DToArray/cudaMemcpy2DFromArray are the modern replacements.
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);

    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);

    t1 = clock();
    mul<<<dimGrid, dimBlock>>>(N);
    // FIX: kernel launches are asynchronous. Without the two calls below,
    // t2 - t1 measured only the launch overhead, not the kernel runtime.
    CUDA_SAFE_CALL(cudaGetLastError(), __LINE__);       // catch bad-launch-config errors
    CUDA_SAFE_CALL(cudaDeviceSynchronize(), __LINE__);  // wait for the kernel to finish
    t2 = clock();

    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);

    double t3 = (double)t2 - (double)t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf", t3);

    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
    return 0;
}