Hello, I've written matrix multiplication code to compute C = AB using shared memory, but something is odd.
My matrices are square, 1024 × 1024. To solve this problem, I run the following device code
(BLOCK_SIZE is defined as 32):
/*
 * Tiled matrix multiplication C = A * B for square n x n matrices of doubles
 * stored row-major (element (row, col) lives at index row * n + col).
 *
 * Parameters:
 *   ad, bd - device pointers to the input matrices A and B (n * n doubles each)
 *   cd     - device pointer to the output matrix C (n * n doubles)
 *   n      - matrix dimension
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE): every thread loads exactly one
 *     element of each shared tile, so a smaller block leaves part of the tiles
 *     unwritten and the inner product then reads uninitialised shared memory.
 *   - The grid must cover the n x n output (ceil(n / BLOCK_SIZE) blocks per
 *     dimension covers every element; out-of-range threads are masked off).
 *   - Static shared memory used: 2 * BLOCK_SIZE * BLOCK_SIZE doubles.
 */
__global__ void gpu_matrixmult_ShMem(double* ad, double* bd, double* cd, int n)
{
    __shared__ double as[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double bs[BLOCK_SIZE][BLOCK_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = (blockIdx.x * blockDim.x) + tx;   // output column
    int y = (blockIdx.y * blockDim.y) + ty;   // output row

    double v = 0.;

    // Round up so the last, partially filled tile is processed when n is not
    // a multiple of BLOCK_SIZE.
    int s = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int m = 0; m < s; m++) {
        int ka = m * BLOCK_SIZE + tx;   // column of A loaded by this thread
        int kb = m * BLOCK_SIZE + ty;   // row of B loaded by this thread

        // Out-of-range positions are zero-filled so they add nothing to the
        // dot product. size_t keeps row * n from overflowing int for large n.
        as[ty][tx] = (y < n && ka < n) ? ad[(size_t)y * n + ka] : 0.;
        bs[ty][tx] = (kb < n && x < n) ? bd[(size_t)kb * n + x] : 0.;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        // Loop bound follows the tile size instead of a hard-coded 32.
        for (int i = 0; i < BLOCK_SIZE; i++) {
            v += as[ty][i] * bs[i][tx];
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    // The guard sits after the loop so every thread of the block reaches each
    // __syncthreads() above, including threads outside the matrix.
    if (x < n && y < n) {
        cd[(size_t)y * n + x] = v;
    }
}
The host-side code is below:
/*
 * Computes c = a * b on device 0 with gpu_matrixmult_ShMem and times it.
 *
 * Parameters:
 *   a, b - host input matrices, N * N doubles each; both are passed to
 *          Initialize() before being uploaded
 *   c    - host output matrix, N * N doubles
 *   N    - matrix dimension; must be a positive multiple of BLOCK_SIZE because
 *          the grid is sized as N / BLOCK_SIZE blocks per dimension
 *
 * Returns the elapsed time in milliseconds as measured by CUDA events. The
 * interval covers device allocation, both uploads, the kernel and the
 * download. Returns -1.0 when N is unsupported or any CUDA call fails; in that
 * case the contents of c are unspecified.
 */
double GPU_KENERL_CODE_ON_HOST_OPTI(double *a,double *b,double *c,int N)
{
    Initialize(a,b);

    // A floor-divided grid would silently leave the tail of the matrix
    // uncomputed, so sizes that do not tile evenly are rejected up front.
    if (N <= 0 || N % BLOCK_SIZE != 0) {
        return -1.0;
    }

    double *a_Ptr = 0;
    double *b_Ptr = 0;
    double *c_Ptr = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float time1 = 0.0f;
    const size_t bytes = (size_t)N * N * sizeof(double);

    // Each step runs only if everything before it succeeded; the first
    // failure is kept in err and falls through to the common cleanup.
    cudaError_t err = cudaSetDevice(0);
    if (err == cudaSuccess) err = cudaEventCreate(&start);
    if (err == cudaSuccess) err = cudaEventCreate(&stop);
    if (err == cudaSuccess) err = cudaEventRecord(start, 0);   // timing starts here

    if (err == cudaSuccess) err = cudaMalloc((void**)&a_Ptr, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&b_Ptr, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&c_Ptr, bytes);

    // Blocking copies: no extra device synchronisation is needed around them.
    if (err == cudaSuccess) err = cudaMemcpy(a_Ptr, a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(b_Ptr, b, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(N / threads.x, N / threads.y);
        gpu_matrixmult_ShMem<<<grid, threads>>>(a_Ptr, b_Ptr, c_Ptr, N);
        // A launch does not return a status; an invalid configuration (for
        // example a block size the device or build target cannot run) is only
        // visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // This copy waits for the kernel and also reports errors raised while the
    // kernel was executing.
    if (err == cudaSuccess) err = cudaMemcpy(c, c_Ptr, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) err = cudaEventRecord(stop, 0);
    if (err == cudaSuccess) err = cudaEventSynchronize(stop);
    if (err == cudaSuccess) err = cudaEventElapsedTime(&time1, start, stop);

    // Cleanup runs on success and on failure alike so nothing leaks.
    if (start) cudaEventDestroy(start);
    if (stop) cudaEventDestroy(stop);
    cudaFree(a_Ptr);   // freeing a null pointer is a no-op
    cudaFree(b_Ptr);
    cudaFree(c_Ptr);

    return (err == cudaSuccess) ? (double)time1 : -1.0;
}
With this configuration the results are not accurate, but if I change the thread and grid dimensions, e.g. threads(16,16) and grid(8,8), almost all of the output is accurate. (Of course, some values are still wrong.)
What is the problem?
My setup is a GTX 670 on Windows 7 64-bit.