Shared memory error

Hello, I’ve written matrix multiplication code to compute C = AB using shared memory, but something is odd.

My matrix is a 1024 × 1024 square matrix. To compute the product, I run the following device code
(BLOCK_SIZE → 32)

/*
 * Tiled matrix multiplication C = A * B for row-major n x n matrices of
 * doubles, staging BLOCK_SIZE x BLOCK_SIZE tiles of A and B in shared memory.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE): every thread fills exactly one
 *     element of each shared tile, so any other block shape leaves tile entries
 *     unwritten.
 *   - the grid must provide ceil(n / BLOCK_SIZE) blocks in x and in y to cover
 *     all of C; threads that fall outside the matrix are masked off.
 *   - static shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(double) bytes.
 *
 * Parameters:
 *   ad, bd  device pointers to the input matrices A and B (n * n doubles each)
 *   cd      device pointer to the output matrix C (n * n doubles)
 *   n       matrix dimension; does not have to be a multiple of BLOCK_SIZE
 */
__global__ void gpu_matrixmult_ShMem(double* ad, double* bd, double* cd, int n)
{
    __shared__ double as[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Column (x) and row (y) of the element of C computed by this thread.
    const int x = (blockIdx.x * blockDim.x) + tx;
    const int y = (blockIdx.y * blockDim.y) + ty;

    double v = 0.;

    // Round up so that a trailing partial tile is still processed when n is
    // not a multiple of BLOCK_SIZE.
    const int s = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int m = 0; m < s; m++) {
        const int aCol = m * BLOCK_SIZE + tx;   // column of A loaded by this thread
        const int bRow = m * BLOCK_SIZE + ty;   // row of B loaded by this thread

        // Out-of-range positions are padded with zero: they then contribute
        // nothing to the dot product and no out-of-bounds global read occurs.
        as[ty][tx] = (y < n && aCol < n) ? ad[(size_t)y * n + aCol] : 0.;
        bs[ty][tx] = (bRow < n && x < n) ? bd[(size_t)bRow * n + x] : 0.;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Use BLOCK_SIZE (not a literal 32) so the loop always matches the tile size.
        for (int i = 0; i < BLOCK_SIZE; i++) {
            v += as[ty][i] * bs[i][tx];
        }

        // Every thread must finish reading before the tiles are overwritten.
        __syncthreads();
    }

    // The guard sits after the loop so that all threads of the block reach
    // every __syncthreads() above, including those outside the matrix.
    if (x < n && y < n) {
        cd[(size_t)y * n + x] = v;
    }
}

The host function is below:

/*
 * Computes C = A * B on GPU 0 with gpu_matrixmult_ShMem and times the run.
 *
 * Parameters:
 *   a, b  host pointers to the N x N input matrices; both are passed to
 *         Initialize(a, b) before anything else happens
 *   c     host pointer that receives the N x N result
 *   N     matrix dimension; must be positive and a multiple of BLOCK_SIZE,
 *         because the grid is sized as N / BLOCK_SIZE blocks per dimension
 *
 * Returns the elapsed time in milliseconds, measured with CUDA events from just
 * before the device allocations until the result has been copied back to the
 * host. Returns -1.0 if N is unsupported or if any CUDA call (including the
 * kernel launch or its execution) reports an error; in that case the contents
 * of c must not be trusted.
 */
double GPU_KENERL_CODE_ON_HOST_OPTI(double *a,double *b,double *c,int N)
{
    Initialize(a,b);

    // The launch below only covers whole BLOCK_SIZE x BLOCK_SIZE tiles.
    if (N <= 0 || N % BLOCK_SIZE != 0) {
        return -1.0;
    }

    const size_t bytes = sizeof(double) * N * N;

    // Null-initialised so the unconditional cudaFree calls at the end are safe
    // even when an allocation was never reached.
    double *a_Ptr = 0;
    double *b_Ptr = 0;
    double *c_Ptr = 0;

    float time1 = 0.0f;

    if (cudaSetDevice(0) != cudaSuccess) {
        return -1.0;
    }

    cudaEvent_t start, stop;
    if (cudaEventCreate(&start) != cudaSuccess) {
        return -1.0;
    }
    if (cudaEventCreate(&stop) != cudaSuccess) {
        cudaEventDestroy(start);
        return -1.0;
    }

    // Each step runs only if everything before it succeeded, so the first
    // failure is preserved in `status` and the cleanup below always executes.
    cudaError_t status = cudaEventRecord(start, 0);

    if (status == cudaSuccess) status = cudaMalloc((void**)&a_Ptr, bytes);
    if (status == cudaSuccess) status = cudaMalloc((void**)&b_Ptr, bytes);
    if (status == cudaSuccess) status = cudaMalloc((void**)&c_Ptr, bytes);

    if (status == cudaSuccess) status = cudaMemcpy(a_Ptr, a, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(b_Ptr, b, bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(N / threads.x, N / threads.y);
        gpu_matrixmult_ShMem<<<grid, threads>>>(a_Ptr, b_Ptr, c_Ptr, N);

        // A kernel launch returns nothing; an invalid configuration (for
        // example too many threads or resources per block) is only visible here.
        status = cudaGetLastError();
    }

    // Surfaces errors raised while the kernel was executing.
    if (status == cudaSuccess) status = cudaDeviceSynchronize();

    if (status == cudaSuccess) status = cudaMemcpy(c, c_Ptr, bytes, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) status = cudaEventRecord(stop, 0);
    if (status == cudaSuccess) status = cudaEventSynchronize(stop);
    if (status == cudaSuccess) status = cudaEventElapsedTime(&time1, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(a_Ptr);
    cudaFree(b_Ptr);
    cudaFree(c_Ptr);

    return (status == cudaSuccess) ? time1 : -1.0;
}

With this code the output is not accurate, but if I change the thread and grid dimensions, for example to threads(16,16) and grid(8,8), almost all of the output is accurate. (Of course, some of it is still odd.)

What’s the problem?

My setup is a GTX 670 on Windows 7 64-bit.

Hello,

I think that any problem with out-of-bounds accesses should be detected using cuda-memcheck.