Large matrix multiplication - output is odd.

Hello.

I wanna run kernel code for cuda large matrix multiplication.(ex -1024x1024, 2048x2048 or 4096x4096)

My code is here

global void GPU_KERNEL_CODE(double *a, double *b,double *c,int N)
{
double Cvalue = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row > N || col > N) return;
for (int e = 0; e < N; ++e){
Cvalue += (a[row * N + e]) * (b[e * N + col]);
}
c[row * N + col] = Cvalue;

}

(Above code is kernel code which is run on device)

double *a_Ptr;
double *b_Ptr;
double *c_Ptr;

cudaEvent_t start,stop; //set cudaEvent
float time1;
cudaEventCreate(&start); cudaEventCreate(&stop); // start & creation of cudaEvent
cudaEventRecord( start, 0 ); // staring point of cudaEventRecrod. 


cudaMalloc((void**)&a_Ptr,sizeof(double)*N*N);
cudaMalloc((void**)&b_Ptr,sizeof(double)*N*N);	
cudaMalloc((void**)&c_Ptr,sizeof(double)*N*N);

cudaMemcpy(a_Ptr,a,sizeof(double)*N*N,cudaMemcpyHostToDevice);
cudaMemcpy(b_Ptr,b,sizeof(double)*N*N,cudaMemcpyHostToDevice);	


dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid((N-1+dimBlock.x)/dimBlock.x,(N-1+dimBlock.y)/dimBlock.y);

GPU_KERNEL_CODE<<<dimGrid, dimBlock>>>(a_Ptr,b_Ptr,c_Ptr,N);
cudaThreadSynchronize();
cudaMemcpy(c,c_Ptr,sizeof(double)*N*N,cudaMemcpyDeviceToHost);

cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop );
cudaEventElapsedTime( &time1, start, stop );  // Estimation of ElapsedTime
cudaEventDestroy( start ); cudaEventDestroy( stop );


cudaFree(a_Ptr);
cudaFree(b_Ptr);
cudaFree(c_Ptr);

Host code is here.

BLOCK SIZE is 16.

But output is wrong.

What’s the problem?

(Such as in 1024x1024, output of element, which index of array is 1000, -331. But my code is -1169.4)

This array is col-major data.

System -> (gtx670 & win7 & vs2010)