Hello.
I wanna run kernel code for cuda large matrix multiplication.(ex -1024x1024, 2048x2048 or 4096x4096)
My code is here
global void GPU_KERNEL_CODE(double *a, double *b,double *c,int N)
{
double Cvalue = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row > N || col > N) return;
for (int e = 0; e < N; ++e){
Cvalue += (a[row * N + e]) * (b[e * N + col]);
}
c[row * N + col] = Cvalue;
}
(Above code is kernel code which is run on device)
double *a_Ptr;
double *b_Ptr;
double *c_Ptr;
cudaEvent_t start,stop; //set cudaEvent
float time1;
cudaEventCreate(&start); cudaEventCreate(&stop); // start & creation of cudaEvent
cudaEventRecord( start, 0 ); // staring point of cudaEventRecrod.
cudaMalloc((void**)&a_Ptr,sizeof(double)*N*N);
cudaMalloc((void**)&b_Ptr,sizeof(double)*N*N);
cudaMalloc((void**)&c_Ptr,sizeof(double)*N*N);
cudaMemcpy(a_Ptr,a,sizeof(double)*N*N,cudaMemcpyHostToDevice);
cudaMemcpy(b_Ptr,b,sizeof(double)*N*N,cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid((N-1+dimBlock.x)/dimBlock.x,(N-1+dimBlock.y)/dimBlock.y);
GPU_KERNEL_CODE<<<dimGrid, dimBlock>>>(a_Ptr,b_Ptr,c_Ptr,N);
cudaThreadSynchronize();
cudaMemcpy(c,c_Ptr,sizeof(double)*N*N,cudaMemcpyDeviceToHost);
cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop );
cudaEventElapsedTime( &time1, start, stop ); // Estimation of ElapsedTime
cudaEventDestroy( start ); cudaEventDestroy( stop );
cudaFree(a_Ptr);
cudaFree(b_Ptr);
cudaFree(c_Ptr);
Host code is here.
BLOCK SIZE is 16.
But output is wrong.
What’s the problem?
(Such as in 1024x1024, output of element, which index of array is 1000, -331. But my code is -1169.4)
This array is col-major data.
System → (gtx670 & win7 & vs2010)