Hi!
I’ve been trying to set up a very simple matrix multiplication application, i.e. without shared memory usage.
The algorithm works, but after I copy the results back to the host and display them, some of the rows have the value zero (the rows with zero values change at every new execution).
First I thought that maybe the host starts copying the results from the device before the kernel finishes its execution, but as the execution takes place in the default stream (stream 0), all the calls are synchronous? Or am I wrong?
Here’s the code for those of you whould like to take a look:
/************************************************************
************/
/* main Method */
/************************************************************
************/
int main(int argc, char* argv[])
{
if(!InitCUDA()) {
return 0;
}
mulMatrix(8,8);
cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! CPU method
////////////////////////////////////////////////////////////////////////////////
void mulMatrix(int n_rows, int n_cols)
{
int n_blocks_x = (n_cols+TILE_DIM-1)/TILE_DIM;
int n_blocks_y = (n_rows+TILE_DIM-1)/TILE_DIM;
int threads_per_block = TILE_DIM * TILE_DIM;
cudaEvent_t start, stop;
float time;
unsigned int timer_CPU = 0;
unsigned int timer_GPU = 0;
cutilCheckError( cutCreateTimer( &timer_CPU));
cutilCheckError( cutCreateTimer( &timer_GPU));
unsigned int mem_size = sizeof( float) * n_rows * n_cols;
// allocate host memory
float* h_matrix_A = (float*) malloc( mem_size);
float* h_matrix_B = (float*) malloc( mem_size);
float *h_val = (float*) malloc(3*sizeof( float));
// initalize the memory
for( unsigned int i = 0; i < n_rows; ++i)
for( unsigned int j = 0; j < n_cols; ++j)
{
h_matrix_A[i*n_cols+j]=i+j;
h_matrix_B[i*n_cols+j]=i+j;
}
//Start timer
cutilCheckError( cutStartTimer( timer_GPU));
// allocate device memory
float* d_matrix_A;
float* d_matrix_B;
cudaMalloc( (void**) &d_matrix_A, mem_size);
cudaMalloc( (void**) &d_matrix_B, mem_size);
float* d_val;
cudaMalloc( (void**) &d_val, 3*sizeof( float));
// copy host memory to device
cudaMemcpy( d_matrix_A, h_matrix_A, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy( d_matrix_B, h_matrix_B, mem_size, cudaMemcpyHostToDevice);
// allocate device memory for result
float* d_matrix_C;
cudaMalloc( (void**) &d_matrix_C, mem_size);
// setup execution parameters
dim3 grid( n_blocks_x, n_blocks_y, 1);
dim3 threads( TILE_DIM, TILE_DIM, 1);
//cudaEventRecord(start,0);
// execute the kernel
mulMatrixKernel<<< grid, threads>>>( d_matrix_A, d_matrix_B, d_matrix_C, n_rows, n_cols, d_val);
//cudaEventRecord(stop,0);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// allocate mem for the result on host side
float* h_matrix_C = (float*) malloc( mem_size);
// copy result from device to host
cutilSafeCall(cudaMemcpy( h_matrix_C, d_matrix_C, mem_size, cudaMemcpyDeviceToHost));
cudaMemcpy( h_val, d_val, 3*sizeof(float), cudaMemcpyDeviceToHost);
cutilCheckError( cutStopTimer( timer_GPU));
printMatrix(h_matrix_A, n_rows, n_cols);
printMatrix(h_matrix_B, n_rows, n_cols);
printMatrix(h_matrix_C, n_rows, n_cols);
// cleanup memory
free( h_matrix_A);
free( h_matrix_B);
free( h_matrix_C);
cudaFree(d_matrix_A);
cudaFree(d_matrix_B);
cudaFree(d_matrix_C);
cudaThreadExit();
}
void printMatrix(float *m, int r, int c)
{
for(int i=0;i<r;i++)
{
for(int j=0;j<r;j++)
printf("%4.0f ",m[i*c+j]);
printf("\n");
}
printf("\n");
}
////////////////////////////////////////////////////////////////////////////////
//! Kernel
////////////////////////////////////////////////////////////////////////////////
__global__ void mulMatrixKernel( float* g_matrix_A, float* g_matrix_B, float* g_matrix_C, int rows, int cols, float* g_val)
{
// access thread id
const unsigned int row = blockIdx.y*TILE_DIM+threadIdx.y;
const unsigned int col = blockIdx.x*TILE_DIM+threadIdx.x;
float sum=0.0f;
//perform computation
if(row<rows && col<cols)
for(int i=0;i<rows;i++)
sum+=g_matrix_A[row*cols+i]*g_matrix_B[i*cols+col];
g_matrix_C[row*cols+col]=sum;
}