Really weird behaviour of a GTX 260 GPU


I’ve been trying to set up a very simple matrix multiplication application, i.e. one that does not use shared memory.

The algorithm works, but after I copy the results back to the host and display them, some of the rows contain only zeros (and which rows come out as zero changes with every execution).

At first I thought the host might start copying the results from the device before the kernel has finished executing, but since everything runs in the default stream (stream 0), shouldn't all of those calls be executed in order? Or am I wrong?
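For reference, here is a rough sketch of how I picture forcing the ordering by hand, with an explicit barrier and an error query right after the launch (the cudaThreadSynchronize() call and the error print are only illustrative additions, they are not in my actual code below):

	// launch the kernel exactly as in mulMatrix() below
	mulMatrixKernel<<< grid, threads >>>( d_matrix_A, d_matrix_B, d_matrix_C, n_rows, n_cols, d_val);

	// explicit barrier: block the host until the kernel has really finished
	cudaThreadSynchronize();

	// query and print any launch / execution error before touching the results
	printf("kernel status: %s\n", cudaGetErrorString(cudaGetLastError()));

	// only then copy the result matrix back to the host
	cudaMemcpy( h_matrix_C, d_matrix_C, mem_size, cudaMemcpyDeviceToHost);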

Here’s the code for those of you who would like to take a look:



/* main method */



int main(int argc, char* argv[])
{
	if(!InitCUDA()) {
		return 0;
	}

	// ... (call mulMatrix() with the desired matrix dimensions) ...

	cutilExit(argc, argv);
}



//! CPU method


void mulMatrix(int n_rows, int n_cols)
{


	int n_blocks_x = (n_cols+TILE_DIM-1)/TILE_DIM;

	int n_blocks_y = (n_rows+TILE_DIM-1)/TILE_DIM;
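	// the "+TILE_DIM-1" rounds the block counts up so partially filled tiles at the edges are still covered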

	int threads_per_block = TILE_DIM * TILE_DIM;


	cudaEvent_t start, stop;

	float time;

	unsigned int timer_CPU = 0;

	unsigned int timer_GPU = 0;

	cutilCheckError( cutCreateTimer( &timer_CPU));

	cutilCheckError( cutCreateTimer( &timer_GPU));


	unsigned int mem_size = sizeof( float) * n_rows * n_cols;

	// allocate host memory

	float* h_matrix_A = (float*) malloc( mem_size);

	float* h_matrix_B = (float*) malloc( mem_size);

	float *h_val = (float*) malloc(3*sizeof( float));

	// initialize the memory
	for( unsigned int i = 0; i < n_rows; ++i) 
		for( unsigned int j = 0; j < n_cols; ++j) 
		{
			// ... (fill h_matrix_A and h_matrix_B with the input values) ...
		}

	//Start timer

	cutilCheckError( cutStartTimer( timer_GPU));

	// allocate device memory

	float* d_matrix_A;

	float* d_matrix_B;

	cudaMalloc( (void**) &d_matrix_A, mem_size);

	cudaMalloc( (void**) &d_matrix_B, mem_size);

	float* d_val;

	cudaMalloc( (void**) &d_val, 3*sizeof( float));

	// copy host memory to device

	cudaMemcpy( d_matrix_A, h_matrix_A, mem_size, cudaMemcpyHostToDevice);

	cudaMemcpy( d_matrix_B, h_matrix_B, mem_size, cudaMemcpyHostToDevice);

	// allocate device memory for result

	float* d_matrix_C;

	cudaMalloc( (void**) &d_matrix_C, mem_size);

	// setup execution parameters

	dim3  grid( n_blocks_x, n_blocks_y, 1);

	dim3  threads( TILE_DIM, TILE_DIM, 1);
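	// one thread per output element: TILE_DIM x TILE_DIM threads per block,
	// and enough blocks in x and y to cover the whole n_rows x n_cols result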


	// execute the kernel

	mulMatrixKernel<<< grid, threads>>>( d_matrix_A, d_matrix_B, d_matrix_C, n_rows, n_cols, d_val);


	// check if kernel execution generated an error

	cutilCheckMsg("Kernel execution failed");

	// allocate mem for the result on host side

	float* h_matrix_C = (float*) malloc( mem_size);

	// copy result from device to host

	cutilSafeCall(cudaMemcpy( h_matrix_C, d_matrix_C, mem_size, cudaMemcpyDeviceToHost));

	cudaMemcpy( h_val, d_val, 3*sizeof(float), cudaMemcpyDeviceToHost);

	cutilCheckError( cutStopTimer( timer_GPU));

	printMatrix(h_matrix_A, n_rows, n_cols);

	printMatrix(h_matrix_B, n_rows, n_cols);

	printMatrix(h_matrix_C, n_rows, n_cols);

	// cleanup memory
	free( h_matrix_A);
	free( h_matrix_B);
	free( h_matrix_C);
	free( h_val);
	cudaFree( d_matrix_A);
	cudaFree( d_matrix_B);
	cudaFree( d_matrix_C);
	cudaFree( d_val);
}

void printMatrix(float *m, int r, int c)
{
	for(int i=0;i<r;i++)
	{
		for(int j=0;j<c;j++)
			printf("%4.0f  ",m[i*c+j]);
		printf("\n");
	}
}



//! Kernel


__global__ void mulMatrixKernel( float* g_matrix_A, float* g_matrix_B, float* g_matrix_C, int rows, int cols, float* g_val) 
{


  // global row and column of the output element this thread computes
  const unsigned int row = blockIdx.y*TILE_DIM+threadIdx.y;
  const unsigned int col = blockIdx.x*TILE_DIM+threadIdx.x;

  float sum=0.0f;

  // perform computation

  if(row<rows && col<cols)

	  for(int i=0;i<rows;i++)