Weird Matrix-Vector Results - Help?

Okay, this is weird.

I am trying to implement a general matrix-vector multiplication kernel (Ax = b, where A is a 2-D matrix and x, b are 1-D vectors) in CUDA, and the results keep changing between runs. I wanted to test the kernel on a small matrix-vector system before moving up to larger dimensions. The input matrix A is 5x5 with non-zero elements A[0][0] = 2.1, A[1][1] = 1.2, A[2][2] = 3.1, A[3][3] = 1.3, A[4][4] = 4.1. The input b-vector is 5x1 with non-zero elements b[0] = 1, b[1] = 0.5, b[2] = 0.5, b[3] = 0.5, b[4] = 1. The expected result vector is x = 2.1, 0.6, 1.55, 0.65, 4.1. Sometimes I get x = 2.1, 0.6, 1.55, 0.65, 3.1, sometimes 2.1, 0.6, 0.6, 0.6, 4.1, and once in a while I actually get the correct answer.
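For reference, this is the product I expect; here is a quick CPU check (just a throwaway sketch, not part of my CUDA code) that computes it:

#include <cstdio>

int main(){

	const int n = 5;
	float A[5][5] = {0};
	A[0][0] = 2.1f; A[1][1] = 1.2f; A[2][2] = 3.1f; A[3][3] = 1.3f; A[4][4] = 4.1f;
	float b[5] = {1.0f, 0.5f, 0.5f, 0.5f, 1.0f};
	float x[5];

	// x[i] = sum over k of A[i][k] * b[k]
	for(int i = 0; i < n; ++i){
		float sum = 0.0f;
		for(int k = 0; k < n; ++k)
			sum += A[i][k]*b[k];
		x[i] = sum;
	}

	// Prints: 2.1 0.6 1.55 0.65 4.1
	for(int i = 0; i < n; ++i)
		printf("%g ", x[i]);
	printf("\n");
	return 0;
}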

The key point is that it is never consistent. Has anyone seen this before? Please help if so.

Many thanks to anyone with ANY hints/ideas.

The code follows:

#define BLOCK_SIZE 16

__global__ void mul_multipleblocks(float *A, float *b, float *x, int m, int n, int p){

	float sum = 0.0f;

	int row, col, k;

	row = blockIdx.y*blockDim.y + threadIdx.y;

	col = blockIdx.x*blockDim.x + threadIdx.x;

	for(k = 0; k < n; ++k){

		sum = sum + (A[n*row + k]*b[p*k + col]);

	}

	x[p*row + col] = sum;

}

void mul_multBlocks(float *A, float *b, float *x, int m, int n, int p, float &time){

	

	// Timing this operation.

	cudaEvent_t start, stop;

	time = 0.0f;

	// Initialize EVENT Timers - CUDA.

	cudaEventCreate(&start); 

	cudaEventCreate(&stop);

	// Variables to be placed on GPU.

	float *a_d = NULL;

	float *b_d = NULL;

	float *x_d = NULL;

	// Allocate memory on device for matrix A (m*n).

	cudaMalloc((void**)&a_d, m*n*sizeof(float));

	// Allocate memory on device for vector b (n*p).

	cudaMalloc((void**)&b_d, n*p*sizeof(float));

	// Allocate memory on device for result vector x (m*p).

	cudaMalloc((void**)&x_d, m*p*sizeof(float));

	// 'a_d' points to matrix A (m*n) on device.

	cudaMemcpy(a_d, A, m*n*sizeof(float), cudaMemcpyHostToDevice);

	// 'b_d' points to vector b (n*p) on device.

	cudaMemcpy(b_d, b, n*p*sizeof(float), cudaMemcpyHostToDevice);

	

	/////////////////////////////////////////////////////////////////////////////
	// If matrix A is of order (32 x 1) and matrix B is of order (1 x 32),
	// then matrix C is of order (32 x 32). For 32x32 = 1024 threads to be
	// generated, the block size chosen is 16x16, so that there are
	// 256 threads per block and hence 4 such blocks in the grid to
	// generate 256x4 = 1024 threads.
	/////////////////////////////////////////////////////////////////////////////

	// In General:

	// number of blocks = (TOTAL_ELEMENTS/NUMBER_OF_THREADS)

	dim3 block(BLOCK_SIZE, BLOCK_SIZE);

	long gRow = (m + block.y - 1)/block.y; 

	long gCol = (p + block.x - 1)/block.x;

	dim3 grid(gCol, gRow);

	//dim3 dimGrid((p + dimBlock.y - 1)/dimBlock.y, (m + dimBlock.x - 1)/dimBlock.x);

	//long gridCol = (p + dimBlock.y - 1)/dimBlock.y + (p%dimBlock.y == 0? 0 : 1);

	//long gridRow = (m + dimBlock.x - 1)/dimBlock.x + (m%dimBlock.x == 0? 0 : 1);

	//long gridRow = p/bDim.x + (p%bDim.x == 0? 0 : 1);

	//long gridCol = m/bDim.y + (m%bDim.y == 0? 0 : 1);

	

	//dim3 gDim(gridCol, gridRow);

	//dim3 dimGrid(gridCol, gridRow);

	printf("Grid Row %d, Grid Col %d\n", gRow, gCol);

	// "Record" the start of this EVENT - i.e. kernel call.

	cudaEventRecord(start, 0);

	// Kernel call.

	mul_multipleblocks<<<grid, block>>>(a_d, b_d, x_d, m, n, p);

	// "Record" the stopping of this EVENT - i.e. return from kernel call.

	cudaEventRecord(stop, 0); 

	cudaEventSynchronize(stop);

	// Get the amount of time elapsed (in milliseconds) and

	// DESTROY the CUDA timer objects.

	cudaEventElapsedTime(&time, start, stop); 

	cudaEventDestroy(start); 

	cudaEventDestroy(stop);

	// Retrieve results into the host vector 'x'.

	cudaMemcpy(x, x_d, m*p*sizeof(float), cudaMemcpyDeviceToHost);

	// Free CUDA memory.

	cudaFree(a_d);		

	cudaFree(b_d);		

	cudaFree(x_d);

}

The indexing scheme for both the matrix and the vectors looks very suspicious. What order is the matrix in and what is the stride of the vectors?
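To spell out what the kernel's indexing implies (my reading, not necessarily the intent): A[n*row + k] treats A as row-major with leading dimension n, and b[p*k + col] treats b as a vector of stride p, which only reduces to a plain contiguous vector when p = 1. As a tiny illustration (rowMajorIndex is just a hypothetical helper, not something in the code above):

// Row-major storage: element (i, j) of an m x n matrix lives at index i*n + j.
inline int rowMajorIndex(int i, int j, int n){ return i*n + j; }

// So in the kernel, A[n*row + k] is A(row, k); and when p == 1,
// b[p*k + col] is simply b[k] and x[p*row + col] is simply x[row].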

Thank you so much for the reply.

You are correct, the code does look a bit problematic, but it is at a nascent stage. Happily, I found the problem - it was staring me in the face :rolleyes: - the dimensions must be a multiple of the defined BLOCK_SIZE (otherwise extra, out-of-range threads are launched). The way it was, whatever "garbage" values happened to occupy the addressed GPU memory were read. For example, the A matrix was 5x5 and the BLOCK_SIZE was 16, which meant 256 threads were launched, 231 of which were not valid - hence the inconsistent results.

To fix the issue I used a simple conditional - it seems to work (although I hesitate to accept the necessity of the __syncthreads() calls in the kernel, since no shared memory is used).
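For reference, the guard itself boils down to something like the following (a minimal sketch using the same kernel signature; the _guarded suffix is only to distinguish it here). The bounds check makes the extra threads do nothing, so the dimensions no longer have to be a multiple of BLOCK_SIZE:

__global__ void mul_multipleblocks_guarded(float *A, float *b, float *x, int m, int n, int p){

	int row = blockIdx.y*blockDim.y + threadIdx.y;
	int col = blockIdx.x*blockDim.x + threadIdx.x;

	// Only threads that map to a real output element do any work;
	// the rest simply exit, so no out-of-bounds reads or writes occur.
	if(row < m && col < p){
		float sum = 0.0f;
		for(int k = 0; k < n; ++k){
			sum += A[n*row + k]*b[p*k + col];
		}
		x[p*row + col] = sum;
	}
	// No __syncthreads() needed here: the kernel uses no shared memory,
	// so there is nothing to synchronize within a block.
}

With a guard like this, the ceiling-division grid calculation from the original code would also work for arbitrary m and p.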

If anyone else is having this same problem I am posting the code.

Once again, thank you very much for the response.

My current solution follows:

#define BLOCK_SIZE  16

__global__ void mul_multipleblocks(float *A, float *b, float *x, int m, int n, int p){

	float sum = 0.0f;

	int row, col, k;

	row = blockIdx.y*blockDim.y + threadIdx.y;

	col = blockIdx.x*blockDim.x + threadIdx.x;

	for(k = 0; k < n; ++k){

		sum = sum + (A[n*row + k]*b[p*k + col]);

	}

	__syncthreads();

	x[p*row + col] = sum;

	__syncthreads();

}

void mul_multBlocks(float *A, float *b, float *x, int m, int n, int p, float &time){

	

	// Timing this operation.

	cudaEvent_t start, stop;

	time = 0.0f;

	// Initialize EVENT Timers - CUDA.

	cudaEventCreate(&start); 

	cudaEventCreate(&stop);

	// Variables to be placed on GPU.

	float *a_d = NULL;

	float *b_d = NULL;

	float *x_d = NULL;

	// Allocate memory on device for matrix A (m*n).

	cudaMalloc((void**)&a_d, m*n*sizeof(float));

	// Allocate memory on device for vector b (n*p).

	cudaMalloc((void**)&b_d, n*p*sizeof(float));

	// Allocate memory on device for result vector x (m*p).

	cudaMalloc((void**)&x_d, m*p*sizeof(float));

	// 'a_d' points to matrix A (m*n) on device.

	cudaMemcpy(a_d, A, m*n*sizeof(float), cudaMemcpyHostToDevice);

	// 'b_d' points to vector b (n*p) on device.

	cudaMemcpy(b_d, b, n*p*sizeof(float), cudaMemcpyHostToDevice);

	// In General:

	// number of blocks = (TOTAL_ELEMENTS/NUMBER_OF_THREADS)

	/////////////////////////////////////////////////////////////////////////////
	// Adjust the block size so that it evenly covers the dimension(s)
	// being manipulated by the Ax=b system.
	/////////////////////////////////////////////////////////////////////////////

	long blocksize = 0;

	if(m < BLOCK_SIZE) {

		blocksize = m;

	}else if((m%BLOCK_SIZE) != 0){

		// Must be a multiple of the block size.

		blocksize = BLOCK_SIZE + 1;

	}else{

		blocksize = BLOCK_SIZE;

	}

	// Set "altered" dimension(s).

	dim3 dimBlock(blocksize, blocksize);

	long gridX = p/dimBlock.x + 1;

	long gridY = m/dimBlock.y + 1;

	dim3 dimGrid(gridX, gridY);

	/////////////////////////////////////////////////////////////////////////////

	/////////////////////////////////////////////////////////////////////////////

	// "Record" the start of this EVENT - i.e. kernel call.

	cudaEventRecord(start, 0);

	// Kernel call.

	mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d, b_d, x_d, m, n, p);

	// "Record" the stopping of this EVENT - i.e. return from kernel call.

	cudaEventRecord(stop, 0); 

	cudaEventSynchronize(stop);

	// Get the amount of time elapsed (in milliseconds) and

	// DESTROY the CUDA timer objects.

	cudaEventElapsedTime(&time, start, stop); 

	cudaEventDestroy(start); 

	cudaEventDestroy(stop);

	// Retrieve results into the host vector 'x'.

	cudaMemcpy(x, x_d, m*p*sizeof(float), cudaMemcpyDeviceToHost);

	// Free CUDA memory.

	cudaFree(a_d);		

	cudaFree(b_d);		

	cudaFree(x_d);

}