Okay, this is weird.
I am trying to implement a general matrix-vector multiplication kernel (Ax = b where A is 2-D matrix and x, b are 1-D vectors) with CUDA and the results keep changing. I wanted to test the kernel with a small matrix-vector system then move up to larger dimensions. The input A matrix is 5x5 and contains non-zero elements A[0][0] = 2.1, A[1][1] = 1.2, A[2][2] = 3.1, A[3][3] = 1.3, A[4][4] = 4.1. The input b-vector is 5x1 and contains non-zero elements b[0] = 1, b[1] = .5, b[2] = .5, b[3] = .5, b[4] = 1. The resulting solution x-vector is x = 2.1, 0.6, 1.55, 0.65, and 4.1. Sometimes I get x = 2.1, 0.6, 1.55, 0.65, and 3.1 as a result OR 2.1, 0.6, 0.6, 0.6, 4.1 and once in a while I will actually get the correct answer.
The key is that it is never consistent, has anyone seen this before? Please help if so.
Many thanks to anyone with ANY hints/ideas.
The code follows:
#define BLOCK_SIZE 16
__global__ void mul_multipleblocks(float *A, float *b, float *x, int m, int n, int p){
float sum = 0.0f;
int row, col, k;
row = blockIdx.y*blockDim.y + threadIdx.y;
col = blockIdx.x*blockDim.x + threadIdx.x;
for(k = 0; k < n; ++k){
sum = sum + (A[n*row + k]*b[p*k + col]);
}
x[p*row + col] = sum;
}
void mul_multBlocks(float *A, float *b, float *x, int m, int n, int p, float &time){
// Timing this operation.
cudaEvent_t start, stop;
time = 0.0f;
// Initialize EVENT Timers - CUDA.
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Variables to be placed on GPU.
float *a_d = NULL;
float *b_d = NULL;
float *x_d = NULL;
// Allocate memory on device for Matrix A = m*n
cudaMalloc((void**)&a_d, m*n*sizeof(int));
// Allocate memory on device for Matrix B = p*q
cudaMalloc((void**)&b_d, n*p*sizeof(int));
// Allocate memory on device for Matrix X = m*q
cudaMalloc((void**)&x_d, m*p*sizeof(int));
// 'a_d' points to matrix A (m*n) on device.
cudaMemcpy(a_d, A, m*n*sizeof(int), cudaMemcpyHostToDevice);
// 'b_d' points to matrix B (p*q) on device.
cudaMemcpy(b_d, b, n*p*sizeof(int), cudaMemcpyHostToDevice);
/////////////////////////////////////////////////////////////////////////////
// If Matrix A is of order (32 x 1) and Matrix B of (1 x 32) order, //
// then Matrix C is of order (32 x 32). For 32x32=1024 threads to be //
// generated, the block size chosen is 16x16, such that there are //
// 256 threads per block and hence, 4 such blocks in the grid to //
// generate 256x4 = 1024 threads. //
/////////////////////////////////////////////////////////////////////////////
// In General:
// number of blocks = (TOTAL_ELEMENTS/NUMBER_OF_THREADS)
dim3 block(BLOCK_SIZE, BLOCK_SIZE);
long gRow = (m + block.y - 1)/block.y;
long gCol = (p + block.x - 1)/block.x;
dim3 grid(gCol, gRow);
//dim3 dimGrid((p + dimBlock.y - 1)/dimBlock.y, (m + dimBlock.x - 1)/dimBlock.x);
//long gridCol = (p + dimBlock.y - 1)/dimBlock.y + (p%dimBlock.y == 0? 0 : 1);
//long gridRow = (m + dimBlock.x - 1)/dimBlock.x + (m%dimBlock.x == 0? 0 : 1);
//long gridRow = p/bDim.x + (p%bDim.x == 0? 0 : 1);
//long gridCol = m/bDim.y + (m%bDim.y == 0? 0 : 1);
//dim3 gDim(gridCol, gridRow);
//dim3 dimGrid(gridCol, gridRow);
printf("Grid Row %d, Grid Col %d\n", gRow, gCol);
// "Record" the start of this EVENT - i.e. kernel call.
cudaEventRecord(start, 0);
// Kernel call.
mul_multipleblocks<<<grid, block>>>(a_d, b_d, x_d, m, n, p);
// "Record" the stopping of this EVENT - i.e. return from kernel call.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
// Get the amount of time elapsed (in milliseconds) and
// DESTROY the CUDA timer objects.
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Retrieve results pointed by 'c_h'
cudaMemcpy(x, x_d, m*p*sizeof(int), cudaMemcpyDeviceToHost);
// Free CUDA memory.
cudaFree(a_d);
cudaFree(b_d);
cudaFree(x_d);
}