I am running a symmetric sparse matrix - vector multiplication code on CUDA. The program output seems to be very inconsistent. I see correct output
several times in succession but then all of a sudden it start spitting different numbers. Sometimes i notice the successive runs keep adding results from previous runs. I am compiling from inside VISUAL STUDIO 2010 and i have GeForce GTX 560 Ti installed. Can anyone see what i am missing which may be causing the problem?
Thanks,
White_Noise
//The following implementation is for symmetric sparse matrix (stored in CSR format) - vector multiplication//
global void SparseMatrixVectorMultiplication1(float *values, int *ia, int *ja, float *x, float *result, int size)
{
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
while(tid < size)
{
int j;
for( j = ia[tid]; j < ia[tid+1]; j++ )
{
result[tid] += values[ia[tid] + ja[j] - tid] * x[ja[j]];
}
for( j = 0; j < tid; j++)
{
//see if tid is there in between ja[ia[j]] and ja[ia[j+1]-1]//
//if yes then result[tid] += values[ia[j] + tid - j] * x[j];
if( tid > ja[ia[j]] && tid <= ja[ia[j+1]-1] )
{
int k;
for( k = ia[j]; k < ia[j+1]; k++ )
{
if(ja[k] == tid)
{
result[tid] += values[ia[j] + tid - j] * x[j];
break;
}
else if(ja[k] > tid)
break;
}
}
}
tid += blockDim.x * gridDim.x;
}
}
void RunTestCase2()
{
int NSize = 1000;
float values_gpu;
int ia_gpu;
int ja_gpu;
float x_gpu, res_gpu;
HANDLE_ERROR( cudaMalloc((void)&values_gpu,sizevalsizeof(float)) );
HANDLE_ERROR( cudaMalloc((void)&ia_gpu,sizeiasizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&ja_gpu,sizejasizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&x_gpu,NSizesizeof(float)) );
HANDLE_ERROR( cudaMalloc((void**)&res_gpu,NSize2*sizeof(float)) );
HANDLE_ERROR( cudaMemcpy(values_gpu, values, sizeval*sizeof(float), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(ia_gpu, ia, sizeia*sizeof(int), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(ja_gpu, ja, sizeja*sizeof(int), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(x_gpu, X.GetVector(), NSize*sizeof(float), cudaMemcpyHostToDevice) );
dim3 dimBlock; dimBlock.x = 64;
dim3 dimGrid; dimGrid.x = min((NSize/dimBlock.x) + 1,(int) MaxBlocks);
SparseMatrixVectorMultiplication1<<<dimGrid,dimBlock>>>(values_gpu, ia_gpu, ja_gpu, x_gpu, res_gpu, NSize);
HANDLE_ERROR( cudaMemcpy(Result2.GetVector(), res_gpu, NSize*sizeof(float), cudaMemcpyDeviceToHost) );
HANDLE_ERROR( cudaFree(values_gpu));
HANDLE_ERROR( cudaFree(ia_gpu));
HANDLE_ERROR( cudaFree(ja_gpu));
HANDLE_ERROR( cudaFree(x_gpu));
HANDLE_ERROR( cudaFree(res_gpu));
}