Inconsistent output using CUDA

I am running a symmetric sparse matrix - vector multiplication code on CUDA. The program output seems to be very inconsistent. I see correct output
several times in succession but then all of a sudden it start spitting different numbers. Sometimes i notice the successive runs keep adding results from previous runs. I am compiling from inside VISUAL STUDIO 2010 and i have GeForce GTX 560 Ti installed. Can anyone see what i am missing which may be causing the problem?
Thanks,
White_Noise

//The following implementation is for symmetric sparse matrix (stored in CSR format) - vector multiplication//
global void SparseMatrixVectorMultiplication1(float *values, int *ia, int *ja, float *x, float *result, int size)
{

unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

while(tid < size)
{
	int j;
	for( j = ia[tid]; j < ia[tid+1]; j++ )
	{
		result[tid] += values[ia[tid] + ja[j] - tid] * x[ja[j]];
	}
	for( j = 0; j < tid; j++)
	{
		//see if tid is there in between ja[ia[j]] and ja[ia[j+1]-1]//
		//if yes then result[tid] += values[ia[j] + tid - j] * x[j];
		if( tid > ja[ia[j]] && tid <= ja[ia[j+1]-1] )
		{
			int k;
			for( k = ia[j]; k < ia[j+1]; k++ )
			{
				if(ja[k] == tid)
				{
					result[tid] += values[ia[j] + tid - j] * x[j];
					break;
				}
				else if(ja[k] > tid)
					break;
			}
		}
	}
	tid += blockDim.x * gridDim.x;
}

}

void RunTestCase2()
{
int NSize = 1000;
float values_gpu;
int ia_gpu;
int ja_gpu;
float x_gpu, res_gpu;
HANDLE_ERROR( cudaMalloc((void
)&values_gpu,sizeval
sizeof(float)) );
HANDLE_ERROR( cudaMalloc((void
)&ia_gpu,sizeiasizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&ja_gpu,sizeja
sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&x_gpu,NSizesizeof(float)) );
HANDLE_ERROR( cudaMalloc((void**)&res_gpu,NSize
2*sizeof(float)) );

HANDLE_ERROR( cudaMemcpy(values_gpu, values, sizeval*sizeof(float), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(ia_gpu, ia, sizeia*sizeof(int), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(ja_gpu, ja, sizeja*sizeof(int), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(x_gpu, X.GetVector(), NSize*sizeof(float), cudaMemcpyHostToDevice) );

dim3 dimBlock; dimBlock.x = 64;
dim3 dimGrid; dimGrid.x = min((NSize/dimBlock.x) + 1,(int) MaxBlocks);
SparseMatrixVectorMultiplication1<<<dimGrid,dimBlock>>>(values_gpu, ia_gpu, ja_gpu, x_gpu, res_gpu, NSize);
HANDLE_ERROR( cudaMemcpy(Result2.GetVector(), res_gpu, NSize*sizeof(float), cudaMemcpyDeviceToHost) );
HANDLE_ERROR( cudaFree(values_gpu));
HANDLE_ERROR( cudaFree(ia_gpu));
HANDLE_ERROR( cudaFree(ja_gpu));
HANDLE_ERROR( cudaFree(x_gpu));
HANDLE_ERROR( cudaFree(res_gpu));

}

result[tid] += values[ia[tid] + ja[j] - tid] * x[ja[j]];

result seems was not initialized, malloc probablyn does not fill memory with zeroes.

thank you! it solved my problem