Problem with memory while running on Fermi


I have a problem with running CUDA C program on GTX 580.

My code works well on GTX 260, GTX 275, 8400GS with any given value of threadsPerBlock, but when I run it on Fermi (GTX 580) with threadsPerBlock > 1 it behaves weirdly. Global memory (on the device) pointed to by matRowPtr, matColIndex, matVal and vecVal (see code below) is modified during the kernel call. I check the memory just before and after running the kernel and several dozen values are different (array sizes: matRowPtr, vecVal - 15k; matColIndex, matVal - 50k), although they should be constant. As I’ve mentioned, there is no problem when threadsPerBlock is set to 1.

When I run the program under cuda-memcheck, the kernel returns the error “unspecified launch failure” and cuda-memcheck reports “Address 0x04623b10 is out of bounds”, but always in a different thread/block and with a different address, so this information isn’t very valuable.

I really don’t know what is going on, because on other devices it works great, and as far as I know new devices should be backward compatible.

Can you help me with this problem?

Here’s my code:

// Sparse matrix-vector product in CRS/CSR layout: out[row] = sum over the
// row's nonzeros of mat[i] * vec[colIndex[i]].
//
// Expects a 1D launch with at least `size` threads total (one thread per
// matrix row); excess threads exit via the bounds check.
//   size     - number of matrix rows (and length of `out`)
//   rowPtr   - size+1 entries; rowPtr[r]..rowPtr[r+1]-1 index row r's nonzeros
//   colIndex - column index of each stored nonzero
//   mat      - value of each stored nonzero
//   vec      - dense input vector
//   out      - dense output vector, written once per row
//
// NOTE(review): nothing here validates colIndex[i] against the length of
// `vec`, nor rowPtr entries against the nonzero count. A corrupt/out-of-range
// entry in the host-side CRS arrays would produce exactly the symptom
// described above ("Address ... is out of bounds" at varying addresses) —
// older GPUs often tolerated such out-of-bounds reads silently, Fermi faults.
// Verify the CRS data on the host before launch.
__global__ void kernel(int size, const int* rowPtr, const int* colIndex, const float* mat, const float* vec, float* out)
{
	int row = blockDim.x * blockIdx.x + threadIdx.x;

	if (row < size) {
		float dot = 0.0f;

		// Accumulate this row's contribution: half-open range
		// [rowPtr[row], rowPtr[row+1]).
		for (int i = rowPtr[row]; i < rowPtr[row+1]; ++i) {
			dot += mat[i] * vec[colIndex[i]];
		}

		out[row] = dot;
	}
}

// Host wrapper: computes out = mat * vec on the GPU for a CRS-format sparse
// matrix. Uploads the matrix/vector, launches `kernel` with one thread per
// row, and copies the result back. (Body is partially elided with /* ... */
// in this excerpt — allocation cleanup etc. may live in the hidden parts.)
void mul(const MatrixCRS& mat, const Vector& vec, Vector& out, int threadsPerBlock) {

	/* ... */

	// Device-side copies of the CRS arrays, input vector, and output vector.
	int* matRowPtr;

	float* matVal;

	int* matColIndex;

	float* vecVal;

	float* outVal;

	// CRS convention: the last row-pointer entry is the total nonzero count.
	int nbValues = mat.GetRowPtrArray()[mat.GetRows()];

	int outSize = mat.GetRows();

	// rowPtr needs rows+1 entries.
	CudaCheckReturn( cudaMalloc((void**) &matRowPtr, sizeof(int)*(mat.GetRows()+1)) );

	CudaCheckReturn( cudaMalloc((void**) &matColIndex, sizeof(int)*nbValues) );

	CudaCheckReturn( cudaMalloc((void**) &matVal, sizeof(float)*nbValues) );

	CudaCheckReturn( cudaMalloc((void**) &vecVal, sizeof(float)*vec.GetSize()) );

	CudaCheckReturn( cudaMalloc((void**) &outVal, sizeof(float)*outSize) );

	// Upload matrix and vector data; sizes mirror the allocations above.
	CudaCheckReturn( cudaMemcpy(matRowPtr, mat.GetRowPtrArray(), sizeof(int)*(mat.GetRows()+1), cudaMemcpyHostToDevice) );

	CudaCheckReturn( cudaMemcpy(matColIndex, mat.GetColIndexArray(), sizeof(int)*nbValues, cudaMemcpyHostToDevice) );

	CudaCheckReturn( cudaMemcpy(matVal, mat.GetValArray(), sizeof(float)*nbValues, cudaMemcpyHostToDevice) );

	CudaCheckReturn( cudaMemcpy(vecVal, vec.GetValArray(), sizeof(float)*vec.GetSize(), cudaMemcpyHostToDevice) );

	// Ceiling division so every output row gets a thread.
	int numBlocks = (outSize + threadsPerBlock - 1) / threadsPerBlock;

	kernel <<<numBlocks, threadsPerBlock>>> (outSize, matRowPtr, matColIndex, matVal, vecVal, outVal);

	// NOTE(review): CudaCheckError's definition is not visible here; unless it
	// synchronizes (or checks again after a sync), it only catches
	// launch-configuration errors, not asynchronous in-kernel faults. The
	// blocking cudaMemcpy below will surface those instead.
	CudaCheckError("kernel fails");


	// Blocking copy: implicitly waits for the kernel to finish.
	CudaCheckReturn( cudaMemcpy(out.GetPtr(), outVal, sizeof(float)*outSize, cudaMemcpyDeviceToHost) );


	// Presumably the elided tail cudaFree's the five allocations above —
	// verify; otherwise every call leaks device memory.
	/* ... */