Hi,
I have a problem running a CUDA C program on a GTX 580.
My code works well on a GTX 260, GTX 275 and 8400 GS with any value of threadsPerBlock, but when I run it on Fermi (GTX 580) with threadsPerBlock > 1 it behaves strangely. The global device memory pointed to by matRowPtr, matColIndex, matVal and vecVal (see the code below) gets modified during the kernel call. I test the memory just before and just after running the kernel, and several dozen values differ (array sizes: matRowPtr and vecVal ~15k; matColIndex and matVal ~50k), even though they should stay constant — the kernel only reads them. As I’ve mentioned, there is no problem when threadsPerBlock is set to 1.
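Roughly, my before/after test looks like this (checkArray here is a simplified stand-in for what I actually do, not a function from my real code):

#include <cstdio>
#include <cuda_runtime.h>

// copy a device array back to the host and count values that changed
template <typename T>
void checkArray(const T* dev, const T* ref, int n)
{
    T* host = new T[n];
    cudaMemcpy(host, dev, sizeof(T)*n, cudaMemcpyDeviceToHost);
    int diffs = 0;
    for (int i = 0; i < n; ++i)
        if (host[i] != ref[i]) ++diffs;
    if (diffs > 0)
        printf("%d of %d values differ\n", diffs, n);
    delete[] host;
}

Called on all four arrays right before the launch it reports nothing; called right after the launch it reports the several dozen differences mentioned above.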
When I run the program under cuda-memcheck, the kernel returns the error “unspecified launch failure” and cuda-memcheck reports “Address 0x04623b10 is out of bounds”, but each time in a different thread/block and with a different address, so this information isn’t very helpful.
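For completeness, I run it with no extra options (the binary name below is just a placeholder):

cuda-memcheck ./program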
I really don’t know what is going on, because it works fine on the other devices, and as far as I know newer devices should be backward compatible.
Can you help me with this problem?
Here’s my code:
__global__ void kernel(int size, const int* rowPtr, const int* colIndex, const float* mat, const float* vec, float* out)
{
    // one thread per matrix row: dot product of the sparse row with vec
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < size) {
        float dot = 0;
        for (int i = rowPtr[row]; i < rowPtr[row+1]; ++i) {
            dot += mat[i] * vec[colIndex[i]];
        }
        out[row] = dot;
    }
}
void mul(const MatrixCRS& mat, const Vector& vec, Vector& out, int threadsPerBlock) {
    /* ... */
    int* matRowPtr;
    float* matVal;
    int* matColIndex;
    float* vecVal;
    float* outVal;
    int nbValues = mat.GetRowPtrArray()[mat.GetRows()];  // number of non-zeros
    int outSize = mat.GetRows();

    // allocate device buffers for the CRS matrix, the input vector and the result
    CudaCheckReturn( cudaMalloc((void**) &matRowPtr, sizeof(int)*(mat.GetRows()+1)) );
    CudaCheckReturn( cudaMalloc((void**) &matColIndex, sizeof(int)*nbValues) );
    CudaCheckReturn( cudaMalloc((void**) &matVal, sizeof(float)*nbValues) );
    CudaCheckReturn( cudaMalloc((void**) &vecVal, sizeof(float)*vec.GetSize()) );
    CudaCheckReturn( cudaMalloc((void**) &outVal, sizeof(float)*outSize) );

    // copy the host data to the device
    CudaCheckReturn( cudaMemcpy(matRowPtr, mat.GetRowPtrArray(), sizeof(int)*(mat.GetRows()+1), cudaMemcpyHostToDevice) );
    CudaCheckReturn( cudaMemcpy(matColIndex, mat.GetColIndexArray(), sizeof(int)*nbValues, cudaMemcpyHostToDevice) );
    CudaCheckReturn( cudaMemcpy(matVal, mat.GetValArray(), sizeof(float)*nbValues, cudaMemcpyHostToDevice) );
    CudaCheckReturn( cudaMemcpy(vecVal, vec.GetValArray(), sizeof(float)*vec.GetSize(), cudaMemcpyHostToDevice) );

    // round up so that every row gets a thread
    int numBlocks = (outSize + threadsPerBlock - 1) / threadsPerBlock;
    kernel<<<numBlocks, threadsPerBlock>>>(outSize, matRowPtr, matColIndex, matVal, vecVal, outVal);
    CudaCheckError("kernel fails");

    out.SetSize(outSize);
    CudaCheckReturn( cudaMemcpy(out.GetPtr(), outVal, sizeof(float)*outSize, cudaMemcpyDeviceToHost) );
    /* ... */
}
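In case it matters: CudaCheckReturn and CudaCheckError are just thin error-checking wrappers, roughly along these lines (simplified for this post; the real ones do the same checks):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// abort with a message if a runtime API call fails
#define CudaCheckReturn(call)                                   \
    do {                                                        \
        cudaError_t err_ = (call);                              \
        if (err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error: %s\n",                 \
                    cudaGetErrorString(err_));                  \
            exit(EXIT_FAILURE);                                 \
        }                                                       \
    } while (0)

// wait for the kernel to finish, then pick up any launch error
void CudaCheckError(const char* msg)
{
    cudaThreadSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}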