Own GEMV implementation failed after some iterations [code included]

The following code fails after some iterations on our FX360M and 8600M GT, but works fine on our 8800 Ultra.

#include <assert.h>

#include <cutil_inline.h>

#ifdef __DEVICE_EMULATION__

	#define UIMUL(a, b)  (((size_t)(a)) * ((size_t)(b)))

	#define EMUSYNC	  __syncthreads()

	#define THREADS	  128

#else

	#define UIMUL(a, b)  __umul24(a, b)

	#define EMUSYNC

	#define THREADS	  128

#endif

template<size_t blockSize>

__global__ void

gemv_kernel(float const * src1, size_t const strideSrc1,

			size_t const rows, size_t const cols,

			float const * src2, float const * src3, float * dst,

			float const alpha, float const beta)

{

	__shared__ float sdata[blockSize];

	float  const * const row = src1 + UIMUL(blockIdx.x, strideSrc1);

	size_t const		 tid = threadIdx.x;

	// Reduce multiple elements per thread

	sdata[tid] = 0;

	for (size_t i = tid; i < cols; i += blockSize)

		sdata[tid] += row[i] * src2[i];

	__syncthreads();

	// Do reduction in shared memory

	if (blockSize >= 512) {

		if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();

	}

	if (blockSize >= 256) {

		if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();

	}

	if (blockSize >= 128) {

		if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads();

	}

#ifndef __DEVICE_EMULATION__

	if (tid < 32)

#endif

	{

		if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }

		if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }

		if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; EMUSYNC; }

		if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; EMUSYNC; }

		if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; EMUSYNC; }

		if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; EMUSYNC; }

	}

	// Write result to global memory

	if (tid == 0) {

		dst[blockIdx.x] = alpha * sdata[0] + beta * src3[blockIdx.x];

	}

}

void gemv(float const * src1, size_t strideSrc1,

		  size_t rowsSrc1, size_t colsSrc1,

		  float const * src2, size_t lenSrc2,

		  float const * src3, size_t lenSrc3,

		  float * dst)

{

	assert(src1 && strideSrc1 >= colsSrc1);

	assert(rowsSrc1 > 0 && colsSrc1 > 0);

	assert(src2 && lenSrc2 == colsSrc1);

	assert(src3 && lenSrc3 == rowsSrc1);

	assert(dst);

	gemv_kernel<THREADS><<<rowsSrc1, THREADS>>>

		(src1, strideSrc1, rowsSrc1, colsSrc1, src2, src3, dst, 1, 1);

	cudaError err = cudaGetLastError();

	if (err != cudaSuccess) {

		printf("Kernel call 'gemv_kernel' failed [%s].\n",

			cudaGetErrorString(err));

		exit(EXIT_FAILURE);

	}

	err = cudaThreadSynchronize();

	if (err != cudaSuccess) {

		printf("Kernel call 'gemv_kernel' failed [%s].\n",

			cudaGetErrorString(err));

		exit(EXIT_FAILURE);

	}

}

void randInit(float * ptr, size_t const len)

{

	for (size_t i = 0; i < len; ++i)

		ptr[i] = rand() / (float) RAND_MAX;

}

int main(int argc, char** argv)

{

	size_t const rows = 300, cols = 100;

	

	float *h_A = (float*) malloc(rows * cols * sizeof(float)), 

		  *h_x = (float*) malloc(cols * sizeof(float)),

		  *h_y = (float*) malloc(rows * sizeof(float)),

		  *h_z = (float*) malloc(rows * sizeof(float));

	srand((unsigned int)time(NULL));

	randInit(h_A, rows * cols);

	randInit(h_x, cols);

	randInit(h_y, rows);

	randInit(h_z, rows);

	size_t d_strideA = 0;

	float *d_A = NULL, *d_x = NULL, *d_y = NULL, *d_z = NULL;

	cutilSafeCall(cudaMallocPitch((void**)&d_A, &d_strideA, cols * sizeof(float), rows));

	d_strideA /= sizeof(float);

	cutilSafeCall(cudaMalloc((void**)&d_x, cols * sizeof(float)));

	cutilSafeCall(cudaMalloc((void**)&d_y, rows * sizeof(float)));

	cutilSafeCall(cudaMalloc((void**)&d_z, rows * sizeof(float)));

	

	cutilSafeCall(cudaMemcpy2D(

		d_A, d_strideA * sizeof(float), h_A, cols * sizeof(float),

		cols * sizeof(float), rows, cudaMemcpyHostToDevice));

	cutilSafeCall(cudaMemcpy(d_x, h_x, cols * sizeof(float), cudaMemcpyHostToDevice));

	cutilSafeCall(cudaMemcpy(d_y, h_y, rows * sizeof(float), cudaMemcpyHostToDevice));

	cutilSafeCall(cudaMemcpy(d_z, h_z, rows * sizeof(float), cudaMemcpyHostToDevice));

	for (size_t k = 1; k <= 100; ++k)

	{

		printf("try %03d ... ", k);

		for (size_t i = 1; i <= (rows * cols); ++i)

		{

			gemv(d_A, d_strideA, rows, cols, d_x, cols, d_y, rows, d_z);

			gemv(d_A, d_strideA, rows, cols, d_x, cols, d_y, rows, d_z);

		}

		printf("OK\n");

	}

	cutilSafeCall(cudaFree(d_A));

	cutilSafeCall(cudaFree(d_x));

	cutilSafeCall(cudaFree(d_y));

	cutilSafeCall(cudaFree(d_z));

	free(h_A);

	free(h_x);

	free(h_y);

	free(h_z);

}

When I start this code on our Quadro FX360M I got the following result:

try 001 ... OK

try 002 ... OK

...

try 068 ... OK

try 069 ... Kernel call 'gemv_kernel' failed [unspecified launch failure].

It’s not realy deterministic when the error occurs, but it occurs after some iterations.

Does anyone have an idea what’s going wrong?

Additional info: When the error occurs the notebook display flickers for a short moment!

Thanks and regards,

Daniel.

I am using the newest CUDA version:

CUDA 2.2
CUDA Driver: NVIDIA Notebook Driver for Windows XP 32-bit 185.85
CUDA Toolkit: CUDA Toolkit 2.2 for Windows XP 32-bit
CUDA SDK: CUDA SDK 2.2 code samples for Windows XP 32-bit

The error occurs under Windows and Linux!

When adding the statement “if (blockIdx.x >= rows) return;” on top of the kernel function it seems to work:

template<size_t blockSize>

__global__ void

gemv_kernel(float const * src1, size_t const strideSrc1,

			size_t const rows, size_t const cols,

			float const * src2, float const * src3, float * dst,

			float const alpha, float const beta)

{

	if (blockIdx.x >= rows) return;

	[...]

}

I am only launching number of blocks thats equal to number of rows, so what’s going wrong (blockIdx.x should NEVER be greater or equal to rows)?

I see nothing wrong with the kernel function (without the “if (blockIdx.x >= rows) return;” statement), memory allocation or kernel launch.

Is this a BUG in CUDA? (on the Quadro FX360M and Geforce 8600M GT hardware)

I have the same problem. In my case I have implemented a kernel which writes 39 results from iterating 1000 times and I have set cuda to run more or less 65000 threads. Everything looks working fine, but I opened windows media and I noted very short display flickers. If I start to open more and more applications (i.e. loading the GPU with display tasks) it is possible to freeze the computer. I have not tried your solution but does not make sense for me how you fixed the problem.

It also doesn’t make sense to me, but adding the statement change the behaviour and now it seems to work … really strange!