Kernel failing on repeated invocation (thousands of times)

Either I did something really stupid ...

I tried to offload some unit testing to the GPU to be able to test a wider parameter range in a reasonable time. To that end I wrote a kernel that implements nothing more than a big fancy OR over a huge array of true/false values: for each block it is supposed to return the index of the first true value within that block. For testing purposes there is a second kernel, run first, which sets exactly one index in the original array to true.

This kernel works fine when run a few times. However, when it is run hundreds of times it suddenly starts to return wrong results. To be precise, blocks that are not handling index 0 claim to have found index 0 to be true.

I tested this on both CUDA 2.0 and the 2.2 beta, on a GTX 280 and on Tesla S1070s (different systems for 2.0 and the 2.2 beta). All of them expose the error; in fact the GTX 280 seems to be a little more robust with respect to this specific error.

The attached code runs the kernel over the same set of parameters up to 100,000 times. The Teslas usually fail very quickly, although at varying iteration counts. The GTX 280 often makes it to around 80,000 iterations, though quicker failures occur as well.

/*
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License version 2 as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public License
   along with this library; see the file COPYING.LIB.  If not, write to
   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.
*/

#include <cutil_inline.h>
//#include <iostream>

__global__ void seedErrorsKernel( int seed, int* g_errors, int size )
{
    int thread = blockDim.x * blockIdx.x + threadIdx.x;
    if( thread < size )
    {
        g_errors[ thread ] = ( seed == thread );
    }
}

void seedErrors( int seed, int* d_errors, int size )
{
    dim3 threads( 256 );
    dim3 blocks( static_cast<int>( ceilf( static_cast<float>( size ) / static_cast<float>( threads.x ) ) ) );
    seedErrorsKernel<<< blocks, threads >>>( seed, d_errors, size );
    CUDA_SAFE_CALL( cudaGetLastError() );
}

/**
 * Buffer for use in the reduction by the FirstErrorsPerBlock-Kernel
 */
extern __shared__ int buffer[];

__global__ void getFirstErrorsPerBlock( int* d_errors, int size, int* d_idx )
{
    int idx = 2 * blockDim.x * blockIdx.x + threadIdx.x;
    int cand1 = ( idx              < size ) ? d_errors[ idx              ] : 0;
    int cand2 = ( idx + blockDim.x < size ) ? d_errors[ idx + blockDim.x ] : 0;
    if( cand1 != 0 )
    {
        buffer[ threadIdx.x ] = idx;
    }
    else if( cand2 != 0 )
    {
        buffer[ threadIdx.x ] = idx + blockDim.x;
    }
    else
    {
        buffer[ threadIdx.x ] = -1;
    }
    for( int stride = blockDim.x / 2; stride >= 1; stride /= 2 )
    {
        if( threadIdx.x < stride )
        {
            cand1 = buffer[ threadIdx.x ];
            cand2 = ( threadIdx.x + stride < blockDim.x ) ? buffer[ threadIdx.x + stride ] : -1;
            if( cand1 == -1 || ( cand2 != -1 && cand1 > cand2 ) )
            {
                buffer[ threadIdx.x ] = buffer[ threadIdx.x + stride ];
            }
        }
    }
    if( threadIdx.x == 0 )
    {
        d_idx[ blockIdx.x ] = buffer[ threadIdx.x ];
    }
}

int cudaTestGetFirstError( int* d_errors, int size )
{
    int* d_idx, *h_idx;
    dim3 blockDim = 64;
    dim3 gridDim = static_cast<int>( ceilf( static_cast<float>( size ) / static_cast<float>( blockDim.x ) / 2.0f ) );
    size_t shared = blockDim.x * sizeof( int );

    CUDA_SAFE_CALL( cudaMalloc( reinterpret_cast<void**>( &d_idx ), gridDim.x * sizeof( int ) ) );
    getFirstErrorsPerBlock<<< gridDim, blockDim, shared >>>( d_errors, size, d_idx );
    CUDA_SAFE_CALL( cudaGetLastError() );

//  h_idx = (int*) malloc( gridDim.x * sizeof( int ) );
    CUDA_SAFE_CALL( cudaMallocHost( reinterpret_cast<void**>( &h_idx ), gridDim.x * sizeof( int ) ) );
    CUDA_SAFE_CALL( cudaMemcpy( h_idx, d_idx, gridDim.x * sizeof( int ), cudaMemcpyDeviceToHost ) );
    CUDA_SAFE_CALL( cudaFree( d_idx ) );

    int firstErr = -1;
    for( int i = 0; i < gridDim.x; ++i )
    {
        if( h_idx[ i ] != -1 )
        {
            // comment this to have less noise
            printf( "Error found for thread %i of %i in block %i of %i\n", h_idx[ i ], size, i, gridDim.x );
            firstErr = h_idx[ i ];
            break;
        }
    }

//  free( h_idx );
    CUDA_SAFE_CALL( cudaFreeHost( h_idx ) );
    return firstErr;
}

void runTest( int argc, char** argv )
{
    if( cutCheckCmdLineFlag( argc, (const char**)argv, "device" ) )
        cutilDeviceInit( argc, argv );
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int* d_playground;
    int size = 40000;
    CUDA_SAFE_CALL( cudaMalloc( reinterpret_cast<void**>( &d_playground ), size * sizeof( int ) ) );

    int seed = 10080;
    for( int i = 0; i < 1000000; ++i )
    {
        printf( "Try %i: ", i );
        seedErrors( seed, d_playground, size );
        int res = cudaTestGetFirstError( d_playground, size );
        if( res != seed )
        {
            printf( "Failed on try %i. %i != %i\n", i, res, seed );
            break;
        }
    }
    CUDA_SAFE_CALL( cudaFree( d_playground ) );
}

int main( int argc, char** argv )
{
    runTest( argc, argv );
    cutilExit( argc, argv );
}

This code can be compiled within the CUDA SDK 2.2 beta. I would have loved to attach it as a tarball, but the forum wouldn't let me.

Sorry, my mistake, I was missing some __syncthreads() calls … how embarrassing.
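For anyone stumbling over this thread later, here is a minimal sketch of the repaired reduction kernel, assuming the only change needed is the two barriers (the name getFirstErrorsPerBlockFixed is only for illustration; it still uses the extern shared buffer declared above). Because __syncthreads() has to be reached by every thread of the block, both barriers sit outside the if( threadIdx.x < stride ) branch: one after the shared buffer is initialised and one at the end of each reduction round.

__global__ void getFirstErrorsPerBlockFixed( int* d_errors, int size, int* d_idx )
{
    int idx = 2 * blockDim.x * blockIdx.x + threadIdx.x;
    int cand1 = ( idx              < size ) ? d_errors[ idx              ] : 0;
    int cand2 = ( idx + blockDim.x < size ) ? d_errors[ idx + blockDim.x ] : 0;
    if( cand1 != 0 )
    {
        buffer[ threadIdx.x ] = idx;
    }
    else if( cand2 != 0 )
    {
        buffer[ threadIdx.x ] = idx + blockDim.x;
    }
    else
    {
        buffer[ threadIdx.x ] = -1;
    }
    // barrier: every thread's initial value must be visible before the reduction starts
    __syncthreads();
    for( int stride = blockDim.x / 2; stride >= 1; stride /= 2 )
    {
        if( threadIdx.x < stride )
        {
            cand1 = buffer[ threadIdx.x ];
            cand2 = ( threadIdx.x + stride < blockDim.x ) ? buffer[ threadIdx.x + stride ] : -1;
            if( cand1 == -1 || ( cand2 != -1 && cand1 > cand2 ) )
            {
                buffer[ threadIdx.x ] = buffer[ threadIdx.x + stride ];
            }
        }
        // barrier: wait until this round's writes are done before the next round reads them
        __syncthreads();
    }
    if( threadIdx.x == 0 )
    {
        d_idx[ blockIdx.x ] = buffer[ threadIdx.x ];
    }
}

The launch configuration stays the same as in cudaTestGetFirstError: blockDim.x threads per block and blockDim.x * sizeof( int ) bytes of dynamic shared memory.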