bug in cuda-memcheck

I have to pass shared memory to a device function as a parameter - my test program is posted below.
When I call the function directly from the kernel (the “usual” way), I get an error in cuda-memcheck.
If I call the function from another device function (using that one as a kind of wrapper), I don’t get this error message.
Is this a bug, or am I doing something wrong in my usage of shared memory?

Test program

// Problem size for the test: the program works on BLOCKS * BLOCK_SIZE ints
// (this product sizes the host/device allocations and is the copy bound
// passed to the device function).
// NOTE(review): presumably BLOCK_SIZE is the number of threads per block and
// BLOCKS the number of blocks in the grid - confirm against the kernel launch.
#define BLOCK_SIZE 8
#define BLOCKS 4

/*
 * Copies one int per thread from aiSharedMem to aiGlobalMem.
 *
 * aiGlobalMem   destination, indexed by the thread's flat 1D grid index
 * aiSharedMem   source, indexed by threadIdx.x (the caller in this file
 *               passes an extern __shared__ array)
 * iArrayLength  number of valid elements in aiGlobalMem; threads whose flat
 *               index is at or beyond this value do not copy anything
 *
 * Ends with a block-wide barrier, so every thread of the block must call it.
 */
__inline__ __device__ void FuncCopyFromShared( int *aiGlobalMem, volatile int *aiSharedMem, int iArrayLength )
{
	// Flat position of this thread across the whole 1D grid.
	const int iGlobalPos = blockIdx.x * blockDim.x + threadIdx.x;
	const bool bInRange = iGlobalPos < iArrayLength;

	if( bInRange )
	{
		const int iValue = aiSharedMem[ threadIdx.x ];
		aiGlobalMem[ iGlobalPos ] = iValue;
	}

	// Kept outside the conditional so all threads of the block reach the barrier.
	__syncthreads();
}

/*
 * Thin pass-through to FuncCopyFromShared: forwards all three arguments
 * unchanged and does nothing else. It exists so the test can reach the same
 * copy routine through one extra level of device-function call.
 */
__inline__ __device__ void FuncCopyFromShared_Wrapper( int *aiGlobalMem, volatile int *aiSharedMem, int iArrayLength )
{
	FuncCopyFromShared( aiGlobalMem, aiSharedMem, iArrayLength );
}

/*
 * Test kernel: every thread writes (flat index + 1) into its own slot of the
 * dynamically sized shared array, then the block copies those values to
 * aiGlobalMem through FuncCopyFromShared (directly or via the wrapper).
 *
 * aiGlobalMem  output buffer; BLOCKS*BLOCK_SIZE is passed as its length.
 *
 * The shared array is indexed by threadIdx.x, so the launch must provide at
 * least blockDim.x ints of dynamic shared memory.
 */
__global__ void Kernel( int *aiGlobalMem )
{
	// Size is not known here; it comes from the launch configuration.
	extern __shared__ int aiSharedMem[];

	// Each thread fills only its own shared slot.
	const int iGlobalPos = blockIdx.x * blockDim.x + threadIdx.x;
	aiSharedMem[ threadIdx.x ] = iGlobalPos + 1;

	const int iElementCount = BLOCKS*BLOCK_SIZE;

	// Compile-time switch: 1 = call the copy routine directly,
	// 0 = call it through the wrapper.
	#if 1
	FuncCopyFromShared( aiGlobalMem, aiSharedMem, iElementCount );
	#else
	FuncCopyFromShared_Wrapper( aiGlobalMem, aiSharedMem, iElementCount );
	#endif

	//FuncCopyToSharedAndEdit( aSharedMem, aGlobalMem, BLOCKS*BLOCK_SIZE );
}

/*
 * Host driver for the shared-memory test.
 *
 * Allocates BLOCKS*BLOCK_SIZE ints on host and device, launches Kernel,
 * copies the result back, prints one value per line and releases everything.
 *
 * Returns 0 on success and 1 if the host buffer cannot be allocated. CUDA
 * failures are reported through checkCudaErrors / getLastCudaError.
 */
int main( int argc, char** argv )
{
	// Single source of truth for the buffer size used by malloc, cudaMalloc and cudaMemcpy.
	const size_t nBytes = BLOCKS*BLOCK_SIZE*sizeof(int);

	int *hArray = NULL;
	int *dArray = NULL;

	findCudaDevice(argc, (const char **)argv);

	hArray = (int*)malloc( nBytes );
	if( hArray == NULL )
	{
		fprintf( stderr, "host allocation of %lu bytes failed\n", (unsigned long)nBytes );
		return 1;
	}

	checkCudaErrors( cudaMalloc( (void**)&dArray, nBytes ) );

	checkCudaErrors( cudaDeviceSynchronize() );

	// Execution configuration: BLOCKS blocks of BLOCK_SIZE threads, plus one
	// int of dynamic shared memory per thread for the kernel's
	// extern __shared__ array (indexed by threadIdx.x).
	Kernel<<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE*sizeof(int) >>>( dArray );

	// Launch-configuration errors only show up here, not at the launch itself.
	getLastCudaError( "Kernel failed\n" );

	// Faults raised while the kernel runs surface at this synchronization point.
	checkCudaErrors( cudaDeviceSynchronize() );

	checkCudaErrors( cudaMemcpy( hArray, dArray, nBytes, cudaMemcpyDeviceToHost ) );

	printf( "\n" );
	for( int i = 0; i < BLOCKS*BLOCK_SIZE; i++ )
		printf( "%i\n", hArray[i] );

	free( hArray );
	checkCudaErrors( cudaFree( dArray ) );

	checkCudaErrors( cudaDeviceReset() );

	printf( "\nfinished (:\n" );

	return 0;
}

Error message when I use FuncCopyFromShared in the kernel:

========= Invalid __global__ read of size 4
=========     at 0x000000d0 in /home/strautz/develop/software/CuTest/Debug/../CuTest.cu:16:FuncCopyFromShared(int*, int volatile *, int)
=========     by thread (7,0,0) in block (3,0,0)
=========     Address 0x0100001c is out of bounds
=========     Device Frame:/home/strautz/develop/software/CuTest/Debug/../CuTest.cu:41:Kernel(int*) (Kernel(int*) : 0xc8)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x34b) [0x55d0b]
=========     Host Frame:/usr/local/cuda-5.0/lib/libcudart.so.5.0 [0x8f6a]

I get this error for every single thread that I use. What is strange to me is that the program crashes when it is run under cuda-memcheck, but runs fine when I run it without cuda-memcheck.
Also, there is no error or problem when I use FuncCopyFromShared_Wrapper in the kernel.

It was a driver problem. I updated to the current 310 driver and now everything is fine.