I need to pass shared memory to a device function as a parameter - my test program is posted below.
When I use it in the "usual" way (calling the device function directly from the kernel), I get an error in cuda-memcheck.
If I call the function from another function (using it as a kind of wrapper), then I don't get this error message.
Is this a bug, or am I doing something wrong in my use of shared memory?
Test program:
// Launch/array sizing constants for the test program.
// The product BLOCKS*BLOCK_SIZE is used below as the element count of the
// host and device arrays and as the bounds limit passed to the copy helpers.
// BLOCK_SIZE is presumably the number of threads per block of the Kernel
// launch - TODO confirm against the launch configuration.
#define BLOCK_SIZE 8
// BLOCKS is presumably the number of thread blocks in the grid of the Kernel
// launch - TODO confirm against the launch configuration.
#define BLOCKS 4
/**
 * Copies the calling thread's element of a per-block array into a global array.
 *
 * The element at position threadIdx.x of aiSharedMem is written to
 * aiGlobalMem at the thread's flat global index
 * (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * @param aiGlobalMem   Destination array, indexed by the flat global index.
 * @param aiSharedMem   Source array, indexed by threadIdx.x; read through a
 *                      volatile pointer. The kernel in this file passes its
 *                      extern __shared__ buffer here.
 * @param iArrayLength  Number of elements in aiGlobalMem; threads whose flat
 *                      index is not below this value copy nothing.
 *
 * Ends with a block-wide barrier, so every thread of the block must call it.
 */
__inline__ __device__ void FuncCopyFromShared( int *aiGlobalMem, volatile int *aiSharedMem, int iArrayLength )
{
    const int iLocal  = threadIdx.x;
    const int iGlobal = blockIdx.x * blockDim.x + iLocal;

    // Only threads that map onto a valid destination element perform the copy.
    if( iGlobal < iArrayLength )
    {
        const int iValue = aiSharedMem[ iLocal ];
        aiGlobalMem[ iGlobal ] = iValue;
    }

    // Kept outside the guard so that all threads of the block reach the barrier.
    __syncthreads();
}
/**
 * Thin forwarding wrapper around FuncCopyFromShared.
 *
 * Passes all three arguments through unchanged and adds no logic of its own.
 * It exists so the kernel can reach FuncCopyFromShared through one extra
 * level of device-function call instead of calling it directly.
 *
 * @param aiGlobalMem   Forwarded as the destination array.
 * @param aiSharedMem   Forwarded as the source array (volatile pointer).
 * @param iArrayLength  Forwarded as the bounds limit.
 */
__inline__ __device__ void FuncCopyFromShared_Wrapper( int *aiGlobalMem, volatile int *aiSharedMem, int iArrayLength )
{
// Plain pass-through; the callee performs the copy and the block barrier.
FuncCopyFromShared( aiGlobalMem, aiSharedMem, iArrayLength );
}
/**
 * Test kernel: stages one value per thread in dynamic shared memory and then
 * copies it out to global memory through a device helper.
 *
 * Each thread stores (flat global index + 1) into slot threadIdx.x of the
 * extern __shared__ buffer, so the buffer is indexed up to blockDim.x - 1
 * and the launch must provide dynamic shared memory for that many ints.
 * The helper is told the destination holds BLOCKS*BLOCK_SIZE elements.
 *
 * The #if switch selects which call path is exercised: the direct call to
 * FuncCopyFromShared (active) or the call through FuncCopyFromShared_Wrapper.
 *
 * @param aiGlobalMem  Device array that receives the staged values.
 */
__global__ void Kernel( int *aiGlobalMem )
{
    extern __shared__ int aiSharedMem[];

    // Flat 1D index of this thread across the whole grid.
    const int iGlobalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Every thread fills only its own slot and later reads back only that slot.
    aiSharedMem[ threadIdx.x ] = 1 + iGlobalIdx;

#if 1
    // Direct call path.
    FuncCopyFromShared( aiGlobalMem, aiSharedMem, BLOCKS*BLOCK_SIZE );
#else
    // Indirect call path through the forwarding wrapper.
    FuncCopyFromShared_Wrapper( aiGlobalMem, aiSharedMem, BLOCKS*BLOCK_SIZE );
#endif
    // Disabled call, kept for reference:
    //FuncCopyToSharedAndEdit( aSharedMem, aGlobalMem, BLOCKS*BLOCK_SIZE );
}
/**
 * Host driver for the shared-memory test.
 *
 * Allocates a host and a device array of BLOCKS*BLOCK_SIZE ints, launches
 * Kernel on a 1D grid of BLOCKS blocks with BLOCK_SIZE threads each, copies
 * the result back and prints one value per line.
 *
 * @param argc  Command-line argument count, forwarded to findCudaDevice.
 * @param argv  Command-line arguments, forwarded to findCudaDevice.
 * @return 0 on success, 1 if the host buffer cannot be allocated.
 */
int main( int argc, char** argv )
{
    const size_t nArrayBytes = BLOCKS * BLOCK_SIZE * sizeof(int);
    int *hArray = NULL;
    int *dArray = NULL;

    findCudaDevice(argc, (const char **)argv);

    hArray = (int*)malloc( nArrayBytes );
    if( hArray == NULL )
    {
        // Without this check the later cudaMemcpy would write through NULL.
        printf( "host allocation of %lu bytes failed\n", (unsigned long)nArrayBytes );
        return 1;
    }

    checkCudaErrors( cudaMalloc( (void**)&dArray, nArrayBytes ) );
    checkCudaErrors( cudaDeviceSynchronize() );

    // The kernel indexes an extern __shared__ int array by threadIdx.x, so the
    // third launch argument must reserve one int per thread of the block.
    Kernel<<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(int) >>>( dArray );
    getLastCudaError( "Kernel failed\n" );
    // Synchronize so that faults raised during kernel execution surface here.
    checkCudaErrors( cudaDeviceSynchronize() );

    checkCudaErrors( cudaMemcpy( hArray, dArray, nArrayBytes, cudaMemcpyDeviceToHost ) );

    printf( "\n" );
    for( int i = 0; i < BLOCKS*BLOCK_SIZE; i++ )
        printf( "%i\n", hArray[i] );

    free( hArray );
    checkCudaErrors( cudaFree( dArray ) );
    checkCudaErrors( cudaDeviceReset() );

    printf( "\nfinished (:\n" );
    return 0;
}
Error message when I call FuncCopyFromShared directly in the kernel:
========= Invalid __global__ read of size 4
========= at 0x000000d0 in /home/strautz/develop/software/CuTest/Debug/../CuTest.cu:16:FuncCopyFromShared(int*, int volatile *, int)
========= by thread (7,0,0) in block (3,0,0)
========= Address 0x0100001c is out of bounds
========= Device Frame:/home/strautz/develop/software/CuTest/Debug/../CuTest.cu:41:Kernel(int*) (Kernel(int*) : 0xc8)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x34b) [0x55d0b]
========= Host Frame:/usr/local/cuda-5.0/lib/libcudart.so.5.0 [0x8f6a]
I get this error for every single thread that I use. What is strange to me is that the program crashes when it is run under cuda-memcheck, but runs fine when I run it without cuda-memcheck.
Also, there is no error or problem when I use FuncCopyFromShared_Wrapper in the kernel.