I’ve been fighting with shared memory a lot these last few days; it seems a bit unstable. For example, in one kernel I allocate an array of structs in shared memory with the same size as the number of threads in a block. If I add a member to the struct, one I don’t even have to touch, compilation goes fine, but once I run the software it crashes my computer and I have to restart. The Output struct is allocated in shared memory and passed by reference into the function argument; I do this to lower register usage and get coalesced global writes. (func is in this case a kd-tree raytracer.)
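For reference, the names the kernels use are set up roughly like this; the exact constants and struct members change between my tests, so treat these definitions as placeholders rather than my real code:

[codebox]// Placeholder definitions -- the real values and members vary between tests.
const int threadWidth  = 32;                     // one warp per row of the block
const int threadHeight = 4;                      // rows (warps) per block
const int threadSize   = threadWidth * threadHeight;

__device__ int g_RayCount;                       // global work counter, advanced with atomicAdd

struct Output                                    // example result payload; adding one more
{                                                // member here is what triggers the crash
    float t;
    float u;
    float v;
    int   triIndex;
};[/codebox]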
[codebox]template <class Input, class Output, class F>
__global__ void PersistentWorker(Input* input, int count, Output* output, F func)
{
    // Each block is threadWidth x threadHeight threads; one row (threadIdx.y) is a warp.
    int rayIndex = (blockIdx.x*threadHeight + threadIdx.y)*threadWidth + threadIdx.x;

    __shared__ int nextRay[threadHeight];      // first ray of the chunk this row is working on
    __shared__ Output tempOut[threadSize];     // per-thread staging area for the results

    if( threadIdx.x == 0 )
    {
        nextRay[threadIdx.y] = rayIndex;
    }

    while( rayIndex < count )
    {
        // Write the result into shared memory instead of a local, to lower register usage.
        func.Call(input[rayIndex], tempOut[threadIdx.x + threadWidth*threadIdx.y], rayIndex);

        // Copy the row's threadWidth Output structs to global memory cooperatively,
        // float by float, so the global writes are coalesced.
        float* from = (float*)&tempOut[threadWidth*threadIdx.y];
        float* to   = (float*)&output[nextRay[threadIdx.y]];
        for(int i=0; i<sizeof(Output)/4; i++)
        {
            int index = threadWidth*i + threadIdx.x;
            to[index] = from[index];
        }

        // Thread 0 of the row grabs the next chunk of threadWidth rays.
        if( threadIdx.x == 0 )
        {
            nextRay[threadIdx.y] = atomicAdd(&g_RayCount, threadWidth);
        }
        // All threads in the row read the value thread 0 just wrote (no explicit sync).
        rayIndex = nextRay[threadIdx.y] + threadIdx.x;
    }
}[/codebox]
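For context, the launch is along these lines; the block is threadWidth x threadHeight so each row of threads is a warp, and g_RayCount has to start past the rays handed out through blockIdx. Names like Ray, Tracer and d_rays below are illustrative, not from my actual project:

[codebox]// Hypothetical launch sketch.
dim3 block(threadWidth, threadHeight);
int numBlocks = 120;                             // enough blocks to keep the GPU busy

// The first numBlocks*threadSize rays are claimed in the kernel prologue,
// so the shared counter starts right after them.
int initial = numBlocks * threadSize;
cudaMemcpyToSymbol(g_RayCount, &initial, sizeof(int));

PersistentWorker<Ray, Output, Tracer><<<numBlocks, block>>>(d_rays, rayCount, d_output, tracer);[/codebox]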
I also tried to avoid issuing an atomic on every iteration by fetching four chunks of rays per atomicAdd, but the following code always crashes my computer.
[codebox]template <class Input, class Output, class F>
__global__ void PW3(Input* input, int count, Output* output, F func)
{
    int rayIndex = (blockIdx.x*threadHeight + threadIdx.y)*threadWidth + threadIdx.x;

    __shared__ volatile int nextRay[threadHeight];       // first ray of the row's current chunk
    __shared__ Output tempOut[threadSize];               // per-thread staging area for results
    __shared__ volatile int rayCountLeft[threadHeight];  // chunks left from the last atomicAdd

    if( threadIdx.x == 0 )
    {
        nextRay[threadIdx.y] = rayIndex;
        rayCountLeft[threadIdx.y] = 0;
    }

    while( rayIndex < count )
    {
        func.Call(input[rayIndex], tempOut[threadIdx.x + threadWidth*threadIdx.y], rayIndex);

        // Cooperative, coalesced copy of the row's results, as in PersistentWorker.
        volatile float* from = (float*)&tempOut[threadWidth*threadIdx.y];
        volatile float* to   = (float*)&output[nextRay[threadIdx.y]];
        for(int i=0; i<sizeof(Output)/4; i++)
        {
            int index = threadWidth*i + threadIdx.x;
            to[index] = from[index];
        }

        if( threadIdx.x == 0 )
        {
            if( rayCountLeft[threadIdx.y] == 0 )
            {
                // Fetch four chunks at once so the atomic only runs every fourth iteration.
                nextRay[threadIdx.y] = atomicAdd(&g_RayCount, threadWidth*4);
                rayCountLeft[threadIdx.y] = 3;
            }
            else
            {
                // Serve the remaining chunks out of the batch.
                rayCountLeft[threadIdx.y]--;
                nextRay[threadIdx.y] += threadWidth;
            }
        }
        rayIndex = nextRay[threadIdx.y] + threadIdx.x;
    }
}[/codebox]
I’ve been using CUDA for a few months now, and I keep running into strange bugs that waste a lot of development time. Am I the only one?
I’ve also tried examining the generated assembly, but so far I haven’t found anything.
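In case it matters, this is roughly how I’ve been looking at the generated code, just the standard toolchain commands:

[codebox]# Dump the PTX straight from the compiler
nvcc -ptx PersistentWorker.cu -o PersistentWorker.ptx

# Or disassemble the machine code from the compiled object (recent toolkits)
cuobjdump -sass PersistentWorker.o[/codebox]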