Once again I have a problem which is most likely caused by me. I can’t understand why this doesn’t work, since the kernels read from the correct position in shared memory.
Someone correct me if I’m wrong, but unattached shared memory (extern) should be right at the beginning of shared memory as opposed to attached memory that can be anywhere in shared memory, correct?
The effect I’m seeing here is that only Threads (0,0) and (0,1) seem to be able to read from shared memory when it’s extern. Can someone explain the effect I’m seeing here? (Kernel is called with an 8x8 block dimension).
attached shared mem
// Fills a statically sized shared-memory bucket with the constant
// (0.5, 0.5, 0.5, 0.5) and lets every thread copy "its" bucket slot to the
// frame-buffer element it owns.
//
// Launch layout: 2D grid of 2D blocks (threadIdx.z / blockIdx.z are ignored).
// The bucket holds 64 float4 values, so a block should have at most 64
// threads (e.g. 8x8); threads whose flat in-block index does not fit into the
// bucket write nothing instead of reading past the end of the array.
//
// Parameters:
//   frameBuffer - output array, indexed as x + y * frameWidth.
//   frameWidth  - number of elements per frame-buffer row.
//
// NOTE(review): the kernel has no frame height parameter, so it cannot guard
// against writing past the end of frameBuffer; the grid must not cover more
// elements than the buffer holds - confirm at the launch site.
__global__ void
testKernel(float4* frameBuffer, int frameWidth)
{
    const int kBucketSize = 64;
    __shared__ float4 bucket[kBucketSize];

    // Flat index of this thread inside its block, and the block's thread count.
    const int bkIdx = threadIdx.x + blockDim.x * threadIdx.y;
    const int blockThreads = blockDim.x * blockDim.y;

    // Cooperative initialisation: the block strides over the bucket so all
    // 64 slots are written no matter how many threads the block has.
    for (int i = bkIdx; i < kBucketSize; i += blockThreads)
    {
        bucket[i] = make_float4(0.5f, 0.5f, 0.5f, 0.5f);
    }

    // Reached by every thread: makes the shared-memory writes visible
    // block-wide before any slot is read.
    __syncthreads();

    // Do the index arithmetic in size_t so large frames cannot overflow the
    // 32-bit unsigned intermediate results.
    const size_t col = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    const size_t row = threadIdx.y + (size_t)blockIdx.y * blockDim.y;
    const size_t memIdx = col + row * (size_t)frameWidth;

    if (bkIdx < kBucketSize)
    {
        frameBuffer[memIdx] = bucket[bkIdx];
    }
}
unattached shared mem
// Fills a dynamically sized shared-memory bucket with the constant
// (0.5, 0.5, 0.5, 0.5), one slot per thread, and lets every thread copy its
// slot to the frame-buffer element it owns.
//
// Launch layout: 2D grid of 2D blocks (threadIdx.z / blockIdx.z are ignored).
//
// Shared-memory requirement: `bucket` is declared `extern`, so its size is
// NOT known inside the kernel; it is whatever byte count is passed as the
// third launch argument. The launch must supply at least
//     blockDim.x * blockDim.y * sizeof(float4)
// bytes, e.g. for an 8x8 block:
//     testKernel<<<grid, block, 64 * sizeof(float4)>>>(frameBuffer, frameWidth);
// NOTE(review): if the launch omits that argument the array has no storage
// and every access below is out of bounds - verify the launch site.
//
// Parameters:
//   frameBuffer - output array, indexed as x + y * frameWidth.
//   frameWidth  - number of elements per frame-buffer row.
//
// NOTE(review): the kernel has no frame height parameter, so it cannot guard
// against writing past the end of frameBuffer; the grid must not cover more
// elements than the buffer holds - confirm at the launch site.
__global__ void
testKernel(float4* frameBuffer, int frameWidth)
{
    extern __shared__ float4 bucket[];

    // Flat index of this thread inside its block; also its bucket slot.
    const int bkIdx = threadIdx.x + blockDim.x * threadIdx.y;

    // Each thread initialises exactly the slot it will read, so the kernel
    // touches blockDim.x * blockDim.y elements instead of a hard-coded 64
    // that may not match the dynamic allocation.
    bucket[bkIdx] = make_float4(0.5f, 0.5f, 0.5f, 0.5f);

    // Reached by every thread: makes the shared-memory writes visible
    // block-wide before any slot is read.
    __syncthreads();

    // Do the index arithmetic in size_t so large frames cannot overflow the
    // 32-bit unsigned intermediate results.
    const size_t col = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    const size_t row = threadIdx.y + (size_t)blockIdx.y * blockDim.y;
    const size_t memIdx = col + row * (size_t)frameWidth;

    frameBuffer[memIdx] = bucket[bkIdx];
}