shared memory issue

Hello,

I just tried to write some (12) float values to shared memory, calculate with them, and write the result back to global memory.

I noticed that only 4 of the 12 float values of each thread were actually written to shared memory, and I just don't know why:

I use a 9800; the max block size is 512. The maximum amount of shared memory per block is 16 KiB, so each thread (with 512 threads per block) should be able to store 16384/4/512 = 8 float values. In my example (see below) only the first half of each block writes to shared memory, so each writing thread should be able to store 16 float values.

In case that was the barrier, I tried it with a block size of 128 to be sure — same result: only 4 float values get written. The result looks like “float float float float 0 0 0 0 0 0 0 0 float float float float 0 0 0 0…”

What am I doing wrong?

// First reduction pass: each thread in the lower half of the block combines its
// point with the point blockDim.x/2 positions further along, writing 12 floats
// (component-wise min/max of the raw coords, plus min/max of transformed coords)
// into dynamic shared memory.
//
// Shared-memory requirement: afData[] is dynamic ("extern __shared__"), so its
// size MUST be passed as the third <<<grid, block, smemBytes>>> launch argument.
// The indexing below (lulTid*12 ... lulTid*12+11 for lulTid < blockDim.x/2)
// needs at least (blockDim.x/2) * 12 * sizeof(FLOAT) bytes; launching without
// this argument is exactly what produced the "only 4 floats written" symptom
// discussed in this thread.
__global__ void CUDA_GrafFirstPass(FLOAT* fpPunkte, FLOAT* fpResult, ULONG ulAnz)

{

// Dynamically sized shared scratch buffer; size comes from the launch config.
extern __shared__ FLOAT  afData[];

// Thread index within the block.
LOCAL ULONG  lulTid = threadIdx.x;

// Flat global point index for this thread.
LOCAL ULONG  lulI   = blockIdx.x*blockDim.x+threadIdx.x;

// NOTE(review): this pads entries BEYOND the valid point count with the
// coordinates of point 0, presumably so the min/max pairing below stays valid
// for a partially filled last block — but it WRITES past the ulAnz-th point,
// so fpPunkte must be allocated with room for the full grid. Verify the
// allocation size against the caller.
if (lulI>=ulAnz)

{

	 fpPunkte[lulI*6+3]=fpPunkte[3];

	 fpPunkte[lulI*6+4]=fpPunkte[4];

	 fpPunkte[lulI*6+5]=fpPunkte[5];

}

// Barrier is correctly placed OUTSIDE the divergent branch above: every thread
// in the block reaches it, ensuring the padding writes are visible before the
// reads below.
__syncthreads();

// Only the lower half of the block computes; each thread pairs point lulI with
// point lulI + blockDim.x/2.
if (lulTid<blockDim.x/2)

{

// NOTE: the calculation itself is not the issue being debugged here — the
// symptom reproduces with simple constant values too.

// Component-wise min of the two points' raw coordinates (slots 0..2).
afData[lulTid*12]=min(fpPunkte[lulI*6+3], fpPunkte[(lulI+blockDim.x/2)*6+3]);

afData[lulTid*12+1]=min(fpPunkte[lulI*6+4], fpPunkte[(lulI+blockDim.x/2)*6+4]);

afData[lulTid*12+2]=min(fpPunkte[lulI*6+5], fpPunkte[(lulI+blockDim.x/2)*6+5]);

// Component-wise max of the two points' raw coordinates (slots 3..5).
afData[lulTid*12+3]=max(fpPunkte[lulI*6+3], fpPunkte[(lulI+blockDim.x/2)*6+3]);

afData[lulTid*12+4]=max(fpPunkte[lulI*6+4], fpPunkte[(lulI+blockDim.x/2)*6+4]);

afData[lulTid*12+5]=max(fpPunkte[lulI*6+5], fpPunkte[(lulI+blockDim.x/2)*6+5]);

// Min/max of matrix-transformed coordinates (slots 6..11). caf3GrafMatV is
// declared elsewhere — presumably a 4-column transform matrix in constant
// memory; TODO confirm.
afData[lulTid*12+6]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].x+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].x+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x));

// NOTE(review): in slots +7, +8, +10 and +11 the FIRST operand uses the .x
// components while the SECOND uses .y (resp. .z) — this looks like a
// copy-paste slip (the first operand should presumably use .y / .z to match
// the second). Confirm the intended transform.
afData[lulTid*12+7]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].y+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].y+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].y+caf3GrafMatV[3].y));

afData[lulTid*12+8]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].z+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].z+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].z+caf3GrafMatV[3].z));

afData[lulTid*12+9]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].x+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].x+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x));

afData[lulTid*12+10]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].y+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].y+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].y+caf3GrafMatV[3].y));

afData[lulTid*12+11]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].z+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].z+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].z+caf3GrafMatV[3].z));

}
// NOTE(review): the kernel's closing brace is not visible in this excerpt —
// the quoted code is truncated here.

Do you specify enough shared memory when you invoke the kernel? I.e.:

CUDA_GrafFirstPass<<<gridSize, blockSize, sharedMemorySizeInBytes>>>(...)

Note that you actually have slightly less than 16KiB available because built-in variables are also stored in that memory.

Haha… that's kind of embarrassing for me, because I didn't even use that parameter and must have been running with some default value.

works perfectly now, like this:

CUDA_GrafFirstPass<<<lulGridSize, lulBlockSize, lulBlockSize*12*4>>>(lfpPunkte, lfpResult, ulBytes/(6*sizeof(float)));

Thank you very much… I still have so much to learn. -.-