Hello,
i just tryed to write some (12) float values to shared memory, calculate with them, and write it back to global memory.
i recognized, that only 4 of 12 float values of each thread have been written to shared memory, and i just dont know why:
I use a 9800, max blocksize is 512. Max numbers of SMem/Block is 16 Kb, so each thread (512/block) should be able to store 16384/4/512=8 float values. In my example (see below) only the first half of each block is writing to smem, so i should be able to store 16 float values.
In case this was the bareer, i tryed it with a blocksize of 128 to be sure - same result. only 4 float values get written. the result looks like “float float float float 0 0 0 0 0 0 0 0 float float float float 0 0 0 0…”
what am i doing wrong?
__global__ void CUDA_GrafFirstPass(FLOAT* fpPunkte, FLOAT* fpResult, ULONG ulAnz)
{
extern __shared__ FLOAT afData[];
LOCAL ULONG lulTid = threadIdx.x;
LOCAL ULONG lulI = blockIdx.x*blockDim.x+threadIdx.x;
if (lulI>=ulAnz)
{
fpPunkte[lulI*6+3]=fpPunkte[3];
fpPunkte[lulI*6+4]=fpPunkte[4];
fpPunkte[lulI*6+5]=fpPunkte[5];
}
__syncthreads();
if (lulTid<blockDim.x/2)
{
//don't care 'bout the calculation, it doesnt work with simple values too...
afData[lulTid*12]=min(fpPunkte[lulI*6+3], fpPunkte[(lulI+blockDim.x/2)*6+3]);
afData[lulTid*12+1]=min(fpPunkte[lulI*6+4], fpPunkte[(lulI+blockDim.x/2)*6+4]);
afData[lulTid*12+2]=min(fpPunkte[lulI*6+5], fpPunkte[(lulI+blockDim.x/2)*6+5]);
afData[lulTid*12+3]=max(fpPunkte[lulI*6+3], fpPunkte[(lulI+blockDim.x/2)*6+3]);
afData[lulTid*12+4]=max(fpPunkte[lulI*6+4], fpPunkte[(lulI+blockDim.x/2)*6+4]);
afData[lulTid*12+5]=max(fpPunkte[lulI*6+5], fpPunkte[(lulI+blockDim.x/2)*6+5]);
afData[lulTid*12+6]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].x+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].x+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x));
afData[lulTid*12+7]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].y+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].y+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].y+caf3GrafMatV[3].y));
afData[lulTid*12+8]=min((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].z+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].z+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].z+caf3GrafMatV[3].z));
afData[lulTid*12+9]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].x+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].x+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x));
afData[lulTid*12+10]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].y+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].y+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].y+caf3GrafMatV[3].y));
afData[lulTid*12+11]=max((fpPunkte[lulI*6+3]*caf3GrafMatV[0].x+fpPunkte[lulI*6+4]*caf3GrafMatV[1].x+fpPunkte[lulI*6+5]*caf3GrafMatV[2].x+caf3GrafMatV[3].x),(fpPunkte[(lulI+blockDim.x/2)*6+3]*caf3GrafMatV[0].z+fpPunkte[(lulI+blockDim.x/2)*6+4]*caf3GrafMatV[1].z+fpPunkte[(lulI+blockDim.x/2)*6+5]*caf3GrafMatV[2].z+caf3GrafMatV[3].z));
}