Way 1:
// AIR(x): the bits of x reinterpreted as a udword (see IR) with the top bit (0x80000000) masked off.
// The call sites below annotate `AIR(t) <= IR(e)` as `fabs(t) <= e`, i.e. the pair of macros is
// used as an integer-compare stand-in for fabs().
// NOTE(review): this assumes x is a 32-bit IEEE-754 float lvalue and udword is a 32-bit unsigned
// type (its typedef is not shown here) -- confirm.
// NOTE(review): under that assumption a negative e has its top bit set, so IR(e) compares larger
// than any AIR(t) and the test passes where fabs(t) <= e would fail; presumably e is never
// negative -- confirm.
#define AIR(x) (IR(x)&0x7fffffff)
// IR(x): reinterprets the storage of x as a udword through a reference cast; x must be an lvalue.
// AIR is defined before IR, which is fine because macros are expanded at the point of use.
#define IR(x) ((udword&)(x))
kernel code:
// Position of this thread inside its block, and the block dimensions.
udword tx = threadIdx.x;
udword ty = threadIdx.y;
udword bw = blockDim.x;
udword bh = blockDim.y;
// Row-major linear index of the thread within its block: ty * bw + tx.
// __umul24 multiplies only the low 24 bits of each operand, so both must fit in 24 bits.
udword tid = __umul24(ty, bw) + tx;
// Index of the first thread of this block: blockIdx.x * bw * bh.
// Only blockIdx.x is used -- presumably the kernel is launched with a 1D grid; confirm.
udword bid = __umul24(__umul24(blockIdx.x, bw), bh);
// Flat thread index across the launch; used further down to index d_mulpairs.
udword th_id = bid + tid;
// Two 512-entry float3 arrays (presumably box centers and extents, going by the names).
// NOTE(review): `shared` is presumably the `__shared__` qualifier with its underscores lost in
// formatting -- confirm. The code that fills these arrays is elided from this snippet.
shared float3 pairsCen[512];
shared float3 pairsExt[512];
…
…
// Address of this thread's entry in d_Sorted (d_Sorted and objID are defined outside this snippet).
unsigned int *index0 = &d_Sorted[objID];
// NOTE(review): index0 is a pointer yet is used directly as an array subscript on the next two
// lines; presumably the stored value (*index0) is the intended index -- confirm.
float3 cen0 = pairsCen[index0];
float3 extent0 = pairsExt[index0];
// Candidates start at the element that follows index0.
unsigned int *index = index0 + 1;
// Number of hits recorded so far by this thread.
udword offset = 0;
// Fixed trip count: exactly 32 candidates are examined (i is declared outside this snippet).
i=0;
while(i<32)
{
// Fetch the candidate's entries; the pointer-as-subscript caveat noted for index0 applies here too.
float3 cen = pairsCen[index];
float3 extent = pairsExt[index];
// y axis: compare the center distance against the summed extents.
float t1 = cen0.y-cen.y;
float e1 = extent0.y+extent.y;
if(AIR(t1) <= IR(e1)) //(fabs(t1)<=e1)
{
// z axis: same test, evaluated only when the y test passed.
float t2 = cen0.z-cen.z;
float e2 = extent0.z+extent.z;
if(AIR(t2) <= IR(e2)) //(fabs(t2)<=e2)
{
// Record the hit: the k-th hit of this thread is stored at element th_id + k*totalThreadNum,
// so for a given k consecutive th_id values address consecutive elements.
// start_index and totalThreadNum are defined outside this snippet.
// NOTE(review): `start_index + index` adds the pointer-typed `index`; presumably an integer
// offset is intended -- confirm.
d_mulpairs[th_id+offset*totalThreadNum] = *(start_index + index)+1;
offset++;
}
}
// Advance to the next candidate.
index++;
i++;
}
Way 2:
// AIR(x): the bits of x reinterpreted as a udword (see IR) with the top bit (0x80000000) masked off.
// The call sites below annotate `AIR(t) <= IR(e)` as `fabs(t) <= e`, i.e. the pair of macros is
// used as an integer-compare stand-in for fabs().
// NOTE(review): this assumes x is a 32-bit IEEE-754 float lvalue and udword is a 32-bit unsigned
// type (its typedef is not shown here) -- confirm.
#define AIR(x) (IR(x)&0x7fffffff)
// IR(x): reinterprets the storage of x as a udword through a reference cast; x must be an lvalue.
// AIR is defined before IR, which is fine because macros are expanded at the point of use.
#define IR(x) ((udword&)(x))
kernel code:
// Position of this thread inside its block, and the block dimensions.
udword tx = threadIdx.x;
udword ty = threadIdx.y;
udword bw = blockDim.x;
udword bh = blockDim.y;
// Row-major linear index of the thread within its block: ty * bw + tx.
// __umul24 multiplies only the low 24 bits of each operand, so both must fit in 24 bits.
udword tid = __umul24(ty, bw) + tx;
// Index of the first thread of this block: blockIdx.x * bw * bh.
// Only blockIdx.x is used -- presumably the kernel is launched with a 1D grid; confirm.
udword bid = __umul24(__umul24(blockIdx.x, bw), bh);
// Flat thread index across the launch; used at the end to index d_mulpairs.
udword th_id = bid + tid;
// Two 512-entry float3 arrays (presumably box centers and extents, going by the names).
// NOTE(review): `shared` is presumably the `__shared__` qualifier with its underscores lost in
// formatting -- confirm. The code that fills these arrays is elided from this snippet.
shared float3 pairsCen[512];
shared float3 pairsExt[512];
…
…
// Address of this thread's entry in d_Sorted (d_Sorted and objID are defined outside this snippet).
unsigned int *index0 = &d_Sorted[objID];
// NOTE(review): index0 is a pointer yet is used directly as an array subscript on the next two
// lines; presumably the stored value (*index0) is the intended index -- confirm.
float3 cen0 = pairsCen[index0];
float3 extent0 = pairsExt[index0];
// Candidates start at the element that follows index0.
unsigned int *index = index0 + 1;
// Number of hits recorded so far; also used as the bit position for the next hit.
udword offset = 0;
// Per-thread result word, accumulated in the loop and stored once after it.
udword indexBits = 0;
// Constant 1, shifted into position when a hit is recorded.
udword mask = 1;
// Fixed trip count: exactly 32 candidates are examined (i is declared outside this snippet).
i=0;
while(i<32)
{
// NOTE(review): `cen` and `extent` are read below but are neither declared nor loaded anywhere
// in this snippet (the Way 1 loop loads them from pairsCen[index] / pairsExt[index] at this
// point) -- presumably those two loads were dropped when the code was pasted; confirm.
// y axis: compare the center distance against the summed extents.
float t1 = cen0.y-cen.y;
float e1 = extent0.y+extent.y;
if(AIR(t1) <= IR(e1)) //(fabs(t1)<=e1)
{
// z axis: same test, evaluated only when the y test passed.
float t2 = cen0.z-cen.z;
float e2 = extent0.z+extent.z;
if(AIR(t2) <= IR(e2)) //(fabs(t2)<=e2)
{
// `<<` binds tighter than `|`, so this sets bit number `offset`. Because offset advances only
// on a hit, the loop leaves the lowest `offset` bits set: the word encodes how many candidates
// hit, not which iterations did.
// NOTE(review): if a per-candidate flag is intended, the shift amount would presumably be i
// rather than offset -- confirm.
indexBits = indexBits | mask << offset;
offset++;
}
}
// Advance to the next candidate.
index++;
i++;
}
// Block-wide barrier before the store.
// NOTE(review): nothing visible in this snippet writes shared memory between the loop and the
// store, so the barrier may be unnecessary -- confirm against the elided code.
__syncthreads();
// One store per thread; consecutive th_id values write consecutive elements of d_mulpairs.
d_mulpairs[th_id] = indexBits;
In way 1, the writes to global memory are scattered, while in way 2 they are contiguous. So way 2 should be faster than way 1.
But the timing results are similar. I wonder what the reason is; it doesn't make sense to me. I also checked the PTX code and found that all the local variables are stored in registers.
Register access is fast, though. Can anybody give me some advice? Thanks very much!