I’m trying to figure out why either this code for storing a float4 vector, or the code for loading a float4 doesn’t appear to be atomic.
The theory is that I stage each float4 to local memory, then 4 threads each write one component of the vector to global memory, and vice versa for loading. This should make each vector read or write coalesced into a single memory transaction, and therefore atomic, but that doesn’t seem to be happening.
Any ideas where things are going wrong?
float4 loadPoint(int nodeIndex, local unsigned int * randStates,local int * pointOffset,local int * oldPointOffset,local float * pointStage, global float* pointPool)
{
int index = get_local_id(0);
oldPointOffset[index] = pointOffset[index];
pointOffset[index] = nodeIndex*NUM_POINTS_PER_NODE+randInt()%NUM_POINTS_PER_NODE;
for (int n = (index&~3); n < ((index&~3)+4);n++)
{
pointStage[n*4+(index&3)]=pointPool[pointOffset[n]+(index&3)];
}
return (float4)(pointStage[index*4],pointStage[index*4+1],pointStage[index*4+2],pointStage[index*4+3]);
}
void storePoint(float4 point, int nodeIndex, local unsigned int * randStates,local int * pointOffset,local int * oldPointOffset,local float * pointStage, global float* pointPool)
{
int index = get_local_id(0);
pointStage[index*4] = point.x;
pointStage[index*4+1]=point.y;
pointStage[index*4+2]=point.z;
pointStage[index*4+3]=point.w;
for (int n = (index&~3); n < ((index&~3)+4); n++)
{
pointPool[oldPointOffset[n]+(index&3)]=pointStage[n*4+(index&3)];
}
}