Dear everybody,
I have a little kernel and I seem to be out of my depths. Here is the kernel itself:
__global__
void potatoe_stamp(float4 * d_particles, float * d_density, uint * d_offsets, int * d_stamp_offsets, float * d_stamp, size_t stamp_vol, size_t chunk_vol, uint chunk_count, size_t particle_num, size_t pixel_num){
//load stamp and stamp offsets into shared memory
extern __shared__ float s_stamp[];
int * s_stamp_offsets = (int *) &s_stamp[chunk_vol];
uint address;
//for all chunks
int c = 0;
int load;
while (c < chunk_count){
//load stamp chunk to shared
int t = threadIdx.x;
load = (c == chunk_count - 1) ? stamp_vol - (chunk_count-1)*chunk_vol : chunk_vol;
while (t < load){
s_stamp_offsets[t] = d_stamp_offsets[c*chunk_vol + t];
s_stamp[t] = d_stamp[c*chunk_vol + t];
t+= blockDim.x;
}
__syncthreads();
int w = blockIdx.x;
while (w < particle_num){
t = threadIdx.x;
while (t < load){
address = d_offsets[w] + s_stamp_offsets[t];
atomicAdd(&d_density[address], d_particles[w].w*s_stamp[t]);
//atomicAdd(&d_density[address], 1.0*w + 1.0*t);
__syncthreads();
t += blockDim.x;
}
w += gridDim.x;
}
c++;
}
}
And here is what it is supposed to do:
- Load chunks of a “stamp”, specified by float values and memory offsets in a preset 3d pixel array (to shared)
- For all chunks of a stamp
for all particles in a set
for all pixels of the stamp
calculate the offset of the pixel relative to the particle in a linearised global 3d density
This somewhat works but the results are slightly different in each run. With a little trick,
atomicAdd(&d_density[address], 1.0*w + 1.0*t);
I found that not all particle/pixel combinations are always realized. This makes me think I somehow have misused atomicAdd. The implementation of atomic add is the standard one:
__device__ __forceinline__ float atomicAdd(float *address, float val)
{
// Doing it all as longlongs cuts one __longlong_as_double from the inner loop
unsigned int *ptr = (unsigned int *)address;
unsigned int old, newint, ret = *ptr;
do {
old = ret;
newint = __float_as_int(__int_as_float(old)+val);
} while((ret = atomicCAS(ptr, old, newint)) != old);
return __int_as_float(ret); unsigned long long old, newdbl, ret = *ptr;
do {
old = ret;
newdbl = __double_as_longlong(__longlong_as_double(old)+val);
} while((ret = atomicCAS(ptr, old, newdbl)) != old);
return __longlong_as_double(ret);
}
}
Can anybody provide me with a hint?