I originally posted this problem inside this thread about an implementation of [font=“Courier New”]atomicAdd [/font]for [font=“Courier New”]short[/font] integers, but thought the problem might be more of a general nature.
The problem I’m having currently is whereas the following test program works perfectly (credit goes to tera and Sylvain Collange for the [font=“Courier New”]atomicAddShort()[/font] function, see above link):
__global__ void test_kernel ( short* d_data)
{
__shared__ short s_data[4];
atomicAddShort(&d_data[0], 32767);
atomicAddShort(&d_data[1], -32768);
atomicAddShort(&d_data[2], 1);
atomicAddShort(&d_data[3], -1);
s_data[0] = d_data[0];
s_data[1] = d_data[1];
s_data[2] = d_data[2];
s_data[3] = d_data[3];
d_data[2] = s_data[2]-1;
d_data[3] = s_data[3]+1;
d_data[0] = s_data[0]+1;
d_data[1] = s_data[1]-1;
}
//tera's signed version
__device__ short atomicAddShort(short* address, short val)
{
unsigned int *base_address = (unsigned int *)((size_t)address & ~2);
unsigned int long_val = ((size_t)address & 2) ? ((unsigned int)val << 16) : (unsigned short)val;
unsigned int long_old = atomicAdd(base_address, long_val);
if((size_t)address & 2) {
return (short)(long_old >> 16);
} else {
unsigned int overflow = ((long_old & 0xffff) + long_val) & 0xffff0000;
if (overflow)
atomicSub(base_address, overflow);
return (short)(long_old & 0xffff);
}
}
int main (void)
{
short h_data[4] = {0};
short* d_data;
cudaMalloc((void**)&d_data, 4*sizeof( short));
cudaMemcpy(d_data, h_data, 4*sizeof( short), cudaMemcpyHostToDevice);
test_kernel<<<1,1>>>(d_data);
cudaMemcpy(h_data, d_data, 4*sizeof( short), cudaMemcpyDeviceToHost);
printf("%d\n%d\n%d\n%d\n", h_data[0], h_data[1], h_data[2], h_data[3]);
return 0;
}
It does not work when the [font=“Courier New”]atomicAddShort()[/font] function is applied in the following way (ie. to operate on the array values stored in shared mem):
__global__ void test_kernel ( short* d_data)
{
__shared__ short s_data[4];
s_data[0] = d_data[0];
s_data[1] = d_data[1];
s_data[2] = d_data[2];
s_data[3] = d_data[3];
atomicAddShort(&s_data[0], 32767);
atomicAddShort(&s_data[1], -32768);
atomicAddShort(&s_data[2], 1);
atomicAddShort(&s_data[3], -1);
d_data[2] = s_data[2]-1;
d_data[3] = s_data[3]+1;
d_data[0] = s_data[0]+1;
d_data[1] = s_data[1]-1;
}
Would anyone have any insight as to why this may be?
A memory alignment issue?
Cheers,
Mike