I am using shared memory (a_smem and b_smem) in my CUDA kernel to perform a simple difference operation over 19 elements.
My kernel is launched with 608 threads per block to copy data i.e 24 bytes per thread, but only 192 threads participate in the differnece computation.
Given fixed input vectors (aIn and bIn), I expect the output to be consistent. However, I observe random variations in the results across different runs.
each thread accesses a different 32-bit memory location at a time. I would appreciate if anyone could throw insights about his random behaviour.
Thank you
global void Kernel()
{
__shared__ int16_t a_smem[384 * 19];
__shared__ int16_t b_smem[384 * 19];
uint32_t chunk_len = 384;
uint64_t a_reg[3];
uint64_t b_reg[3];
int32_t *a_reg_ptr = (int32_t *)a_reg;
int32_t *b_reg_ptr = (int32_t *)b_reg;
uint32_t tidx = threadIdx.x; // 0 to 607
uint32_t col = tidx / 32; // 0 to 18 max
uint32_t idx = (tidx % 32) * 12; // 0 to 372
int16_t * aptr = &aIn[col * chunk_len];
// one warp copies 32*24 bytes
uint64_t *p1 = reinterpret_cast<uint64_t *>(&aptr[idx]);
a_reg[0] = *p1++;
a_reg[1] = *p1++;
a_reg[2] = *p1;
int16_t * bptr = &bIn[col * chunk_len];
uint64_t *p2 = reinterpret_cast<uint64_t *>(&bptr[idx]);
b_reg[0] = *p1++;
b_reg[1] = *p1++;
b_reg[2] = *p1;
int32_t *a_smem_ptr = (int32_t *)&a_smem[(col * chunk_len) + idx];
for (int x = 0; x < 6; x++)
{
a_smem_ptr[x] = a_reg_ptr[x];
}
int32_t *b_smem_ptr = (int32_t *)&b_smem[(col * chunk_len) + idx];
for (int x = 0; x < 6; x++)
{
b_smem_ptr[x] = b_reg_ptr[x];
}
__syncthreads();
if(tidx < chunk_len/2) //192 threads
{
for (uint32_t ind = 0; ind < 19; col_ind++)
{
int32_t *ptr_a = (int32_t *)&a_smem[ind * chunk_len];
int32_t *ptr_b = (int32_t *)&b_smem[ind * chunk_len];
// Saturating Subtraction
int32_t a = ptr_a[tidx];
int16_t a0 = a & 0xFFFF;
int16_t a1 = (a >> 16) & 0xFFFF;
int32_t b = ptr_b[tidx];
int16_t b0 = b & 0xFFFF;
int16_t b1 = (b >> 16) & 0xFFFF;
int16_t sub_val0 = b0 - a0;
int16_t sub_val1 = b1 - a1;
// store updates
int32_t tmp = (sub_val1 << 16);
tmp = tmp & 0xFFFF0000;
tmp = tmp | (sub_val0 & 0xFFFF);
ptr_b[tidx] = tmp;
}
}
for (int x = 0; x < 6; x++)
{
b_reg_ptr[x] = b_smem_ptr[x];
}
uint64_t *p3 = reinterpret_cast<uint64_t *>(&bptr[z_idx]);
*p3++ = b_reg[0];
*p3++ = b_reg[1];
*p3 = b_reg[2];
}