So this is a question that likely only someone at nvidia can answer; where does shared memory start, 0x0000000000000000 or 0x0000000200000000 ?
I am asking based on this example code
void
createIndicesCompaction( T* s_compaction_list_exc,
unsigned int num_threads_compaction ) {
unsigned int offset = 1;
const unsigned int tid = threadIdx.x;
// higher levels of scan tree
for(int d = (num_threads_compaction >> 1); d > 0; d >>= 1) {
__syncthreads();
if (tid < d) {
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////// THIS PART
unsigned int ai = offset*(2*tid+1)-1;
unsigned int bi = offset*(2*tid+2)-1;
s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ s_compaction_list_exc[ai];
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
}
offset <<= 1;
}
// traverse down tree: first down to level 2 across
for( int d = 2; d < num_threads_compaction; d <<= 1) {
offset >>= 1;
__syncthreads();
if (tid < (d-1)) {
unsigned int ai = offset*(tid+1) - 1;
unsigned int bi = ai + (offset >> 1);
s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ s_compaction_list_exc[ai];
}
}
__syncthreads();
}
where the highlighted part is compiled into the following PTX
mul24.lo.u32 %r61, %r5, 2; //
add.u32 %r62, %r61, 2; //
mul.lo.u32 %r63, %r62, %r60; //
cvt.u64.u32 %rd24, %r63; //
mul.lo.u64 %rd25, %rd24, 2; //
add.u64 %rd26, %rd3, %rd25; //
add.u64 %rd27, %rd26, 8589934592; //
ld.shared.u16 %rh6, [%rd27+0]; // id:1279 __cuda_s_compaction_list3206+0x0
Now note the instruction “add.u64 %rd27, %rd26, 8589934592” which adds the value 0x0000000200000000 to the computed offset (offset*(tid+1) - 1). Why is this instruction there? Is it adding the base address of shared memory (no other use of shared memory that I have seen adds an offset)? Is it a compiler bug? Any ideas?