While investigating my previous issue I ran across another problem with shared memory. Consider the following code snippet:
module shr
integer, parameter :: size = 10000
integer, device :: glob(size)
contains
attributes(global) subroutine sh_test(ncs)
implicit none
integer, value :: ncs
integer, shared :: ncsp
integer :: indx
indx = (blockidx%x-1) * blockdim%x + threadidx%x
if(threadidx%x .eq. 1) then
ncsp = 2*ncs
end if
call syncthreads()
if(indx .lt. size) then
glob(indx) = ncs+ncsp
end if
end subroutine sh_test
end module shr
This compiles to:
.global .align 16 .b8 shr_16[40000];
.entry sh_test (
.param .s32 __cudaparm_sh_test___V_ncs)
{
.reg .u16 %rh<4>;
.reg .u32 %r<16>;
.reg .u64 %rd<5>;
.reg .pred %p<4>;
.loc 14 5 0
$LBB1_sh_test:
.loc 14 17 0
cvt.u32.u16 %r1, %tid.x;
mov.u32 %r2, 0;
setp.ne.s32 %p1, %r1, %r2;
@%p1 bra $Lt_0_258; // <-- Skip the next few instructions unless threadIdx.x == 0
.loc 14 18 0
ld.param.s32 %r3, [__cudaparm_sh_test___V_ncs];
mul.lo.s32 %r4, %r3, 2;
mov.s32 %r5, %r4; // <-- Why the extra mov ?
$Lt_0_258:
.loc 14 20 0
bar.sync 0; // <-- All threads resume here
.loc 14 21 0
cvt.s32.u16 %r6, %ntid.x;
cvt.u16.u32 %rh1, %r6;
mov.u16 %rh2, %ctaid.x;
mul.wide.u16 %r7, %rh1, %rh2;
add.s32 %r8, %r7, %r1;
add.s32 %r9, %r8, 1;
mov.u32 %r10, 9999;
setp.gt.s32 %p2, %r9, %r10; // <-- if (indx .lt. size) then ...
@%p2 bra $Lt_0_514;
.loc 14 22 0
ld.param.s32 %r11, [__cudaparm_sh_test___V_ncs];
mov.s32 %r12, %r5; // <-- For most threads %r5 has not been set!
add.s32 %r13, %r11, %r12;
mul.lo.s32 %r14, %r8, 4;
cvt.u64.s32 %rd1, %r14;
mov.u64 %rd2, shr_16;
add.u64 %rd3, %rd1, %rd2;
st.global.s32 [%rd3+0], %r13;
$Lt_0_514:
.loc 14 24 0
exit;
$LDWend_sh_test:
} // sh_test
Compare this with some corresponding excerpts from the CUCDA C version:
. . .
@%p1 bra $Lt_0_1794;
.loc 14 24 0
ld.param.s32 %r3, [__cudaparm__Z7sh_testi_ncs];
mul.lo.s32 %r4, %r3, 2;
st.shared.s32 [ncsp], %r4; // <-- result stored in shared memory
$Lt_0_1794:
.loc 14 18 0
bar.sync 0;
. . .
@%p2 bra $Lt_0_2306;
.loc 14 30 0
ld.param.s32 %r8, [__cudaparm__Z7sh_testi_ncs];
ld.shared.s32 %r9, [ncsp]; // <-- result retrieved from shared memory before use.
add.s32 %r10, %r8, %r9;
Admittedly, this is something of a contrived example, as there the use of shared memory is pointless here, but it should still work. In fact, the code in the first example seems to result in register %r5 being used without being set in all threads except %tid.x == 0.
Am I correct, or is there some subtlety I’m missing?
-robert.