When I compile the following nonsense test program:
_shared__ int shared[8];
__device__ void share1(void)
{
int i = threadIdx.x;
__shared__ int j[8];
j[i] = shared[i+1];
shared[i] = j[i+1];
}
__device__ void share2(void)
{
int i = threadIdx.x;
__shared__ int j[8];
j[i] = shared[i+1];
shared[i] += j[i+1];
}
__global__ void kernel(void)
{
share1();
share2();
}
I get the following ptx file (extract of):
.shared .align 4 .b8 shared[32];
.shared .align 4 .b8 _ZZ6share1vE1j[32];
.shared .align 4 .b8 _ZZ6share2vE1j[32];
......................
.entry kernel
{
.reg .u32 $r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,
$r10,$r11,$r12;
# .loc 10 21 0
# 17 j[i] = shared[i+1];
# 18 shared[i] += j[i+1];
# 19 }
# 20
# 21 __global__ void kernel(void)
$LBB1_kernel:
# .loc 10 8 0
cvt.u32.u16 $r1, %tid.x; #
mul.lo.u32 $r2, $r1, 4; #
mov.u32 $r3, (&shared); #
add.u32 $r4, $r2, $r3; #
mov.u32 $r5, (&_ZZ6share1vE1j); #
add.u32 $r6, $r2, $r5; #
ld.shared.s32 $r7, [$r4+4]; # id:26 shared+0x0
st.shared.s32 [$r6+0], $r7; # id:27 _ZZ6share1vE1j+0x0
# .loc 10 9 0
ld.shared.s32 $r8, [$r6+4]; # id:28 _ZZ6share1vE1j+0x0
st.shared.s32 [$r4+0], $r8; # id:29 shared+0x0
# .loc 10 17 0
mov.u32 $r9, (&_ZZ6share2vE1j); #
add.u32 $r10, $r2, $r9; #
st.shared.s32 [$r10+0], $r7; # id:30 _ZZ6share2vE1j+0x0
# .loc 10 18 0
ld.shared.s32 $r11, [$r10+4]; # id:31 _ZZ6share2vE1j+0x0
add.s32 $r12, $r11, $r8; #
st.shared.s32 [$r4+0], $r12; # id:32 shared+0x0
# .loc 10 24 0
# 22 {
# 23 share1();
# 24 share2();
exit; #
} # kernel
.version 1.1
Question is why are the shared segments _ZZ6share1vE1j and _ZZ6share2vE1j not overlaid? The compiler can easily work out which autos can be overlaid and save heaps of a precious resource. Sorry if there has already been a topic, I could not find it, but this does seem to be pretty basic! It blows my shared mem budget out of the water!
Thanks, Eric