Hi,
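In its simplest form, the problem can be reproduced with the simple loop below. Note that xm4 equals xm3 - 256 floats (1024 bytes) ONLY when x is not of the form 8k + 7; when (x & 7) == 7, xm3 wraps around to 0 while xm4 is 1792, so xm4 is xm3 + 1792 floats instead.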
for (x = 0; x < nx; x++) {
int xm4, xm3;
xm4 = 256 * ( x & 7);
xm3 = 256 * ((x + 1) & 7);
// Save the new sample
xIn[xm4] = xIn[xm3];
barrier(CLK_LOCAL_MEM_FENCE);
}
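The PTX generated by NVVM misses this wrap-around case and ALWAYS computes the address of xIn[xm4] as the address of xIn[xm3] minus 256 floats (1024 bytes), as the store to [%r31+-1024] shows: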
mov.u32 %r34, 0;
mov.u32 %r33, %r34;
BB0_2:
add.s32 %r33, %r33, 256;
and.b32 %r28, %r33, 1792;
add.s32 %r29, %r28, %r3;
shl.b32 %r30, %r29, 2;
ld.param.u32 %r32, [rtmForwardStep2D_param_19];
add.s32 %r31, %r32, %r30;
ld.shared.f32 %f1, [%r31];
st.shared.f32 [%r31+-1024], %f1;
bar.sync 0;
add.s32 %r34, %r34, 1;
setp.lt.s32 %p2, %r34, %r17;
@%p2 bra BB0_2;
This error occurs with the following compiler build and driver version:
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Mon Oct 17 21:21:39 2011 (1318904499)
// Driver 285.05.15
//