Trouble tracking down error

I am trying to implement an indirect add operation where one of the operands may be also used as the destination.

In this case result and y are the same variable. Y is much smaller than X leading to heavy contention but I would expect marking the pointers as volatile and reading from the write-ptr before I generate the result would ensure that the atomicCAS would force a recalculation in the case where someone else wrote to Y. Then Y’s value would get reloaded and everything would work.

This is not the case on a 960 GPU, however.

The operation looks:
/code
template
device
void indirect_add (dtype alpha, dtype* x, const int* x_indexes
, dtype beta, dtype* y, const int* y_indexes
, dtype* res, const int* res_indexes
, int n_elems_per_idx, int n_indexes)
{
typedef Converter TConverterType;
typedef typename TConverterType::rettype TIntType;
int i = blockDim.x * blockIdx.x + threadIdx.x;
int elem_count = n_elems_per_idx * n_indexes;
if ( i < elem_count ) {
int index_idx = i / n_elems_per_idx;
int elem_offset = i % n_elems_per_idx;
int x_offset = (x_indexes[index_idx] * n_elems_per_idx) + elem_offset;
int y_offset = (y_indexes[index_idx] * n_elems_per_idx) + elem_offset;
int res_offset = (res_indexes[index_idx] * n_elems_per_idx) + elem_offset;
dtype* write_ptr = res + res_offset;
TIntType* int_addr = TConverterType::from(write_ptr);
volatile dtype* safe_write_ptr = write_ptr;
TIntType old, assumed;
volatile dtype* x_ptr = x + x_offset;
volatile dtype* y_ptr = y + y_offset;
do {
assumed = TConverterType::from(*safe_write_ptr);
dtype new_value = alpha * (*x_ptr) + beta * (*y_ptr);
old = atomicCAS(int_addr, assumed, TConverterType::from(new_value));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
}
}

The generate PTX for double arithmetic looks fine to me:
visible .entry indirect_add_d(
.param .f64 indirect_add_d_param_0,
.param .u64 indirect_add_d_param_1,
.param .u64 indirect_add_d_param_2,
.param .f64 indirect_add_d_param_3,
.param .u64 indirect_add_d_param_4,
.param .u64 indirect_add_d_param_5,
.param .u64 indirect_add_d_param_6,
.param .u64 indirect_add_d_param_7,
.param .u32 indirect_add_d_param_8,
.param .u32 indirect_add_d_param_9
)
{
.reg .pred %p<3>;
.reg .b32 %r<16>;
.reg .f64 %fd<8>;
.reg .b64 %rd<26>;

ld.param.f64 	%fd1, [indirect_add_d_param_0];
ld.param.u64 	%rd4, [indirect_add_d_param_1];
ld.param.u64 	%rd5, [indirect_add_d_param_2];
ld.param.f64 	%fd2, [indirect_add_d_param_3];
ld.param.u64 	%rd6, [indirect_add_d_param_4];
ld.param.u64 	%rd7, [indirect_add_d_param_5];
ld.param.u64 	%rd8, [indirect_add_d_param_6];
ld.param.u64 	%rd9, [indirect_add_d_param_7];
ld.param.u32 	%r2, [indirect_add_d_param_8];
ld.param.u32 	%r3, [indirect_add_d_param_9];
mov.u32 	%r4, %ntid.x;
mov.u32 	%r5, %ctaid.x;
mov.u32 	%r6, %tid.x;
mad.lo.s32 	%r1, %r5, %r4, %r6;
mul.lo.s32 	%r7, %r3, %r2;
setp.ge.s32	%p1, %r1, %r7;
@%p1 bra 	BB0_3;

cvta.to.global.u64 	%rd10, %rd5;
div.s32 	%r8, %r1, %r2;
mul.wide.s32 	%rd11, %r8, 4;
add.s64 	%rd12, %rd10, %rd11;
ld.global.u32 	%r9, [%rd12];
rem.s32 	%r10, %r1, %r2;
mad.lo.s32 	%r11, %r9, %r2, %r10;
cvta.to.global.u64 	%rd13, %rd7;
add.s64 	%rd14, %rd13, %rd11;
ld.global.u32 	%r12, [%rd14];
mad.lo.s32 	%r13, %r12, %r2, %r10;
cvta.to.global.u64 	%rd15, %rd9;
add.s64 	%rd16, %rd15, %rd11;
ld.global.u32 	%r14, [%rd16];
mad.lo.s32 	%r15, %r14, %r2, %r10;
cvta.to.global.u64 	%rd17, %rd8;
mul.wide.s32 	%rd18, %r15, 8;
add.s64 	%rd1, %rd17, %rd18;
cvta.to.global.u64 	%rd19, %rd4;
mul.wide.s32 	%rd20, %r11, 8;
add.s64 	%rd2, %rd19, %rd20;
cvta.to.global.u64 	%rd21, %rd6;
mul.wide.s32 	%rd22, %r13, 8;
add.s64 	%rd3, %rd21, %rd22;

BB0_2:
ld.volatile.global.f64 %fd3, [%rd1];
mov.b64 %rd23, %fd3;
ld.volatile.global.f64 %fd4, [%rd2];
ld.volatile.global.f64 %fd5, [%rd3];
mul.f64 %fd6, %fd5, %fd2;
fma.rn.f64 %fd7, %fd4, %fd1, %fd6;
mov.b64 %rd24, %fd7;
atom.global.cas.b64 %rd25, [%rd1], %rd23, %rd24;
setp.ne.s64 %p2, %rd23, %rd25;
@%p2 bra BB0_2;

BB0_3:
ret;
}

In any case it produces inconsistent results from run to run so there is definitely a synchronization error in there somwhere but I have been unable to find it. I must be breaking a rule w/r/t threading in cuda.

You have to use the “assumed” value in your calculation. There is no point checking it and then using something else, because the value pointed at might change at any time.

Above is the correct answer. If I know that one of the two arguments alias the result then I have to use a different method.