Actually, it seems to work for me. Here is my code:
float2 temp;
for (int i = 0; i < 3; i++)
{
    int done = 0;
    temp = force[4*NodeNumber+i];
    __syncthreads();
    temp.x = tid;
    temp.y += F[i];
    force[4*NodeNumber+i] = temp;
    for (int q = 0; q < VALENCE; q++)
    {
        __syncthreads();
        temp = force[4*NodeNumber+i];
        // Check if the write succeeded
        if ( force[4*NodeNumber+i].x == tid )
        {
            done = 1;
        }
        __syncthreads();
        // If not, write again
        if ( !done )
        {
            temp.x = tid;
            temp.y += F[i];
            force[4*NodeNumber+i] = temp;
        }
    }
}
The for loop on i computes each component of my force vector. My force array is declared like this:
float2* force = 0;
cudaMalloc((void**)&force, 4*NumNodes*sizeof(float2));
cudaMemset(force, 0, 4*NumNodes*sizeof(float2));
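As an aside: on hardware with float atomics in global memory (compute capability 2.0+), I believe the whole tag-and-retry loop could collapse to a single atomicAdd per component. A minimal sketch, not my real kernel; the F layout and the nodeOfThread lookup are placeholders I made up for the example:

__global__ void addForcesAtomic(float2* force, const float* F,
                                const int* nodeOfThread, int numThreads)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid >= numThreads) return;
    int node = nodeOfThread[tid];   // which node this thread contributes to
    for (int i = 0; i < 3; i++)
        // Only .y accumulates; .x is just the ownership tag in my scheme,
        // so it isn't needed at all here. Requires nvcc -arch=sm_20.
        atomicAdd(&force[4*node + i].y, F[3*tid + i]);
}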
My algorithm seems to work and appears stable as far as I can tell. It's slow so far because my kernel uses 38 registers, so I'm trying to decrease that number, but the results seem correct. The compiler doesn't apply the optimization you got:
# 287 for (int i=0; i<3; i++)
# 288 {
# 289 int done = 0;
    mov.s16 $rh1, 0; #
    mov.s32 $r88, 0; #
# .loc 10 292 0
# 290
# 291
# 292 temp = force[4*NodeNumber+i];
    ld.global.f32 $f398, [$r87+4]; # id:1345
# .loc 10 293 0
# 293 __syncthreads();
    bar.wait 0; #
# .loc 10 295 0
# 294 temp.x = tid;
# 295 temp.y += F[i];
    ld.local.f32 $f399, [$r83+0]; # id:1346 F$0+0x0
    add.f32 $f398, $f399, $f398; #
    st.global.v2.f32 [$r87+0], {$f397,$f398}; #
# .loc 10 299 0
# 296 force[4*NodeNumber+i] = temp;
# 297
# 298
# 299 for (int q=0; q < VALENCE; q++)
    mov.s16 $rh2, 0; #
    mov.s32 $r89, 0; #
$Lt_0_50:
#<loop> Loop body line 299, nesting depth: 2, iterations: 8
# .loc 10 301 0
# 300 {
# 301 __syncthreads();
    bar.wait 0; #
    ld.global.v2.f32 {$f400,$f398}, [$r87+0]; #
# .loc 10 304 0
# 302 temp = force[4*NodeNumber+i];
# 303 // Check if the write succeeded
# 304 if ( force[4*NodeNumber+i].x == tid )
    mov.s32 $r90, 1; #
    setp.eq.f32 $p2, $f400, $f397; #
    selp.s32 $r88, $r90, $r88, $p2; #
# .loc 10 308 0
# 305 {
# 306 done = 1;
# 307 }
# 308 __syncthreads();
    bar.wait 0; #
    mov.s32 $r91, 0; #
    setp.ne.s32 $p3, $r88, $r91; #
    @$p3 bra $Lt_0_51; #
$LBB15_Kernel1:
#<loop> Part of loop body line 299, head labeled $Lt_0_50
# .loc 10 313 0
# 309 // If not, write again
# 310 if ( !done )
# 311 {
# 312 temp.x = tid;
# 313 temp.y += F[i];
    add.f32 $f398, $f399, $f398; #
    st.global.v2.f32 [$r87+0], {$f397,$f398}; #
$Lt_0_51:
#<loop> Part of loop body line 299, head labeled $Lt_0_50
# .loc 10 299 0
    add.s32 $r89, $r89, 1; #
    mov.s32 $r92, 8; #
    setp.ne.s32 $p4, $r89, $r92; #
    @$p4 bra $Lt_0_50; #
$LBB17_Kernel1:
#<loop> Part of loop body line 146, head labeled $Lt_0_47
# .loc 10 287 0
    add.u32 $r87, $r87, 8; #
    add.u32 $r83, $r83, 4; #
    setp.ne.s32 $p5, $r83, $r84; #
    @$p5 bra $Lt_0_47; #
$LBB18_Kernel1:
    mul.lo.s32 $r93, $r39, 4; #
    mov.u32 $r94, (&F$0); #
    mul.lo.u32 $r95, $r93, 8; #
    add.u32 $r96, $r95, $r86; #
$Lt_0_57:
#<loop> Loop body line 287, nesting depth: 1, iterations: 3
# .loc 10 323 0
I believe that at line 301, the instruction ld.global.v2.f32 {$f400,$f398}, [$r87+0]; actually reads the value from memory between the two __syncthreads() calls. Is my understanding correct?
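P.S. On the register count: one thing to try is nvcc's -maxrregcount flag, which caps the registers per thread at the cost of possible local-memory spills, so it can go either way (the 32 below is just a guess, not a tuned value):

nvcc -maxrregcount=32 kernel.cu

Newer toolkits also offer a __launch_bounds__ qualifier on the kernel for the same purpose.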