synchronization and block independence

Greetings.

i have a program that launches with 128 threads per block and goes onto 4 different paths:

global foo() {
int w = threadIdx.x >> 5;
if (w == 1) {
do sth;
bar.sync 0;
} else if (w == 2) {
bar.sync 0;
do sth
} else if (w == 3) {

} else {

}

the warps communicate via shared memory and synchronize via
asm(“bar.sync 0;”);

First question:
If i change it to “bar.sync 1;” i get a launch error, even though the ptx doc states
that there is more than one builtin sync variable.

Second question:
If i lauch 2 blocks for each processor (1 processor is 8 cores and 16kbyte shared memory)
i can interleave thread execution and get 30% speed up, but the results are not correct, but
only for those blocks that ran concurrently. The processors that get assigned 1 block only
still produce correct results. With my gtx260SP216 i can lauch 1-27 blocks and all is fine,
if i launch 28-54 blocks a growing fraction of the results are wrong.
If i launch more blocks than processors on a 9600M-GT with only 8192 registers per processor,
the blocks cannot be interleaved (each block needs 8192 registers) and they all produce correct
results. Looks like the blocks influence each other when executed concurrently on one processor.

The code is part of a rainbow table generator for the A5/1 cipher. It uses bitslicing with 64bit registers
to compute 64 A5/1 keystreams per cuda thread.

Maybe some of you know whats going on… Thanks for listening.

#ifndef __TMTO__ALGORITHM__A51__IMPLEMENTATION__CUDA__KERNEL__BITSLI
CE__HPP
#define __TMTO__ALGORITHM__A51__IMPLEMENTATION__CUDA__KERNEL__BITSLI
CE__HPP

#include <tmto/algorithm/A51/implementation/cuda/kernel/common.hpp>
#include <tmto/algorithm/A51/implementation/cuda/kernel/kernel.hpp>
#include <tmto/device/cuda/kernel/kernel.hpp>
#include <tmto/misc/stdint.hpp>
#include <boost/preprocessor/repetition/repeat.hpp>
#include <boost/preprocessor/repetition/enum.hpp>
#include <boost/preprocessor/arithmetic/sub.hpp>
#include <tmto/device/cuda/kernel/kernel.hpp>
#include <tmto/algorithm/A51/implementation/cuda/bitslice.hpp>
#include <tmto/condition/cuda/all_conditions.hpp>

#if defined(pbs_declare_var) ||
defined(pbs_clock_r1) ||
defined(pbs_clock_r2) ||
defined(pbs_clock_r3) ||
defined(pbs_assign_r1) ||
defined(pbs_assign_r2) ||
defined(pbs_assign_r3)
#error preprocessor namespace collision
#endif

#define pbs_declare_var(z, n, prefix) prefix ## n

#define pbs_clock_r1(z, n, start) BOOST_PP_CAT(r1_, BOOST_PP_SUB(start, n)) = BOOST_PP_CAT(r1_, BOOST_PP_SUB(start, n)) & not_clock_r1 | BOOST_PP_CAT(r1_, BOOST_PP_SUB(BOOST_PP_SUB(start, 1), n)) & do_clock_r1;
#define pbs_clock_r2(z, n, start) BOOST_PP_CAT(r2_, BOOST_PP_SUB(start, n)) = BOOST_PP_CAT(r2_, BOOST_PP_SUB(start, n)) & not_clock_r2 | BOOST_PP_CAT(r2_, BOOST_PP_SUB(BOOST_PP_SUB(start, 1), n)) & do_clock_r2;
#define pbs_clock_r3(z, n, start) BOOST_PP_CAT(r3_, BOOST_PP_SUB(start, n)) = BOOST_PP_CAT(r3_, BOOST_PP_SUB(start, n)) & not_clock_r3 | BOOST_PP_CAT(r3_, BOOST_PP_SUB(BOOST_PP_SUB(start, 1), n)) & do_clock_r3;

#define pbs_assign_r1(z, n, p) BOOST_PP_CAT(r1_, BOOST_PP_SUB(18, n)) = result[(23 + 22 + n) * 32 + tid];
#define pbs_assign_r2(z, n, p) BOOST_PP_CAT(r2_, BOOST_PP_SUB(21, n)) = result[(23 + n) * 32 + tid];
#define pbs_assign_r3(z, n, p) BOOST_PP_CAT(r3_, BOOST_PP_SUB(22, n)) = result[n * 32 + tid];

namespace tmto { namespace algorithm { namespace A51 { namespace implementation { namespace cuda { namespace kernel {

struct bitslice : public kernel {
static const uint64_t one = 0xffffffffffffffffULL;
static const uint64_t zero = 0ULL;
typedef bitslice this_t;

template <typename ImplArgs, typename Condition, typename RoundFunc, typename Args>
struct implementation {
  typedef typename Args::combined_work_item_t work_item_t;

  __device__ static void run(uint32_t index, int run, Args * args) {
typedef uint64_t reg_t;
typedef uint64_t RT;

int maxrun  = args->operations + args->operations_rest;
RT * result = args->working_set.data_ + blockIdx.x * blockDim.x * 16;

__shared__ volatile uint32_t
  r1_clock_lo   [32], r1_clock_hi   [32],
  r2_clock_lo   [32], r2_clock_hi   [32],
  r3_clock_lo   [32], r3_clock_hi   [32],
  r1_out_lo     [32], r1_out_hi     [32],
  r2_out_lo     [32], r2_out_hi     [32],
  r3_out_lo     [32], r3_out_hi     [32],
  r1_clock_do_lo[32], r1_clock_do_hi[32],
  r2_clock_do_lo[32], r2_clock_do_hi[32],
  r3_clock_do_lo[32], r3_clock_do_hi[32];

typename device::cuda::select_condition<
  Condition
>::type condition(args->condition, args->common_condition_state);

int tid = threadIdx.x & 0x1f;

__shared__ volatile RT real_roundfunc[64];
__shared__ volatile RT null_roundfunc[64];

if (threadIdx.x < 64) {
  real_roundfunc[threadIdx.x] = args->roundfunc.generator.table[64 * args->common_roundfunc_state.second().state + threadIdx.x];
  null_roundfunc[threadIdx.x] = 0;
}

__syncthreads();

//	if (threadIdx.x < 32) {
//	  for (int i = 0; i < 64; ++i) {
//	    result[i * 32 + tid] = result[i * 32 + tid] ^ real_roundfunc[i];
//	  }
//	}

//	__syncthreads();

if ((threadIdx.x >> 5 & 3) == 0) {
  RT BOOST_PP_ENUM(19, pbs_declare_var, r1_);
  for (int run = 0; run < maxrun; ++run) {
    BOOST_PP_REPEAT(19, pbs_assign_r1,);

    r1_clock_lo[tid] = r1_8;  r1_clock_hi[tid] = r1_8  >> 32;
    r1_out_lo  [tid] = r1_18; r1_out_hi  [tid] = r1_18 >> 32;
    asm("bar.sync 0;\n"); // yield

    for (int i = 0; i < 63; ++i) {
      asm("bar.sync 0;\n"); // wait
      RT do_clock_r1  = r1_clock_do_lo[tid] | RT(r1_clock_do_hi[tid]) << 32;
      RT not_clock_r1 = ~do_clock_r1;

      RT r1_out   = r1_18 & not_clock_r1 | r1_17 & do_clock_r1;
      RT r1_clock = r1_8  & not_clock_r1 | r1_7  & do_clock_r1;

      r1_clock_lo[tid] = r1_clock; r1_clock_hi[tid] = r1_clock >> 32;
      r1_out_lo  [tid] = r1_out;   r1_out_hi  [tid] = r1_out   >> 32;

      RT r1_fb = r1_18 ^ r1_17 ^ r1_16 ^ r1_13;

      asm("bar.sync 0;\n"); // yield
      r1_18 = r1_out;
      BOOST_PP_REPEAT(9, pbs_clock_r1, 17);
      r1_8 = r1_clock;
      BOOST_PP_REPEAT(7, pbs_clock_r1, 7);
      r1_0 = r1_0 & not_clock_r1 | r1_fb & do_clock_r1;
    }
  }
} else if ((threadIdx.x >> 5 & 3) == 1) {
  RT BOOST_PP_ENUM(22, pbs_declare_var, r2_);
  for (int run = 0; run < maxrun; ++run) {
    BOOST_PP_REPEAT(22, pbs_assign_r2,);

    r2_clock_lo[tid] = r2_10;
    r2_clock_hi[tid] = r2_10 >> 32;
    r2_out_lo[tid]   = r2_21;
    r2_out_hi[tid]   = r2_21 >> 32;
    asm("bar.sync 0;\n"); // yield

    for (int i = 0; i < 63; ++i) {
      asm("bar.sync 0;\n"); // wait
      RT do_clock_r2  = r2_clock_do_lo[tid] | RT(r2_clock_do_hi[tid]) << 32;
      RT not_clock_r2 = ~do_clock_r2;

      RT r2_out   = r2_21 & not_clock_r2 | r2_20 & do_clock_r2;
      RT r2_clock = r2_10 & not_clock_r2 | r2_9  & do_clock_r2;
      
      r2_out_lo  [tid] = r2_out;   r2_out_hi  [tid] = r2_out   >> 32;
      r2_clock_lo[tid] = r2_clock; r2_clock_hi[tid] = r2_clock >> 32;

      RT r2_fb = r2_21 ^ r2_20;

      asm("bar.sync 0;\n"); // yield
      r2_21 = r2_out;
      BOOST_PP_REPEAT(10, pbs_clock_r2, 20);
      r2_10 = r2_clock;
      BOOST_PP_REPEAT(9, pbs_clock_r2, 9);
      r2_0 = r2_0 & not_clock_r2 | r2_fb & do_clock_r2;
    }
  }
} else if ((threadIdx.x >> 5 & 3) == 2) {
  RT BOOST_PP_ENUM(23, pbs_declare_var, r3_);
  for (int run = 0; run < maxrun; ++run) {
    BOOST_PP_REPEAT(23, pbs_assign_r3,);

    r3_clock_lo[tid] = r3_10;
    r3_clock_hi[tid] = r3_10 >> 32;
    r3_out_lo  [tid] = r3_22;
    r3_out_hi  [tid] = r3_22 >> 32;

    asm("bar.sync 0;\n"); // yield

    for (int i = 0; i < 63; ++i) {
      asm("bar.sync 0;\n"); // wait
      RT do_clock_r3  = r3_clock_do_lo[tid] | RT(r3_clock_do_hi[tid]) << 32;
      RT not_clock_r3 = ~do_clock_r3;
      
      RT r3_out   = r3_22 & not_clock_r3 | r3_21 & do_clock_r3;
      RT r3_clock = r3_10 & not_clock_r3 | r3_9  & do_clock_r3;
      
      r3_out_lo  [tid] = r3_out;   r3_out_hi  [tid] = r3_out   >> 32;
      r3_clock_lo[tid] = r3_clock; r3_clock_hi[tid] = r3_clock >> 32;

      RT r3_fb = r3_22 ^ r3_21 ^ r3_20 ^ r3_7;

      asm("bar.sync 0;\n"); // yield
      r3_22 = r3_out;
      BOOST_PP_REPEAT(11, pbs_clock_r3, 21);
      r3_10 = r3_clock;
      BOOST_PP_REPEAT(9, pbs_clock_r3, 9);
      r3_0 = r3_0 & not_clock_r3 | r3_fb & do_clock_r3;
    }
  }
} else {
  RT DP = this_t::one, new_dp = this_t::one;
  volatile RT * roundfunc = real_roundfunc;
  for (int run = 0; run < maxrun; ++run) {

    DP &= new_dp;

    //	    if (run + 1 == maxrun) {
    //	      roundfunc = null_roundfunc;
    //	    }

    new_dp = this_t::zero;
    asm("bar.sync 0;\n"); // wait

    int i = 0;
    for (; i < 23; ++i) {
      RT r1_out = RT(r1_out_lo[tid]) | RT(r1_out_hi[tid]) << 32;
      RT r2_out = RT(r2_out_lo[tid]) | RT(r2_out_hi[tid]) << 32;
      RT r3_out = RT(r3_out_lo[tid]) | RT(r3_out_hi[tid]) << 32;

      //	      RT res    = (r1_out ^ r2_out ^ r3_out ^ roundfunc[i]) & DP | r3_out & ~DP;
      RT res    = (r1_out ^ r2_out ^ r3_out);
      

      //	      & DP | r3_out & ~DP;
      //	      RT res    = (r1_out ^ r2_out ^ r3_out ^ roundfunc[i]) & DP | r3_out & ~DP;

      condition.matches_bitslice((res ^ real_roundfunc[i]) & DP, i, new_dp, args->common_condition_state);

      //	      if (i < 15) { new_dp |= res; }
      result[i * 32 + tid] = (res ^ roundfunc[i]) & DP | r3_out & ~DP;

      RT r1c = r1_clock_lo[tid] | RT(r1_clock_hi[tid]) << 32;
      RT r2c = r2_clock_lo[tid] | RT(r2_clock_hi[tid]) << 32;
      RT r3c = r3_clock_lo[tid] | RT(r3_clock_hi[tid]) << 32;

      RT r1r2c = ~(r1c ^ r2c);
      RT r1r3c = ~(r1c ^ r3c);
      RT r2r3c = ~(r2c ^ r3c);

      RT r1_clock_do = (r1r2c | r1r3c) & DP;
      RT r2_clock_do = (r1r2c | r2r3c) & DP;
      RT r3_clock_do = (r1r3c | r2r3c) | ~DP;

      r1_clock_do_lo[tid] = r1_clock_do; r1_clock_do_hi[tid] = r1_clock_do >> 32;
      r2_clock_do_lo[tid] = r2_clock_do; r2_clock_do_hi[tid] = r2_clock_do >> 32;
      r3_clock_do_lo[tid] = r3_clock_do; r3_clock_do_hi[tid] = r3_clock_do >> 32;

      asm("bar.sync 0;\n"); // yield
      asm("bar.sync 0;\n"); // wait
    }

    for (; i < 23 + 22; ++i) {
      RT r1_out = r1_out_lo[tid] | RT(r1_out_hi[tid]) << 32;
      RT r2_out = r2_out_lo[tid] | RT(r2_out_hi[tid]) << 32;
      RT r3_out = r3_out_lo[tid] | RT(r3_out_hi[tid]) << 32;
      RT res = (r1_out ^ r2_out ^ r3_out ^ roundfunc[i]) & DP | r2_out & ~DP;

      result[i * 32 + tid] = res;

      RT r1c = r1_clock_lo[tid] | RT(r1_clock_hi[tid]) << 32;
      RT r2c = r2_clock_lo[tid] | RT(r2_clock_hi[tid]) << 32;
      RT r3c = r3_clock_lo[tid] | RT(r3_clock_hi[tid]) << 32;

      RT r1r2c = ~( r1c ^ r2c);
      RT r1r3c = ~( r1c ^ r3c);
      RT r2r3c = ~( r2c ^ r3c);

      RT r1_clock_do = (r1r2c | r1r3c) & DP;
      RT r2_clock_do = r1r2c | r2r3c | ~DP;
      RT r3_clock_do = r1r3c | r2r3c;

      r1_clock_do_lo[tid] = r1_clock_do;
      r2_clock_do_lo[tid] = r2_clock_do;
      r3_clock_do_lo[tid] = r3_clock_do;
      r1_clock_do_hi[tid] = r1_clock_do >> 32;
      r2_clock_do_hi[tid] = r2_clock_do >> 32;
      r3_clock_do_hi[tid] = r3_clock_do >> 32;

      asm("bar.sync 0;\n");
      asm("bar.sync 0;\n");
    }

    for (; i < 63; ++i) {
      RT r1_out = r1_out_lo[tid] | RT(r1_out_hi[tid]) << 32;
      RT r2_out = r2_out_lo[tid] | RT(r2_out_hi[tid]) << 32;
      RT r3_out = r3_out_lo[tid] | RT(r3_out_hi[tid]) << 32;
      RT res = (r1_out ^ r2_out ^ r3_out ^ roundfunc[i]) & DP | r1_out & ~DP;

      result[i * 32 + tid] = res;

      RT r1c = r1_clock_lo[tid] | RT(r1_clock_hi[tid]) << 32;
      RT r2c = r2_clock_lo[tid] | RT(r2_clock_hi[tid]) << 32;
      RT r3c = r3_clock_lo[tid] | RT(r3_clock_hi[tid]) << 32;

      RT r1r2c = ~( r1c ^ r2c);
      RT r1r3c = ~( r1c ^ r3c);
      RT r2r3c = ~( r2c ^ r3c);

      RT r1_clock_do = r1r2c | r1r3c | ~DP;
      RT r2_clock_do = r1r2c | r2r3c;
      RT r3_clock_do = r1r3c | r2r3c;

      r1_clock_do_lo[tid] = r1_clock_do;
      r2_clock_do_lo[tid] = r2_clock_do;
      r3_clock_do_lo[tid] = r3_clock_do;
      r1_clock_do_hi[tid] = r1_clock_do >> 32;
      r2_clock_do_hi[tid] = r2_clock_do >> 32;
      r3_clock_do_hi[tid] = r3_clock_do >> 32;

      asm("bar.sync 0;\n");
      asm("bar.sync 0;\n");
    }

    RT r1_out = r1_out_lo[tid] | RT(r1_out_hi[tid]) << 32;
    RT r2_out = r2_out_lo[tid] | RT(r2_out_hi[tid]) << 32;
    RT r3_out = r3_out_lo[tid] | RT(r3_out_hi[tid]) << 32;
    RT res = (r1_out ^ r2_out ^ r3_out ^ roundfunc[i]) & DP | r1_out & ~DP;

    result[i * 32 + tid] = res;
  }
  //	  if (~new_dp & DP) {
  //	  RT not_new_dp = ~new_dp & DP;
  //	  for (int i = 0; i < 64; ++i) {
  //	    result[i * 32 + tid] = result[i * 32 + tid]
  //	      ^ real_roundfunc[i] & not_new_dp;
  //	  }
  //	  }
}
  }
};

};
} } } } } }

namespace tmto {namespace device { namespace cuda {
template <>
struct select_kernel_implementationalgorithm::A51::implementation::cuda::tag::bitslice {
typedef algorithm::A51::implementation::cuda::kernel::bitslice type;
};
} } }

#endif

int w = threadIdx.x >> 5;

means w = threadIdx.x / 32,

threadIdx.x = 0,1,2,…,31 —> w = 0

threadIdx.x = 32, 33, …, 63 --> w = 1

all threads with the same value w belongs to the same warp, _synch is no need.

__global__ foo() {

int w = threadIdx.x >> 5;

if (w == 1) {

	do sth;

} else if (w == 2) {

 	do sth

} else if (w == 3) {

...

} else {

...

}

this is intended. i do not want to sync all threads of a warp, i want to sync all warps of a block.

the work warp 1 does depends on that done by warp 2 before and then warp 2 uses the work of warp 1

and they ping pong in a loop, communicating via shared memory.

I wanted to share the solution i found with you and google:

there seems to be some kind of timeout built into the bar.sync instruction. when i increased the block count
the time between the bar.sync instruction executions got longer, in part due to the fact that i did
many global memory accesses in the critical parts which gave the different blocks opportunity to slow
down each other. with more bar.sync instructions (one more right
in the middle of the instruction block that reads from global memory) everything works fine now, although it is
obviously still dependent on the actual hardware timing and implementation details.