# divergent branches how to change it?

Hi all,

I recently changed code like this:

``````for (int h = 1; h <= 4; h++) {

int idx_y = max_idx(h, threadIdx.y); // will be -1 for threads with y > n(h)

for (int w = 1 + (h == 1); w <= 4; w++) {

int idx_x = max_idx(w, threadIdx.x); // will be -1 for threads with x > n(w)

if (w == 1) {

// (h \geq 2) ==> (idx_y >= 12)

if (idx_y != -1) {

IndexInstruction instr = idx_instr[idx_y - 12];

max_v[idx_y][idx_x] =

max(max_v[instr.idx1][idx_x],

max_v[instr.idx2][idx_x]);

scan_inner(w, h, max_v[idx_y][idx_x]);

}

} else if (idx_x != -1 && idx_y != -1) {

// w \geq 2

IndexInstruction instr = idx_instr[idx_x - 12];

max_v[idx_y][idx_x] =

max(max_v[idx_y][instr.idx1],

max_v[idx_y][instr.idx2]);

scan_inner(w, h, max_v[idx_y][idx_x]);

}

if (w < 3) {

}

}

}
``````

to this

``````int idx_x, idx_y, read_idx1, read_idx2;

INDEXINSTRUCTION_SET_LOCALS(instrs[tid + 0]);

if (tid < 72) {

scan_inner(2, 1, max_v[idx_y][idx_x]);

}

INDEXINSTRUCTION_SET_LOCALS(instrs[tid + 72]);

if (tid < 48) {

scan_inner(3, 1, max_v[idx_y][idx_x]);

}

else if (tid < 84) {

scan_inner(4, 1, max_v[idx_y][idx_x]);

}

else if (tid < 156) {

scan_inner(1, 2, max_v[idx_y][idx_x]);

}

INDEXINSTRUCTION_SET_LOCALS(instrs[tid + 228]);

if (tid < 36) {

scan_inner(2, 2, max_v[idx_y][idx_x]);

}
``````

where most of the above code is generated (jinja2 + some python backend code)

I was hoping it would keep warps together better since I had a 12x12 block size (now 192 to support the “tid < 156”), but it doesn’t seem to be doing so. Does anyone have ideas?

Thanks,

Nicholas

edit: sorry for the sloppy original post (I changed it a few seconds after; accidentally hit the wrong button). I’m looking at the divergent_branch counter in the profiler. INDEXINSTRUCTION_SET_LOCALS is pretty simple:

``````struct IndexInstruction {

uint32_t v_0;

__device__ inline void get(int *idx_x, int *idx_y, int *read_idx1, int *read_idx2) {

uint32_t tmp_0 = v_0;

tmp_0 >>= 5;

tmp_0 >>= 5;

*idx_y = (tmp_0 & 0x1f);

tmp_0 >>= 5;

*idx_x = (tmp_0 & 0x1f);

}

#define INDEXINSTRUCTION_SET_LOCALS(name) \

};
``````

Sorry again for the crappy original post. I indeed forgot to include a code bit, which I didn’t think would influence the results:

``````__device__ void scan_inner(int w, int h, int max_v) {

atomicAdd(&(cached_bit_bins.by_pattern[w - 1][h - 1].nbits_req[max_v]), 1U);

}
``````

However, the presence of atomicAdd seems to make the threads diverge. Removing it makes the divergence counter go [in one case] from 140k to 7k. Does anyone know a way to work around this? Do I have to avoid using atomicAdd? That seems potentially ugly…

Thanks so much,

Nicholas