Here is a complete example of warp divergence. This uses Ocelot to execute the program rather than CUDA (there may be differences with how it is implemented in NVIDIA GPUs, but at least this is how it is understood to happen in the academic world)
Take the following very simple example:
int a = 2;
if(threadIdx.x > 1)
{
int b = a + 1;
if(threadIdx.x > 0)
{
}
}
Assume that this is compiled with no optimization, and evaluated in a straightforward manner. You might get a binary that resembles this:
.version 1.4
.entry warpTest()
{
.reg .u32 %r<10>;
.reg .u32 %tid;
.reg .pred %pred;
Entry:
mov.u32 %r0, 2;
mov.u32 %r2, 1;
cvt.u32.u16 %tid, %tid.x;
setp.ge.u32 %pred, %tid, %r0;
@%pred bra Exit;
add.u32 %r1, %r0, 1;
setp.ge.u32 %pred, %tid, %r2;
@%pred bra Exit;
Exit:
exit;
}
Now assume that you ran this example with one warp, with only four threads active.
This produces the following execution trace which indicates how each instruction is executed.
Running kernel warpTest
[0] - mov.u32 %r0, 2 - [1111]
[1] - mov.u32 %r1, 1 - [1111]
[2] - cvt.u32.u16 %r2, %tid.x - [1111]
[3] - setp.ge.u32 %p3, %r2, %r0 - [1111]
[4] - @%p3 bra $BB_1_3 - [1111]
taken: [1100]
fall-through: [0011]
reconverge: [1111]
[5] - add.u32 %r4, %r0, 1 - [0011]
[6] - setp.ge.u32 %p3, %r2, %r1 - [0011]
[7] - @%p3 bra $BB_1_3 - [0011]
taken: [0010]
fall-through: [0001]
reconverge: [0011]
[8] - reconverge - [0001]
[8] - reconverge - [0010]
[9] - reconverge - [0011]
[8] - reconverge - [1100]
[10] - exit - [1111]
Finished kernel
In this example, the predicate mask [xxxx] indicates which threads execute the instruction. A mask of [1100] indicates that the first two threads execute the instruction, but not the second two. In this example, the first two threads skip the if-then-else completely, while the second two go into the ‘then’ path. From there, only the last thread falls into the ‘then’ path of the second if-then-else. The if-then-else gets compiled into a series of conditional branches, which are evaluated differently for different threads. When this happens, three contexts are created, a fallthrough, a target, and a reconverge, whose execution is serialized. When the target and fallthrough contexts have reached the reconverge point (some post-dominator of the branch) execution switches back to the reconverge context, which has the same threads that were active before diverging.
The source code for this example is attached in case you want to download it and experiment yourself:
#include <ocelot/api/interface/ocelot.h>
#include <ocelot/cuda/interface/cuda_runtime.h>
#include <ocelot/trace/interface/TraceGenerator.h>
#include <ocelot/trace/interface/TraceEvent.h>
#include <ocelot/executive/interface/ExecutableKernel.h>
#include <iostream>
class MaskGenerator : public trace::TraceGenerator
{
public:
virtual void initialize(const executive::ExecutableKernel& kernel)
{
std::cout << "Running kernel " << kernel.name << "\n";
}
virtual void event(const trace::TraceEvent & event)
{
std::string string;
boost::to_string(event.active, string);
std::cout << " [" << event.PC << "] - "
<< event.instruction->toString() << " - ["
<< string << "]\n";
if(event.instruction->opcode == ir::PTXInstruction::Bra)
{
boost::to_string(event.taken, string);
std::cout << " taken: [" << string << "]\n";
boost::to_string(event.fallthrough, string);
std::cout << " fall-through: [" << string << "]\n";
boost::to_string(event.active, string);
std::cout << " reconverge: [" << string << "]\n";
}
}
virtual void finish()
{
std::cout << "Finished kernel\n";
}
};
int main(int argc, char** argv)
{
std::stringstream ptx;
ptx << ".version 1.4\n";
ptx << ".entry warpTest()\n";
ptx << "{\n";
ptx << "\t.reg .u32 %r<10>;\n";
ptx << "\t.reg .u32 %tid;\n";
ptx << "\t.reg .pred %pred;\n";
ptx << "\tmov.u32 %r0, 2;\n";
ptx << "\tmov.u32 %r2, 1;\n";
ptx << "\tcvt.u32.u16 %tid, %tid.x;\n";
ptx << "\tsetp.ge.u32 %pred, %tid, %r0;\n";
ptx << "\t@%pred bra Exit;\n";
ptx << "\tadd.u32 %r1, %r0, 1;\n";
ptx << "\tsetp.ge.u32 %pred, %tid, %r2;\n";
ptx << "\t@%pred bra Exit;\n";
ptx << "Exit:\n";
ptx << "\texit;\n";
ptx << "}\n";
MaskGenerator gen;
ocelot::addTraceGenerator(gen);
ocelot::registerPTXModule(ptx, "module");
cudaConfigureCall(dim3(1,1,1), dim3(4, 1, 1), 0, 0);
ocelot::launch("module", "warpTest");
return 0;
}