Consider the following very simple CUDA C++ code with a divergent virtual function call:
struct X {
    virtual __device__ void f();
};

__global__ void test(X **x) {
    x[threadIdx.x]->f();
}
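To spell out why I consider this call divergent: nothing in the snippet fixes the dynamic type of each element of x, so neighboring threads may fetch different function pointers from different vtables. A hypothetical setup (the subclasses A and B are my own addition, purely for illustration):

    struct A : X { __device__ void f() override { /* ... */ } };
    struct B : X { __device__ void f() override { /* ... */ } };
    // If x[] holds a mix of A and B objects, adjacent threads in test()
    // load different call targets, so the call can diverge within a warp.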
Compiling this via NVCC on CUDA 11.1 yields PTX code containing a non-divergent call instruction, call.uni. I am curious to understand why this is the case. The PTX specification seems to indicate that the plain call instruction (without the .uni suffix) is needed whenever the call target may vary within a warp.
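For reference, here is what I expected to see instead, based on my reading of the ISA document: the same call sequence, but using the potentially divergent form of the instruction. This is my own sketch, not actual compiler output:

    // Sketch only: plain 'call' without the .uni suffix, which the
    // spec describes as the potentially divergent form of the call.
    call %rd7, (param0), prototype_0;

The full PTX that NVCC actually produced is below: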
.version 7.1
.target sm_75
.address_size 64

// .globl _Z4testPP1X

.visible .entry _Z4testPP1X(
    .param .u64 _Z4testPP1X_param_0
)
{
    .reg .b32 %r<2>;
    .reg .b64 %rd<8>;

    ld.param.u64 %rd1, [_Z4testPP1X_param_0];
    cvta.to.global.u64 %rd2, %rd1;
    mov.u32 %r1, %tid.x;
    mul.wide.u32 %rd3, %r1, 8;
    add.s64 %rd4, %rd2, %rd3;
    ld.global.u64 %rd5, [%rd4];
    ld.u64 %rd6, [%rd5];
    ld.u64 %rd7, [%rd6];
    // Callseq Start 0
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b64 param0;
    st.param.b64 [param0+0], %rd5;
    prototype_0 : .callprototype ()_ (.param .b64 _) ;
    call.uni
    %rd7,
    (
    param0
    )
    , prototype_0;
    //{
    }// Callseq End 0
    ret;
}
Thanks,
Wenzel