Hello,
This kernel was generated for compute 2.0 compatibility and thus I think it should be forward compatible with higher compute capabilities, for example GTX 970.
However users of this kernel are reporting a launch failure when dimensions are:
BlockWidth: 1024
BlockHeight: 1
BlockDepth: 1
GridWidth: 13691407
GridHeight: 1
GridDepth: 1
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Sat Mar 15 02:52:22 2014 (1394848342)
// Cuda compilation tools, release 6.0, V6.0.1
//
.version 4.0
.target sm_20
.address_size 32
.visible .entry KernelBandwidth(
.param .u32 KernelBandwidth_param_0,
.param .u32 KernelBandwidth_param_1
)
{
.reg .pred %p<2>;
.reg .s32 %r<24>;
.reg .f32 %f<2>;
ld.param.u32 %r2, [KernelBandwidth_param_0];
ld.param.u32 %r3, [KernelBandwidth_param_1];
cvta.to.global.u32 %r1, %r2;
mov.u32 %r4, %ctaid.z;
mov.u32 %r5, %nctaid.y;
mov.u32 %r6, %ctaid.y;
mad.lo.s32 %r7, %r4, %r5, %r6;
mov.u32 %r8, %nctaid.x;
mov.u32 %r9, %ctaid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;
mov.u32 %r11, %ntid.z;
mov.u32 %r12, %tid.z;
mad.lo.s32 %r13, %r10, %r11, %r12;
mov.u32 %r14, %ntid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r16, %r13, %r14, %r15;
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %tid.x;
mad.lo.s32 %r19, %r16, %r17, %r18;
rem.s32 %r20, %r19, %r3;
shl.b32 %r21, %r20, 4;
add.s32 %r22, %r1, %r21;
ld.global.f32 %f1, [%r22];
// ld.volatile.global.f32 %f1, [%r22];
// st.global.f32 [%r22], %f1;
ret;
}
My application uses the driver api, so the driver should jit compile this PTX to something that can execute on GTX 970. This seems to fail ? Thus I consider this a “compatibility bug”.
Are bugs like these going to be fixed some day ?!?
Recompiling this kernel to higher compute capability for example 3.5 would give different looking ptx:
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Sat Mar 15 02:52:22 2014 (1394848342)
// Cuda compilation tools, release 6.0, V6.0.1
//
.version 4.0
.target sm_35
.address_size 32
.weak .func (.param .b32 func_retval0) cudaMalloc(
.param .b32 cudaMalloc_param_0,
.param .b32 cudaMalloc_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
.weak .func (.param .b32 func_retval0) cudaFuncGetAttributes(
.param .b32 cudaFuncGetAttributes_param_0,
.param .b32 cudaFuncGetAttributes_param_1
)
{
.reg .s32 %r<2>;
mov.u32 %r1, 30;
st.param.b32 [func_retval0+0], %r1;
ret;
}
.visible .entry KernelBandwidth(
.param .u32 KernelBandwidth_param_0,
.param .u32 KernelBandwidth_param_1
)
{
.reg .pred %p<2>;
.reg .s32 %r<24>;
.reg .f32 %f<2>;
ld.param.u32 %r2, [KernelBandwidth_param_0];
ld.param.u32 %r3, [KernelBandwidth_param_1];
cvta.to.global.u32 %r1, %r2;
mov.u32 %r4, %ctaid.z;
mov.u32 %r5, %nctaid.y;
mov.u32 %r6, %ctaid.y;
mad.lo.s32 %r7, %r4, %r5, %r6;
mov.u32 %r8, %nctaid.x;
mov.u32 %r9, %ctaid.x;
mad.lo.s32 %r10, %r7, %r8, %r9;
mov.u32 %r11, %ntid.z;
mov.u32 %r12, %tid.z;
mad.lo.s32 %r13, %r10, %r11, %r12;
mov.u32 %r14, %ntid.y;
mov.u32 %r15, %tid.y;
mad.lo.s32 %r16, %r13, %r14, %r15;
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %tid.x;
mad.lo.s32 %r19, %r16, %r17, %r18;
rem.s32 %r20, %r19, %r3;
shl.b32 %r21, %r20, 4;
add.s32 %r22, %r1, %r21;
ld.global.f32 %f1, [%r22];
ret;
}
The difference is minor… anyway my application should not require a re-compile of kernels ?!?
For now I have no choice and will have to offer multiple PTX versions so users can run it.