The following kernel
[codebox]//cusolver_int.csm
global void ElasticSurf3D()
{
const int k = blockIdx.x + 1;
cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];
}
[/codebox]
gives wrong result stored in a global array cuDev.u1 for the blockIdx.x = 32768 (blockDim.x = 42153).
However declaring
[codebox]//cusolver_uint.csm
const unsigned int k = blockIdx.x + 1;
[/codebox]
starts to produce correct results. Decuda files for both versions are given at the end of the post (I’ve failed to attach them). From my point of view the problem is caused by a strange splitting “int k” into two 16-bit numbers which are not able to store the value 32768. This splitting is clearly visible in the corresponding decuda file.
Moreover the compiler’'s behaviour is quite unpredictable. If the kernel is like this
[codebox]//cusolver_while.csm
global void ElasticSurf3D()
{
const int k = blockIdx.x + 1;
if(k <= 0)
while(1);
cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];
}
[/codebox]
the results are also correct. However
[codebox]//cusolver_wo_while.csm
global void ElasticSurf3D()
{
const int k = blockIdx.x + 1;
if(k <= 0);
cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];
}
[/codebox]
leads to the same splitting and wrong results.
Can it be considered as a compiler bug? If so how it is possible to detect such cases.
Thanks in advance.
[codebox]cusolver_int.csm
// Disassembling _Z13ElasticSurf3Dv (1)
.entry _Z13ElasticSurf3Dv
{
.lmem 0
.smem 0
.reg 3
.bar 0
mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008
shl.u32 $r1, $r1, 0x00000002
mov.half.b16 $r0.hi, s[0x000c]
add.half.b32 $r2, $r1, c0[0x0028]
cvt.u32.u16 $r1, $r0.lo
mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009
add.b32 $r2, $r2, 0x00000030
add.u32 $r1, $r1, $r0
mov.u32 $r0, g[$r2]
shl.u32 $r1, $r1, 0x00000002
add.u32 $r1, $r1, c0[0x0008]
add.b32 $r1, $r1, 0x00000024
cvt.rn.f32.s32 $r0, $r0
mov.end.u32 g[$r1], $r0
}
[/codebox]
[codebox]cusolver_uint.csm
// Disassembling _Z13ElasticSurf3Dv (1)
.entry _Z13ElasticSurf3Dv
{
.lmem 0
.smem 0
.reg 3
.bar 0
mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008
shl.u32 $r1, $r1, 0x00000002
add.u32 $r1, $r1, c0[0x0028]
add.b32 $r1, $r1, 0x00000030
cvt.u32.u16 $r0, $r0.lo
mov.u32 $r1, g[$r1]
mad24.lo.u32.u16.u16 $r0, s[0x000c], 0x0009// (No operand 4 in this instruction)
shl.u32 $r0, $r0, 0x00000002
add.u32 $r0, $r0, c0[0x0008]
add.b32 $r2, $r0, 0x00000024
cvt.rn.f32.s32 $r0, $r1
mov.end.u32 g[$r2], $r0
#.constseg 1:0x0000 const
#{
#d.32 0x00000009 // 0000
#}
}
[/codebox]
[codebox]cusolver_while.csm
// Disassembling _Z13ElasticSurf3Dv (1)
.entry _Z13ElasticSurf3Dv
{
.lmem 0
.smem 0
.reg 3
.bar 0
cvt.u32.u16 $r1, s[0x000c]
add.b32 $r2, $r1, 0x00000001
set.gt.s32 $p0|$o127, $r2, $r60// (unk0 00400000)
@$p0.ne bra.label label1
label0: bra.label label0
label1: shl.u32 $r2, $r1, 0x00000005
add.u32 $r2, $r2, c0[0x0028]
mul24.lo.s32 $r1, $r1, 0x00000009
cvt.u32.u16 $r0, $r0.lo
add.b32 $r2, $r2, 0x00000030
add.u32 $r1, $r0, $r1
mov.u32 $r0, g[$r2]
shl.u32 $r1, $r1, 0x00000002
add.u32 $r1, $r1, c0[0x0008]
cvt.rn.f32.s32 $r0, $r0
add.b32 $r1, $r1, 0x00000024
mov.end.u32 g[$r1], $r0
}
[/codebox]
[codebox]cusolver_wo_while.csm
// Disassembling _Z13ElasticSurf3Dv (1)
.entry _Z13ElasticSurf3Dv
{
.lmem 0
.smem 0
.reg 3
.bar 0
mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008
shl.u32 $r1, $r1, 0x00000002
mov.half.b16 $r0.hi, s[0x000c]
add.half.b32 $r2, $r1, c0[0x0028]
cvt.u32.u16 $r1, $r0.lo
mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009
add.b32 $r2, $r2, 0x00000030
add.u32 $r1, $r1, $r0
mov.u32 $r0, g[$r2]
shl.u32 $r1, $r1, 0x00000002
add.u32 $r1, $r1, c0[0x0008]
add.b32 $r1, $r1, 0x00000024
cvt.rn.f32.s32 $r0, $r0
mov.end.u32 g[$r1], $r0
}
[/codebox]