Wrong unsigned to signed conversion Problem with types conversion leads to incorrect kernel's re

The following kernel

[codebox]//cusolver_int.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

gives wrong result stored in a global array cuDev.u1 for the blockIdx.x = 32768 (blockDim.x = 42153).

However declaring

[codebox]//cusolver_uint.csm

const unsigned int k = blockIdx.x + 1;

[/codebox]

starts to produce correct results. Decuda files for both versions are given at the end of the post (I’ve failed to attach them). From my point of view the problem is caused by a strange splitting “int k” into two 16-bit numbers which are not able to store the value 32768. This splitting is clearly visible in the corresponding decuda file.

Moreover the compiler’'s behaviour is quite unpredictable. If the kernel is like this

[codebox]//cusolver_while.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

if(k <= 0)

while(1);

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

the results are also correct. However

[codebox]//cusolver_wo_while.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

if(k <= 0);

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

leads to the same splitting and wrong results.

Can it be considered as a compiler bug? If so how it is possible to detect such cases.

Thanks in advance.

[codebox]cusolver_int.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

mov.half.b16 $r0.hi, s[0x000c]

add.half.b32 $r2, $r1, c0[0x0028]

cvt.u32.u16 $r1, $r0.lo

mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r1, $r0

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

add.b32 $r1, $r1, 0x00000024

cvt.rn.f32.s32 $r0, $r0

mov.end.u32 g[$r1], $r0

}

[/codebox]

[codebox]cusolver_uint.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0028]

add.b32 $r1, $r1, 0x00000030

cvt.u32.u16 $r0, $r0.lo

mov.u32 $r1, g[$r1]

mad24.lo.u32.u16.u16 $r0, s[0x000c], 0x0009// (No operand 4 in this instruction)

shl.u32 $r0, $r0, 0x00000002

add.u32 $r0, $r0, c0[0x0008]

add.b32 $r2, $r0, 0x00000024

cvt.rn.f32.s32 $r0, $r1

mov.end.u32 g[$r2], $r0

#.constseg 1:0x0000 const

#{

#d.32 0x00000009 // 0000

#}

}

[/codebox]

[codebox]cusolver_while.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

cvt.u32.u16 $r1, s[0x000c]

add.b32 $r2, $r1, 0x00000001

set.gt.s32 $p0|$o127, $r2, $r60// (unk0 00400000)

@$p0.ne bra.label label1

label0: bra.label label0

label1: shl.u32 $r2, $r1, 0x00000005

add.u32 $r2, $r2, c0[0x0028]

mul24.lo.s32 $r1, $r1, 0x00000009

cvt.u32.u16 $r0, $r0.lo

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r0, $r1

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

cvt.rn.f32.s32 $r0, $r0

add.b32 $r1, $r1, 0x00000024

mov.end.u32 g[$r1], $r0

}

[/codebox]

[codebox]cusolver_wo_while.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

mov.half.b16 $r0.hi, s[0x000c]

add.half.b32 $r2, $r1, c0[0x0028]

cvt.u32.u16 $r1, $r0.lo

mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r1, $r0

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

add.b32 $r1, $r1, 0x00000024

cvt.rn.f32.s32 $r0, $r0

mov.end.u32 g[$r1], $r0

}

[/codebox]

I succeeded to extract the bug to a small program.

Here it is:

[codebox]

#include <stdio.h>

/* Sample CUDA code demonstrating wrong type conversion.

Copyrignt Denis Sabitov, Anatoly Vershinin

*/

#define SIZEMAS 42000

global void ElasticSurf3D(float* u1, const float* nei)

{

const int k = blockIdx.x + 1;

u1[k * 9 + threadIdx.x] = nei[8*(blockIdx.x+1)+4];

}

int main(int argc, char* argv)

{

float *u, *nei;

float *dev_u, *dev_nei;

u = (float *) malloc((SIZEMAS + 1) * 9 * sizeof(float));

nei = (float *) malloc((SIZEMAS + 1) * 8 * sizeof(float));

for (int i = 0; i < (SIZEMAS + 1) * 9; i++) {

	u[i] = 0.0f;

}

for (int i = 0; i <= SIZEMAS; i++) {

	nei[i*8+4] = (float) i;

}

cudaMalloc((void**) &dev_u, (SIZEMAS + 1) * 9 * sizeof(float));

cudaMalloc((void**) &dev_nei, (SIZEMAS + 1) * 8 * sizeof(float));

cudaMemcpy(dev_u, u, (SIZEMAS + 1) * 9 * sizeof(float), cudaMemcpyHostToDevice);

cudaMemcpy(dev_nei, nei, (SIZEMAS + 1) * 8 * sizeof(float), cudaMemcpyHostToDevice);

ElasticSurf3D<<<SIZEMAS, 1>>>(dev_u, dev_nei);

cudaThreadSynchronize();

cudaMemcpy(u, dev_u, (SIZEMAS + 1) * 9 * sizeof(float), cudaMemcpyDeviceToHost);

for (int i = 1; i < SIZEMAS; i++) {

	if (fabs(u[9*i] - nei[i*8+4]) > 1e-4) {

		printf("Wrong! i = %d Host = %g Cuda = %g PreviousCuda = %g NextCuda = %g\n", i, nei[i*8+4], u[9*i], u[9*(i-1)], u[9*(i+1)]);

		fflush(stdout);

		exit(1);

	}

}

printf("Success!\n");

cudaFree(dev_u);

cudaFree(dev_nei);

free(u);

free(nei);

return 0;

}

[/codebox]

Making SIZEMAS less than 2^16 removes the error.

From the programming guide, Section A.1.1:

“The maximum size of each dimension of a grid of thread blocks is 65535.”

Also, I hope you use a block size greater than 1 in your actual program, or you are wasting 97% of your GPU processing power.

Yes, I mistyped, thank you. One should make SIZEMAS less than 2^15 to remove the error. In my example SIZEMAS is set to 42000 that is definitely less than 65535 but the problem exists.

This is simply the sample of code which exibits the strange behaviour of the program. Please note that for emulation mode everything works fine.