Wrong unsigned to signed conversion Problem with types conversion leads to incorrect kernel's re

PapaZol · February 6, 2009, 7:36pm

The following kernel

[codebox]//cusolver_int.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

gives wrong result stored in a global array cuDev.u1 for the blockIdx.x = 32768 (blockDim.x = 42153).

However declaring

[codebox]//cusolver_uint.csm

const unsigned int k = blockIdx.x + 1;

[/codebox]

starts to produce correct results. Decuda files for both versions are given at the end of the post (I’ve failed to attach them). From my point of view the problem is caused by a strange splitting “int k” into two 16-bit numbers which are not able to store the value 32768. This splitting is clearly visible in the corresponding decuda file.

Moreover the compiler’'s behaviour is quite unpredictable. If the kernel is like this

[codebox]//cusolver_while.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

if(k <= 0)

while(1);

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

the results are also correct. However

[codebox]//cusolver_wo_while.csm

global void ElasticSurf3D()

{

const int k = blockIdx.x + 1;

if(k <= 0);

cuDev.u1[k * 9 + threadIdx.x] = cuDev.nei[8*(blockIdx.x+1)+4];

}

[/codebox]

leads to the same splitting and wrong results.

Can it be considered as a compiler bug? If so how it is possible to detect such cases.

Thanks in advance.

[codebox]cusolver_int.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

mov.half.b16 $r0.hi, s[0x000c]

add.half.b32 $r2, $r1, c0[0x0028]

cvt.u32.u16 $r1, $r0.lo

mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r1, $r0

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

add.b32 $r1, $r1, 0x00000024

cvt.rn.f32.s32 $r0, $r0

mov.end.u32 g[$r1], $r0

}

[/codebox]

[codebox]cusolver_uint.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0028]

add.b32 $r1, $r1, 0x00000030

cvt.u32.u16 $r0, $r0.lo

mov.u32 $r1, g[$r1]

mad24.lo.u32.u16.u16 $r0, s[0x000c], 0x0009// (No operand 4 in this instruction)

shl.u32 $r0, $r0, 0x00000002

add.u32 $r0, $r0, c0[0x0008]

add.b32 $r2, $r0, 0x00000024

cvt.rn.f32.s32 $r0, $r1

mov.end.u32 g[$r2], $r0

#.constseg 1:0x0000 const

#{

#d.32 0x00000009 // 0000

#}

}

[/codebox]

[codebox]cusolver_while.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

cvt.u32.u16 $r1, s[0x000c]

add.b32 $r2, $r1, 0x00000001

set.gt.s32 $p0|$o127, $r2, $r60// (unk0 00400000)

@$p0.ne bra.label label1

label0: bra.label label0

label1: shl.u32 $r2, $r1, 0x00000005

add.u32 $r2, $r2, c0[0x0028]

mul24.lo.s32 $r1, $r1, 0x00000009

cvt.u32.u16 $r0, $r0.lo

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r0, $r1

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

cvt.rn.f32.s32 $r0, $r0

add.b32 $r1, $r1, 0x00000024

mov.end.u32 g[$r1], $r0

}

[/codebox]

[codebox]cusolver_wo_while.csm

// Disassembling _Z13ElasticSurf3Dv (1)

.entry _Z13ElasticSurf3Dv

{

.lmem 0

.smem 0

.reg 3

.bar 0

mul24.lo.u32.u16.u16 $r1, s[0x000c], 0x0008

shl.u32 $r1, $r1, 0x00000002

mov.half.b16 $r0.hi, s[0x000c]

add.half.b32 $r2, $r1, c0[0x0028]

cvt.u32.u16 $r1, $r0.lo

mul24.lo.s32.s16.s16 $r64, $r0.hi, 0x0009

add.b32 $r2, $r2, 0x00000030

add.u32 $r1, $r1, $r0

mov.u32 $r0, g[$r2]

shl.u32 $r1, $r1, 0x00000002

add.u32 $r1, $r1, c0[0x0008]

add.b32 $r1, $r1, 0x00000024

cvt.rn.f32.s32 $r0, $r0

mov.end.u32 g[$r1], $r0

}

[/codebox]

PapaZol · February 13, 2009, 5:27pm

I succeeded to extract the bug to a small program.

Here it is:

[codebox]

#include <stdio.h>

/* Sample CUDA code demonstrating wrong type conversion.

Copyrignt Denis Sabitov, Anatoly Vershinin

*/

#define SIZEMAS 42000

global void ElasticSurf3D(float* u1, const float* nei)

{

const int k = blockIdx.x + 1;

u1[k * 9 + threadIdx.x] = nei[8*(blockIdx.x+1)+4];

}

int main(int argc, char* argv)

{

float *u, *nei;

float *dev_u, *dev_nei;

u = (float *) malloc((SIZEMAS + 1) * 9 * sizeof(float));

nei = (float *) malloc((SIZEMAS + 1) * 8 * sizeof(float));

for (int i = 0; i < (SIZEMAS + 1) * 9; i++) {

	u[i] = 0.0f;

}

for (int i = 0; i <= SIZEMAS; i++) {

	nei[i*8+4] = (float) i;

}

cudaMalloc((void**) &dev_u, (SIZEMAS + 1) * 9 * sizeof(float));

cudaMalloc((void**) &dev_nei, (SIZEMAS + 1) * 8 * sizeof(float));

cudaMemcpy(dev_u, u, (SIZEMAS + 1) * 9 * sizeof(float), cudaMemcpyHostToDevice);

cudaMemcpy(dev_nei, nei, (SIZEMAS + 1) * 8 * sizeof(float), cudaMemcpyHostToDevice);

ElasticSurf3D<<<SIZEMAS, 1>>>(dev_u, dev_nei);

cudaThreadSynchronize();

cudaMemcpy(u, dev_u, (SIZEMAS + 1) * 9 * sizeof(float), cudaMemcpyDeviceToHost);

for (int i = 1; i < SIZEMAS; i++) {

	if (fabs(u[9*i] - nei[i*8+4]) > 1e-4) {

		printf("Wrong! i = %d Host = %g Cuda = %g PreviousCuda = %g NextCuda = %g\n", i, nei[i*8+4], u[9*i], u[9*(i-1)], u[9*(i+1)]);

		fflush(stdout);

		exit(1);

	}

}

printf("Success!\n");

cudaFree(dev_u);

cudaFree(dev_nei);

free(u);

free(nei);

return 0;

}

[/codebox]

Making SIZEMAS less than 2^16 removes the error.

seibert · February 15, 2009, 12:57am

From the programming guide, Section A.1.1:

“The maximum size of each dimension of a grid of thread blocks is 65535.”

Also, I hope you use a block size greater than 1 in your actual program, or you are wasting 97% of your GPU processing power.

PapaZol · February 16, 2009, 6:15pm

Yes, I mistyped, thank you. One should make SIZEMAS less than 2^15 to remove the error. In my example SIZEMAS is set to 42000 that is definitely less than 65535 but the problem exists.

This is simply the sample of code which exibits the strange behaviour of the program. Please note that for emulation mode everything works fine.

Topic		Replies	Views
Problems in 32-bit ---> 64-bit conversion Kernel idx goes mad, known issue? CUDA Programming and Performance	14	1540	January 24, 2011
nvcc compiler bug CUDA Programming and Performance	5	1266	July 15, 2010
Bizarre type promotion Pretending int is a uint CUDA Programming and Performance	8	3617	September 26, 2008
BUG: Possible shared state in CUDA Toolkit Internals - Makes size_t 64-bit or 32-bit CUDA Programming and Performance	4	745	March 29, 2017
problem with bigger than 32768-size grids CUDA bug? CUDA Programming and Performance	9	6677	January 28, 2009
Mixing Types CUDA Programming and Performance	16	2409	August 18, 2011
Error with types; CUDA Programming and Performance	1	646	November 21, 2014
Very strange behaviour. Maybe a bug...? Kernel fails to run strangely, but no errors are reported. CUDA Programming and Performance	5	1128	May 13, 2009
Parameter transfer to device wrong? CUDA Programming and Performance	3	7708	May 14, 2009
Problem Lanuching Simple Kernel CUDA Programming and Performance	14	2657	August 15, 2015

Wrong unsigned to signed conversion Problem with types conversion leads to incorrect kernel's re

Related topics