Bug in type promotion

I’ve been trying to find a workaround for this bug that doesn’t involve dropping down to PTX, since there is no inline PTX and this is a fairly complicated set of kernels I don’t want to hand-code.

uint128 is a struct of two uint64s, where uint64 is typedef’d to unsigned long long.

uint32 is a typedef for unsigned int.
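Concretely, the layout is along these lines (the self-contained repro further down uses u_int32_t/u_int64_t for the typedefs; x holds the low 64 bits and y the high 64 bits):

typedef unsigned int       uint32;
typedef unsigned long long uint64;

typedef struct __align__(16) {
        uint64 x, y;    /* x = low 64 bits, y = high 64 bits of the 128-bit value */
} uint128;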

static __inline__ __device__ uint32 cump128_mod_ui(uint128 a, uint32 p)
{
        uint64 ret;

        ret = a.y % p;
        ret = ((ret << 32) | (a.x >> 32)) % p;
        ret = ((ret << 32) | (a.x & 0xffffffff)) % p;
        return ret;
}

and

static __inline__ __device__ uint32 cump128_mod_ui(uint128 a, uint32 p)
{
        uint64 ret;

        ret = a.y % p;
        ret = ((ret << 32) | (a.x >> 32)) % p;
        ret = ((ret << 32) | ((uint32) a.x)) % p;
        return ret;
}

don’t work, at least on the device. With deviceemu (which, from what I can figure, is nothing more than a passthrough to g++ with one pthread per GPU thread; also, thank you for the forkbomb this creates when running high-thread-count kernels) the results come out correct.

I’ve also tried shifting a.x up and then back down. The result breaks in the last line of the block of ret = statements.
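To be explicit, the shift-up-then-down variant I mean looks roughly like this (reconstructed from memory, so treat it as a sketch):

static __inline__ __device__ uint32 cump128_mod_ui(uint128 a, uint32 p)
{
        uint64 ret;

        ret = a.y % p;
        ret = ((ret << 32) | (a.x >> 32)) % p;
        ret = ((ret << 32) | ((a.x << 32) >> 32)) % p;  /* shift up then down to isolate the low 32 bits */
        return ret;
}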

-Patrick

Here’s the PTX:

        cvt.u64.u32     %rd18, %r14;            //
        cvt.u32.u64     %r15, %rd13;            //
        cvt.u64.u32     %rd19, %r15;            //
        shr.u64         %rd20, %rd13, 32;       //
        rem.u64         %rd21, %rd17, %rd18;    //
        shl.b64         %rd22, %rd21, 32;       //
        or.b64          %rd23, %rd20, %rd22;    //
        rem.u64         %rd24, %rd23, %rd18;    //
        shl.b64         %rd25, %rd24, 32;       //
        or.b64          %rd26, %rd19, %rd25;    //
        rem.u64         %rd27, %rd26, %rd18;    //
        cvt.u32.u64     %r16, %rd27;            //

So a bit more experimentation yields another compiler bug:

Basically, in the prior examples, the generated device code refuses to calculate the remainder for the last step and returns the unreduced value (even though the PTX indicates it should).
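For what it’s worth, a host-side helper with that last % p deliberately left out (the helper name is mine, purely for checking) should make it easy to confirm that the GPU is handing back exactly the unreduced, truncated value:

/* Check helper: same steps as cump128_mod_ui, but with the final "% p"
 * intentionally omitted, for comparison against what the GPU returns. */
static uint32 cump128_mod_ui_unreduced(uint128 a, uint32 p)
{
        uint64 ret;

        ret = a.y % p;
        ret = ((ret << 32) | (a.x >> 32)) % p;
        ret = (ret << 32) | (a.x & 0xffffffff);  /* final % p missing on purpose */
        return (uint32) ret;                     /* truncated to 32 bits, as on return */
}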

If I add a few more ret = ret % p; statements for fun, the PTX yields:

        rem.u64         %rd27, %rd26, %rd18;    //
        rem.u64         %rd28, %rd27, %rd18;    //
        rem.u64         %rd29, %rd28, %rd18;    //
        rem.u64         %rd30, %rd29, %rd18;    //
        .loc    15      129     0
        bar.sync        0;                      //
        mov.u64         %rd31, __cuda_sh_tests8;        //

sh_tests is an array of 32-bit unsigned ints, so this creates a race where the odd threads in the block clobber the even threads’ values. This is nothing like what the C code is supposed to do.

Any thoughts, anyone?

If you can post a simple self-contained test program, I can file a bug here. Thanks!

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

#define SAMP_SIZE (512 * 512)

typedef u_int32_t uint32;
typedef u_int64_t uint64;

typedef struct __align__(16) {
        uint64 x, y;
} uint128;

static __inline__ __host__ __device__ uint32 cump128_mod_ui(uint128 a, uint32 p)
{
        uint64 ret;

        ret = a.y % p;
        ret = ((ret << 32) | (a.x >> 32)) % p;
        ret = ((ret << 32) | ((uint32) a.x)) % p;
        return ret;
}

__global__ void repro_kernel(uint128 *src1, uint32 *src2, uint32 *dst)
{
        uint128 a;
        uint32 b, c, idx;

        idx = blockIdx.x * blockDim.x + threadIdx.x;
        a = src1[idx];
        b = src2[idx];
        c = cump128_mod_ui(a, b);
        dst[idx] = c;
        return;
}

int main(void)
{
        uint128 *h_src1, *d_src1;
        uint32 i, *h_src2, *d_src2, *h_dst, *d_dst, *h_tst;

        cudaMallocHost((void **) &h_src1, SAMP_SIZE * sizeof(uint128));
        cudaMalloc((void **) &d_src1, SAMP_SIZE * sizeof(uint128));
        cudaMallocHost((void **) &h_src2, SAMP_SIZE * sizeof(uint32));
        cudaMalloc((void **) &d_src2, SAMP_SIZE * sizeof(uint32));
        cudaMallocHost((void **) &h_dst, SAMP_SIZE * sizeof(uint128));
        cudaMalloc((void **) &d_dst, SAMP_SIZE * sizeof(uint32));
        cudaMallocHost((void **) &h_tst, SAMP_SIZE * sizeof(uint128));

        /* Fill the inputs with pseudo-random 128-bit values and small moduli. */
        srandom(5);
        for(i = 0; i < SAMP_SIZE; i++)
        {
                h_src1[i].x = ((uint64) random() << 32) | random();
                h_src1[i].y = ((uint64) random() << 32) | random();
                h_src2[i] = (random() & 0xffff) + 1;
        }

        cudaMemcpy(d_src1, h_src1, SAMP_SIZE * sizeof(uint128),
                        cudaMemcpyHostToDevice);
        cudaMemcpy(d_src2, h_src2, SAMP_SIZE * sizeof(uint32),
                        cudaMemcpyHostToDevice);

        repro_kernel<<<SAMP_SIZE / 512, 512>>>(d_src1, d_src2, d_dst);
        cudaThreadSynchronize();

        cudaMemcpy(h_dst, d_dst, SAMP_SIZE * sizeof(uint32),
                        cudaMemcpyDeviceToHost);

        /* Compute the same reductions on the host and compare. */
        for(i = 0; i < SAMP_SIZE; i++)
                h_tst[i] = cump128_mod_ui(h_src1[i], h_src2[i]);

        for(i = 0; i < SAMP_SIZE; i++)
        {
                if(h_tst[i] != h_dst[i])
                {
                        printf("DISCREPANCY AT: %u host: %u gpu: %u\n",
                                        i, h_tst[i], h_dst[i]);
                }
        }

        return 0;
}

[pstach@beast ~]$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Thu_Jun_19_03:38:28_PDT_2008
Cuda compilation tools, release 2.0, V0.2.1221

[pstach@beast ~]$ nvcc -deviceemu -O3 -o repro repro.cu
[pstach@beast ~]$ ./repro |head

[pstach@beast ~]$ nvcc -O3 -o repro repro.cu
[pstach@beast ~]$ ./repro |head
DISCREPANCY AT: 0 host: 9414 gpu: 99605327
DISCREPANCY AT: 1 host: 42546 gpu: 602511920
DISCREPANCY AT: 2 host: 20224 gpu: 589806848
DISCREPANCY AT: 3 host: 29924 gpu: 780437020
DISCREPANCY AT: 4 host: 29089 gpu: 990569005
DISCREPANCY AT: 5 host: 1503 gpu: 2072822815
DISCREPANCY AT: 6 host: 5711 gpu: 1392349175
DISCREPANCY AT: 7 host: 15718 gpu: 1180618940
DISCREPANCY AT: 8 host: 38073 gpu: 1347951473
DISCREPANCY AT: 9 host: 271 gpu: 280271323

Extra information that doesn’t really have much of an effect:

GTX 280, driver rev 177.67, CentOS 5.2 x86_64

Thanks for reporting this, it does seem to be a real bug in the optimizer. I’ll let you know as soon as it’s fixed.