Addressing bug on sm_2x

Hi, while debugging a utility kernel that started to fail with CUDA 4.2 (and still fails with 5.0), I discovered that the addressing calculations used when adding a negative value to an address are bugged.

For the kernel embedded in the attached example, the following assembly code is produced on a GT 610, according to the disassembly from cuda-gdb:

0x1219f30 :	MOV R1, c [0x1] [0x100]
   0x1219f38 :	NOP CC.T
   0x1219f40 :	I2I.U16.U32 R0, c [0x0] [0x8]
   0x1219f48 :	S2R R3, SR_CTAid_X
   0x1219f50 :	I2I.U32.U16 R2, R0
   0x1219f58 :	S2R R4, SR_Tid_X
   0x1219f60 :	I2I.U32.U16 R0, R3
   0x1219f68 :	I2I.U16.U32 R3, c [0x0] [0x14]
   0x1219f70 :	IMAD.U32.U32 R0, R2, R0, R4
   0x1219f78 :	I2I.U32.U16 R3, R3
   0x1219f80 :	ISETP.GE.U32.AND P0, pt, R0, 0x5, pt
   0x1219f88 :	IMUL.U32.U32 R2, R3, R2
   0x1219f90 :	@P0 EXIT
   0x1219f98 :	SHL.W R4, R0, 0x2
   0x1219fa0 :	I2I.U32.U32 R3, -R4
   0x1219fa8 :	IADD R0, R0, R2
   0x1219fb0 :	IADD R6.CC, R3, c [0x0] [0x20]
   0x1219fb8 :	ISETP.LT.U32.AND P0, pt, R0, 0x5, pt
   0x1219fc0 :	IADD.X R7, RZ, c [0x0] [0x24]
   0x1219fc8 :	IADD R4.CC, R4, c [0x0] [0x28]
   0x1219fd0 :	LD.E R3, [R6+0x10]
   0x1219fd8 :	IADD.X R5, RZ, c [0x0] [0x2c]
   0x1219fe0 :	F2F.PASS R3, R3
   0x1219fe8 :	ST.E [R4], R3
   0x1219ff0 :	@P0 BRA 0x68
   0x1219ff8 :	EXIT

If you look more specifically at the address computation for the first parameter:

   0x1219fb0 :	IADD R6.CC, R3, c [0x0] [0x20]
   0x1219fc0 :	IADD.X R7, RZ, c [0x0] [0x24]
   0x1219fd0 :	LD.E R3, [R6+0x10]

On reaching the first line, for all threads except (0, 0, 0), the value of R3 is a negative number. When added to the lower half of the first parameter (with overflow going to R7) you get 0x1 in R7 and the proper value for R6. This in effect adds a 2**32 offset to any address, well enough to throw us out of the buffer.

Example to reproduce:

  • Copy the code below to a .c file, adding `#include <stdio.h>` and `#include <cuda.h>` at the top
  • nvcc --run -lcuda
  • If you have the bug it will print "You have the bug!"

const int DEVICE_NUMBER = 0;

/* PTX source of the repro kernel, loaded at runtime via cuModuleLoadData().
 * NOTE(review): the forum extraction dropped the opcodes on four lines and
 * the tail of this string literal; the setp/ld.global/st.global mnemonics,
 * the $loop_begin/$end labels, the trailing exit and the closing brace are
 * reconstructed here from the SASS disassembly quoted above (ISETP.GE,
 * LD.E, ST.E, ISETP.LT, @P0 BRA) — confirm against the original attachment. */
const char KERNEL[] = ".version 3.0\n"
".target sm_20\n"
".entry extcpy (\n"
".param .u64 a_data,\n"
".param .u64 b_data ) {\n"
".reg .u16 rh1, rh2;\n"
".reg .u32 numThreads;\n"
".reg .u32 i;\n"
".reg .u32 a_pi, b_pi;\n"
".reg .u32 a_p, b_p;\n"
".reg .u32 r1;\n"
".reg .u64 rp1, rp2;\n"
".reg .f32 tmpa;\n"
".reg .f32 tmpb;\n"
".reg .pred p;\n"
/* i = ntid.x * ctaid.x + tid.x ; numThreads = nctaid.x * ntid.x */
"mov.u16 rh1, %ntid.x;\n"
"mov.u16 rh2, %ctaid.x;\n"
"mul.wide.u16 i, rh1, rh2;\n"
"mov.u32 r1, %tid.x;\n"
"add.u32 i, i, r1;\n"
"mov.u16 rh2, %nctaid.x;\n"
"mul.wide.u16 numThreads, rh2, rh1;\n"
/* skip the loop entirely when i >= 5 (matches ISETP.GE / @P0 EXIT) */
"setp.ge.u32 p, i, 5U;\n"
"@p bra $end;\n"
"$loop_begin:\n"
/* a_p = 0 - 4*i : the negative byte offset that triggers the bug */
"mov.u32 a_p, 0U;\n"
"mov.u32 b_p, 0U;\n"
"mov.u32 a_pi, i;\n"
"mul.lo.u32 r1, a_pi, 4;\n"
"sub.u32 a_p, a_p, r1;\n"
"mov.u32 b_pi, i;\n"
"mul.lo.u32 r1, b_pi, 4;\n"
"add.u32 b_p, b_p, r1;\n"
"ld.param.u64 rp1, [a_data];\n"
/* NOTE(review): cvt.u32.u32 into a .u64 register is quoted as-is from the
 * post; this zero-extends the negative 32-bit offset, which is exactly
 * what produces the bogus 2**32 carry described above. */
"cvt.u32.u32 rp2, a_p;\n"
"add.u64 rp1, rp1, rp2;\n"
"ld.global.f32 tmpa, [rp1+16];\n"
"cvt.f32.f32 tmpb, tmpa;\n"
"ld.param.u64 rp1, [b_data];\n"
"cvt.u32.u32 rp2, b_p;\n"
"add.u64 rp1, rp1, rp2;\n"
"st.global.f32 [rp1+0], tmpb;\n"
"add.u32 i, i, numThreads;\n"
/* loop while i < 5 (matches ISETP.LT / @P0 BRA) */
"setp.lt.u32 p, i, 5U;\n"
"@p bra $loop_begin;\n"
"$end:\n"
"exit;\n"
"}";

/* File-scope CUDA driver-API state shared with main(). */
static CUcontext ctx;   /* context created on dev with CU_CTX_SCHED_YIELD */
static CUdevice dev;    /* device handle for DEVICE_NUMBER */
static CUresult err;    /* holds the cuCtxSynchronize() result checked in main */
static CUmodule m;      /* module built from the KERNEL PTX string */
static CUfunction f;    /* handle to the "extcpy" entry point */

static CUdeviceptr a;   /* 20-byte device buffer passed as a_data */
static CUdeviceptr b;   /* 20-byte device buffer passed as b_data */

static void *args[2];   /* kernel parameter list: { &a, &b } */

/* Return -1 from the calling function if a driver-API call fails.
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe inside unbraced if/else bodies (the bare `if` form had a
 * dangling-else hazard). */
#define SAFECALL(exp) do { if ((exp) != CUDA_SUCCESS) return -1; } while (0)

int main() {
  SAFECALL(cuDeviceGet(&dev, DEVICE_NUMBER));
  SAFECALL(cuCtxCreate(&ctx, CU_CTX_SCHED_YIELD, dev));
  // These sizes are matched to the kernel source above                         
  SAFECALL(cuMemAlloc(&a, 20));
  SAFECALL(cuMemAlloc(&b, 20));

  SAFECALL(cuModuleLoadData(&m, KERNEL));
  SAFECALL(cuModuleGetFunction(&f, m, "extcpy"));
  args[0] = &a;
  args[1] = &b;
  SAFECALL(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL));
  err = cuCtxSynchronize();
    printf("You have the bug!\n");
  } else if (err == CUDA_SUCCESS) {
    printf("No bug, all is working\n");
  } else {
    printf("Unexpected error %d\n", err);

From your description, I understand that you have a kernel that used to work as desired up to and including CUDA 4.1, and no longer works as desired in CUDA 4.2 and CUDA 5.0.

If this understanding is correct, please file a bug report through the registered developer website, attaching your repro code. Thank you for your help.

And what C code can produce such an error?

@njuffa: I will as soon as my application is approved. Also this did not work on 4.0 when doing an explicit add of a negative number in the PTX, but I worked around by doing a sub instead in that case. It just seems that 4.2+ is cleverer and spots that this is just an addition of a negative number and does that.

The C code equivalent would be something like this:

/* C-for-CUDA equivalent of the PTX repro kernel: for each of 5 elements,
 * reads a_data at a *negative* byte offset plus a fixed +16-byte
 * displacement and stores the value into b_data at the positive offset.
 * NOTE(review): the original post declares this __device__, but it is
 * launched with cuLaunchKernel, which needs a __global__ entry point —
 * kept as posted; confirm against the author's real code.
 * NOTE(review): the closing braces were lost in extraction and restored. */
extern "C" __device__ void extcpy(float *a_data, float *b_data) {
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int numThreads = blockDim.x * gridDim.x;
    unsigned int i;
    char *a_p, *b_p;
    /* Fix: `float *a, b;` declared b as a plain float, but b is assigned
     * `(float *)b_p` and dereferenced as b[0] — it must be a pointer. */
    float *a, *b;

    for (i = idx; i < 5; i += numThreads) {
        a_p = (char *)a_data;
        /* This is the same substitute-an-add-of-a-negative-by-a-sub-of-the-
         * positive trick used in the PTX. */
        a_p -= (i * 4);
        a = (float *)a_p;
        b_p = (char *)b_data;
        b_p += (i * 4);
        b = (float *)b_p;
        /* The 4 there is basically an offset from the start of the memory
         * region that we apply here (16 bytes = 4 floats). */
        b[0] = a[4];
    }
}
This does not do strictly the same operations in the same order since the original kernel was programmed in PTX, but should be close. The purpose is to copy the content of an array with arbitrary layout to another (also of arbitrary layout). The hardcoded numbers are there because the kernels are generated to fit the data they work on.

Thanks. I remember it was mentioned somewhere not to use unsigned int as a loop variable, though perhaps that advice was only for performance reasons. I wonder how a C programmer could locate such an error. I was thinking about moving to CUDA 5.0.