Hi, while debugging a utility kernel that started to fail with CUDA 4.2 (and still fails with 5.0), I discovered that the addressing calculations used when adding a negative value to an address are bugged.
For the kernel embedded in the attached example, the following assembly code is produced on a GT 610, according to the disassembly from cuda-gdb:
0x1219f30 : MOV R1, c [0x1] [0x100]
0x1219f38 : NOP CC.T
0x1219f40 : I2I.U16.U32 R0, c [0x0] [0x8]
0x1219f48 : S2R R3, SR_CTAid_X
0x1219f50 : I2I.U32.U16 R2, R0
0x1219f58 : S2R R4, SR_Tid_X
0x1219f60 : I2I.U32.U16 R0, R3
0x1219f68 : I2I.U16.U32 R3, c [0x0] [0x14]
0x1219f70 : IMAD.U32.U32 R0, R2, R0, R4
0x1219f78 : I2I.U32.U16 R3, R3
0x1219f80 : ISETP.GE.U32.AND P0, pt, R0, 0x5, pt
0x1219f88 : IMUL.U32.U32 R2, R3, R2
0x1219f90 : @P0 EXIT
0x1219f98 : SHL.W R4, R0, 0x2
0x1219fa0 : I2I.U32.U32 R3, -R4
0x1219fa8 : IADD R0, R0, R2
0x1219fb0 : IADD R6.CC, R3, c [0x0] [0x20]
0x1219fb8 : ISETP.LT.U32.AND P0, pt, R0, 0x5, pt
0x1219fc0 : IADD.X R7, RZ, c [0x0] [0x24]
0x1219fc8 : IADD R4.CC, R4, c [0x0] [0x28]
0x1219fd0 : LD.E R3, [R6+0x10]
0x1219fd8 : IADD.X R5, RZ, c [0x0] [0x2c]
0x1219fe0 : F2F.PASS R3, R3
0x1219fe8 : ST.E [R4], R3
0x1219ff0 : @P0 BRA 0x68
0x1219ff8 : EXIT
0x121a000:
If you look more specifically at the address computation for the first parameter:
...
0x1219fb0 : IADD R6.CC, R3, c [0x0] [0x20]
...
0x1219fc0 : IADD.X R7, RZ, c [0x0] [0x24]
...
0x1219fd0 : LD.E R3, [R6+0x10]
On reaching the first line, for all threads except (0, 0, 0), the value of R3 is a negative number. When added to the lower half of the first parameter (with overflow going to R7) you get 0x1 in R7 and the proper value for R6. This in effect adds a 2**32 offset to any address, well enough to throw us out of the buffer.
Example to reproduce:
- Copy the code below to a .c file
- nvcc --run -lcuda
- If you have the bug it will print "You have the bug!"
#include
#include
const int DEVICE_NUMBER = 0;
const char KERNEL[] = ".version 3.0\n"
".target sm_20\n"
"\n"
".entry extcpy (\n"
".param .u64 a_data,\n"
".param .u64 b_data ) {\n"
".reg .u16 rh1, rh2;\n"
".reg .u32 numThreads;\n"
".reg .u32 i;\n"
".reg .u32 a_pi, b_pi;\n"
".reg .u32 a_p, b_p;\n"
".reg .u32 r1;\n"
".reg .u64 rp1, rp2;\n"
".reg .f32 tmpa;\n"
".reg .f32 tmpb;\n"
".reg .pred p;\n"
"mov.u16 rh1, %ntid.x;\n"
"mov.u16 rh2, %ctaid.x;\n"
"mul.wide.u16 i, rh1, rh2;\n"
"mov.u32 r1, %tid.x;\n"
"add.u32 i, i, r1;\n"
"mov.u16 rh2, %nctaid.x;\n"
"mul.wide.u16 numThreads, rh2, rh1;\n"
"setp.ge.u32 p, i, 5U;\n"
"@p bra $end;\n"
"$loop_begin:\n"
"mov.u32 a_p, 0U;\n"
"mov.u32 b_p, 0U;\n"
"mov.u32 a_pi, i;\n"
"mul.lo.u32 r1, a_pi, 4;\n"
"sub.u32 a_p, a_p, r1;\n"
"mov.u32 b_pi, i;\n"
"mul.lo.u32 r1, b_pi, 4;\n"
"add.u32 b_p, b_p, r1;\n"
"ld.param.u64 rp1, [a_data];\n"
"cvt.u32.u32 rp2, a_p;\n"
"add.u64 rp1, rp1, rp2;\n"
"ld.global.f32 tmpa, [rp1+16];\n"
"cvt.f32.f32 tmpb, tmpa;\n"
"ld.param.u64 rp1, [b_data];\n"
"cvt.u32.u32 rp2, b_p;\n"
"add.u64 rp1, rp1, rp2;\n"
"st.global.f32 [rp1+0], tmpb;\n"
"add.u32 i, i, numThreads;\n"
"setp.lt.u32 p, i, 5U;\n"
"@p bra $loop_begin;\n"
"$end:\n"
"ret;\n"
"}";
static CUcontext ctx;
static CUdevice dev;
static CUresult err;
static CUmodule m;
static CUfunction f;
static CUdeviceptr a;
static CUdeviceptr b;
static void *args[2];
#define SAFECALL(exp) if ((exp) != CUDA_SUCCESS) return -1
int main() {
SAFECALL(cuInit(0));
SAFECALL(cuDeviceGet(&dev, DEVICE_NUMBER));
SAFECALL(cuCtxCreate(&ctx, CU_CTX_SCHED_YIELD, dev));
// These sizes are matched to the kernel source above
SAFECALL(cuMemAlloc(&a, 20));
SAFECALL(cuMemAlloc(&b, 20));
SAFECALL(cuModuleLoadData(&m, KERNEL));
SAFECALL(cuModuleGetFunction(&f, m, "extcpy"));
args[0] = &a;
args[1] = &b;
SAFECALL(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL));
err = cuCtxSynchronize();
if (err == CUDA_ERROR_LAUNCH_FAILED) {
printf("You have the bug!\n");
} else if (err == CUDA_SUCCESS) {
printf("No bug, all is working\n");
} else {
printf("Unexpected error %d\n", err);
}
cuCtxDestroy(ctx);
}