Array Assigned to Register and Unrolled Loop Variable Bug

The following code does not work correctly under CUDA 2.0 and the CUDA 2.1 Beta:

[codebox]#include <stdio.h>

__global__ void
kernel(int* a)
{
    int reg[4];

    /* Write the loop counter into reg with a permuted index. */
#pragma unroll
    for (int i = 0; i < 4; i++) {
        reg[i / 2 + (i % 2) * 2] = i;
    }

    /* Copy the permuted array out to global memory. */
#pragma unroll
    for (int i = 0; i < 4; i++) {
        a[i] = reg[i];
    }
}

int
main()
{
    static int a[32 * 4];
    int* d_a;

    cudaMalloc((void**) &d_a, 4 * sizeof(int));
    kernel<<<1, 1>>>(d_a);
    cudaMemcpy(a, d_a, 4 * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 4; i++) {
        printf("%d 0x%x\n", i, a[i]);
    }

    cudaFree(d_a);

    return 0;
}[/codebox]

This program is expected to output the following:

[codebox]0 0x0
1 0x2
2 0x1
3 0x3[/codebox]
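For reference, the index expression `i / 2 + (i % 2) * 2` maps i = 0, 1, 2, 3 to positions 0, 2, 1, 3, so after the first loop reg holds {0, 2, 1, 3}, which is exactly the expected output above. A plain-C check of the mapping (host-only, just for illustration):

[codebox]#include <stdio.h>

int main()
{
    int reg[4];

    /* Same permuting index expression as in the kernel. */
    for (int i = 0; i < 4; i++) {
        reg[i / 2 + (i % 2) * 2] = i;
    }

    /* Prints 0 0x0, 1 0x2, 2 0x1, 3 0x3 -- the expected kernel result. */
    for (int i = 0; i < 4; i++) {
        printf("%d 0x%x\n", i, reg[i]);
    }

    return 0;
}[/codebox]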

In emulation mode, the program does output this. However, when it runs on the GPU, it outputs the following:

[codebox]0 0x1
1 0x3
2 0x0
3 0x0[/codebox]

The condition to reproduce this bug seems to be that an array is assigned to registers and an unrolled loop variable involving division is used to index that array.
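In case it helps anyone who hits the same thing, here is a sketch of a possible workaround: writing the permutation with literal indices (hand-unrolling the first loop) removes the division on the unrolled loop variable. This is only a sketch based on the reproduction condition above; I have not verified it across toolkit versions:

[codebox]__global__ void
kernel_workaround(int* a)
{
    int reg[4];

    /* Hand-unrolled permutation: the indices are literal constants,
       so no division/modulo on an unrolled loop variable is involved. */
    reg[0] = 0;
    reg[2] = 1;
    reg[1] = 2;
    reg[3] = 3;

#pragma unroll
    for (int i = 0; i < 4; i++) {
        a[i] = reg[i];
    }
}[/codebox]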

No one cares about this bug? NVIDIA?

Btw, nishihei, do you mind running decuda over it to get a better picture of what's going on?

The PTX output is as follows:

[codebox].version 1.2
    .target sm_10, map_f64_to_f32
    // compiled with /usr/local/cuda/open64/lib//be
    // nvopencc built on 2008-06-19

    .reg .u32 %ra<17>;
    .reg .u64 %rda<17>;
    .reg .f32 %fa<17>;
    .reg .f64 %fda<17>;
    .reg .u32 %rv<5>;
    .reg .u64 %rdv<5>;
    .reg .f32 %fv<5>;
    .reg .f64 %fdv<5>;

    //-----------------------------------------------------------
    // Compiling /tmp/tmpxft_00000d1c_00000000-7_unrolltest.cpp3.i (/tmp/ccBI#.Z1hPzl)
    //-----------------------------------------------------------

    //-----------------------------------------------------------
    // Options:
    //-----------------------------------------------------------
    //  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
    //  -O3 (Optimization level)
    //  -g0 (Debug level)
    //  -m2 (Report advisories)
    //-----------------------------------------------------------

    .file   1       "<command-line>"
    .file   2       "/tmp/tmpxft_00000d1c_00000000-6_unrolltest.cudafe2.gpu"
    .file   3       "/usr/local/gcc-4.1.2/lib/gcc/x86_64-unknown-linux-gnu/4.1.2/include/stddef.h"
    .file   4       "/usr/local/cuda/bin/../include/crt/device_runtime.h"
    .file   5       "/usr/local/cuda/bin/../include/crt/../host_defines.h"
    .file   6       "/usr/local/cuda/bin/../include/crt/../builtin_types.h"
    .file   7       "/usr/local/cuda/bin/../include/crt/../device_types.h"
    .file   8       "/usr/local/cuda/bin/../include/crt/../driver_types.h"
    .file   9       "/usr/local/cuda/bin/../include/crt/../texture_types.h"
    .file   10      "/usr/local/cuda/bin/../include/crt/../vector_types.h"
    .file   11      "/usr/local/cuda/bin/../include/crt/../device_launch_parameters.h"
    .file   12      "/usr/local/cuda/bin/../include/crt/storage_class.h"
    .file   13      "/usr/include/bits/types.h"
    .file   14      "/usr/include/time.h"
    .file   15      "unrolltest.cu"
    .file   16      "/usr/local/cuda/bin/../include/common_functions.h"
    .file   17      "/usr/local/cuda/bin/../include/crt/func_macro.h"
    .file   18      "/usr/local/cuda/bin/../include/math_functions.h"
    .file   19      "/usr/local/cuda/bin/../include/device_functions.h"
    .file   20      "/usr/local/cuda/bin/../include/math_constants.h"
    .file   21      "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
    .file   22      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
    .file   23      "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
    .file   24      "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
    .file   25      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

    .entry __globfunc__Z6kernelPi
    {
    .reg .u32 %r<14>;
    .reg .u64 %rd<3>;
    .param .u64 __cudaparm___globfunc__Z6kernelPi_a;
    // reg = 0
    .loc    15      4       0
$LBB1___globfunc__Z6kernelPi:
    .loc    15      9       0
    mov.s32         %r1, 0;                 //
    mov.s32         %r2, %r1;               //
    mov.s32         %r3, 1;                 //
    mov.s32         %r2, %r3;               //
    mov.s32         %r4, 2;                 //
    mov.s32         %r5, %r4;               //
    mov.s32         %r6, 3;                 //
    mov.s32         %r5, %r6;               //
    .loc    15      13      0
    ld.param.u64    %rd1, [__cudaparm___globfunc__Z6kernelPi_a];    // id:20 __cudaparm___globfunc__Z6kernelPi_a+0x0
    mov.s32         %r7, 1;                 //
    st.global.s32   [%rd1+0], %r7;  // id:23
    mov.s32         %r8, 3;                 //
    st.global.s32   [%rd1+4], %r8;  // id:24
    mov.s32         %r9, %r10;              //
    st.global.s32   [%rd1+8], %r9;  // id:26
    mov.s32         %r11, %r12;             //
    st.global.s32   [%rd1+12], %r11;        // id:28
    .loc    15      15      0
    exit;                           //
$LDWend___globfunc__Z6kernelPi:
    } // __globfunc__Z6kernelPi
[/codebox]

This PTX output shows that both %r1 and %r3 are copied into the same register, %r2 (and likewise %r4 and %r6 into %r5); they should go to different registers. The later moves from %r10 and %r12 then read registers that are never written, which is where the bogus values for a[2] and a[3] come from.
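To make that concrete, here is my reading of the emitted instructions as C-like pseudocode (the register names come straight from the PTX above):

[codebox]/* Effect of the generated code, in pseudocode: */
r2 = 0;         /* mov %r1, 0;  mov %r2, %r1                        */
r2 = 1;         /* mov %r3, 1;  mov %r2, %r3  -- overwrites the 0   */
r5 = 2;         /* mov %r4, 2;  mov %r5, %r4                        */
r5 = 3;         /* mov %r6, 3;  mov %r5, %r6  -- overwrites the 2   */

a[0] = 1;       /* st.global [%rd1+0],  %r7  (constant 1)           */
a[1] = 3;       /* st.global [%rd1+4],  %r8  (constant 3)           */
a[2] = r10;     /* st.global [%rd1+8],  %r9 = %r10, never written   */
a[3] = r12;     /* st.global [%rd1+12], %r11 = %r12, never written  */[/codebox]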

The decuda output is as follows:

[codebox]// Disassembling __globfunc__Z6kernelPi
000000: 1000c801 0423c780 mov.b32 $r0, s[0x0010]
000008: 10018009 00000003 mov.b32 $r2, 0x00000001
000010: d00e0009 a0c00780 mov.u32 g[$r0], $r2
000018: 2104e809 00000003 add.b32 $r2, s[0x0010], 0x00000004
000020: 10038001 00000003 mov.b32 $r0, 0x00000003
000028: d00e0401 a0c00780 mov.u32 g[$r2], $r0
000030: 2108e801 00000003 add.b32 $r0, s[0x0010], 0x00000008
000038: d00e0005 a0c00780 mov.u32 g[$r0], $r1
000040: 210ce801 00000003 add.b32 $r0, s[0x0010], 0x0000000c
000048: d00e0005 a0c00781 mov.end.u32 g[$r0], $r1[/codebox]

The decuda output matches the observed result: the constants 1 and 3 are stored to a[0] and a[1], while a[2] and a[3] are written from $r1, which is never initialized. This bug does not seem to be fixed in CUDA 2.1. Code like this is useful for permuting arrays, so I hope NVIDIA will fix it in the next CUDA release.
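To be concrete about the permutation use case: the index expression in the test is just a transpose of a 2x2 tile held in per-thread registers, and similar in-register shuffles show up whenever a thread reorders a small tile before writing it out. A hypothetical sketch (kernel name and layout are made up for illustration):

[codebox]__global__ void
transpose2x2(const int* in, int* out)
{
    int tile[4];

    /* Read a row-major 2x2 tile, storing it transposed in registers
       via the same index expression as in the test case. */
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tile[i / 2 + (i % 2) * 2] = in[i];
    }

    /* Write the transposed tile back out. */
#pragma unroll
    for (int i = 0; i < 4; i++) {
        out[i] = tile[i];
    }
}[/codebox]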

Thanks for reporting this problem.
I opened a bug report.

The bug is fixed in the upcoming CUDA 2.2.

I confirmed that this bug is fixed in the released CUDA 2.2.
Thanks for the fix!