Wrong output with loop unroll under CUDA 6.5

I have the following code:

#include<stdio.h>

#define N_ITERATIONS 2048
#define UNROLL 32

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x ;

    if (tid < N) {

        int a = d_a[tid];
        int b = d_b[tid];
        int c = d_c[tid];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a = a * b + c;
        }

        d_a[tid] = a;
    }

}

/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/2];
        int b2 = d_b[tid+N/2];
        int c2 = d_c[tid+N/2];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
        }

        d_a[tid]        = a1;
        d_a[tid+N/2]    = a2;
    }

}

/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/4) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/4];
        int b2 = d_b[tid+N/4];
        int c2 = d_c[tid+N/4];

        int a3 = d_a[tid+N/2];
        int b3 = d_b[tid+N/2];
        int c3 = d_c[tid+N/2];

        int a4 = d_a[tid+3*N/4];
        int b4 = d_b[tid+3*N/4];
        int c4 = d_c[tid+3*N/4];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            //if (tid==0) printf("iteration %i %i\n",i,a1);
            a2 = a2 * b2 + c2;
            a3 = a3 * b3 + c3;
            a4 = a4 * b4 + c4;
        }
        //if (tid==0) printf("last iteration %i\n",a1);

        d_a[tid]        = a1;
        d_a[tid+N/4]    = a2;
        d_a[tid+N/2]    = a3;
        d_a[tid+3*N/4]  = a4;
    }

}

/********/
/* MAIN */
/********/
void main() {

    const int N = 1024;

    int blockSize;      // The launch configurator returned block size 
    int minGridSize;    // The minimum grid size needed to achieve the maximum occupancy for a full device launch 

    int *h_a                = (int*)malloc(N*sizeof(int));
    int *h_a_result_host    = (int*)malloc(N*sizeof(int));
    int *h_a_result_device  = (int*)malloc(N*sizeof(int));
    int *h_b                = (int*)malloc(N*sizeof(int));
    int *h_c                = (int*)malloc(N*sizeof(int));

    for (int i=0; i<N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
        h_c[i] = 2;
        h_a_result_host[i] = h_a[i];
        for(unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
    int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
    int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /***********/
    /* KERNEL2 */
    /***********/
    cudaEventRecord(start, 0);
    kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }

    cudaDeviceReset();

}

I’m compiling it using CUDA 6.5 for compute_20, sm_21 (GT540M) and it works correctly for UNROLL = 2, 4, 8, 16, but it does not work anymore for UNROLL = 32, in the sense that the result is incorrect already for the first element of the output array.

The error message is:

Error at i=0! Host = 4098; Device = 4036

If I monitor the processing for tid = 0, i.e., I uncomment the printf lines, I see that a1 holds the correct value until the last for iteration, while, immediately after exiting the for loop, the variable becomes incorrect. In other words, by uncommenting the two printf’s, the output is

....
iteration 2047 4098
last iteration 4036

which means that the for loop is executed correctly, but when exiting it, the value of the a1 variable changes.

If I compile the code for UNROLL = 32 with CUDA 5.5 and 6.0, it works.

Having a look at the disassembled code, I see

CUDA 5.5

Function : _Z7kernel2PiS_S_j
        .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
        /*0000*/         MOV R1, c[0x1][0x100];
        /*0008*/         NOP;
        /*0010*/         MOV R3, c[0x0][0x38];
        /*0018*/         S2R R0, SR_CTAID.X;
        /*0020*/         SHR.U32 R3, R3, 0x2;
        /*0028*/         S2R R2, SR_TID.X;
        /*0030*/         IMAD R15, R0, c[0x0][0x8], R2;
        /*0038*/         ISETP.GE.U32.AND P0, PT, R15, R3, PT;
        /*0040*/     @P0 EXIT;
        /*0048*/         MOV32I R17, 0x4;
        /*0050*/         IADD R7, R15, R3;
        /*0058*/         IMAD R8.CC, R15, R17, c[0x0][0x20];
        /*0060*/         IMAD.HI.X R9, R15, R17, c[0x0][0x24];
        /*0068*/         IMAD R20.CC, R15, R17, c[0x0][0x28];
        /*0070*/         LD.E R0, [R8];
        /*0078*/         IMAD.HI.X R21, R15, R17, c[0x0][0x2c];
        /*0080*/         IMAD R2.CC, R15, R17, c[0x0][0x30];
        /*0088*/         MOV R12, c[0x0][0x38];
        /*0090*/         IMAD.HI.X R3, R15, R17, c[0x0][0x34];
        /*0098*/         IMUL R13, R12, 0x3;
        /*00a0*/         IMAD.U32.U32 R10.CC, R7, R17, c[0x0][0x30];
        /*00a8*/         LD.E R14, [R2];
        /*00b0*/         IMAD.U32.U32.HI.X R11, R7, R17, c[0x0][0x34];
        /*00b8*/         LD.E R16, [R20];
        /*00c0*/         IMAD.U32.U32 R4.CC, R7, R17, c[0x0][0x20];
        /*00c8*/         SHR.U32 R3, R12, 0x1;
        /*00d0*/         IMAD.U32.U32.HI.X R5, R7, R17, c[0x0][0x24];
        /*00d8*/         IADD R20, R15, R3;
        /*00e0*/         IMAD.U32.U32 R6.CC, R7, R17, c[0x0][0x28];
        /*00e8*/         LD.E R18, [R4];
        /*00f0*/         IMAD.U32.U32.HI R13, R13, c[0x10][0x0], R15;
        /*00f8*/         LD.E R21, [R10];
        /*0100*/         IMAD.U32.U32.HI.X R7, R7, R17, c[0x0][0x2c];
        /*0108*/         IMAD.U32.U32 R8.CC, R13, R17, c[0x0][0x28];
        /*0110*/         LD.E R19, [R6];
        /*0118*/         IMAD.U32.U32.HI.X R9, R13, R17, c[0x0][0x2c];
        /*0120*/         IMAD.U32.U32 R2.CC, R13, R17, c[0x0][0x20];
        /*0128*/         LD.E R9, [R8];
        /*0130*/         IMAD.U32.U32.HI.X R3, R13, R17, c[0x0][0x24];
        /*0138*/         IMAD.U32.U32 R4.CC, R20, R17, c[0x0][0x28];
        /*0140*/         IMAD.U32.U32.HI.X R5, R20, R17, c[0x0][0x2c];
        /*0148*/         LD.E R8, [R2];
        /*0150*/         IMAD.U32.U32 R6.CC, R20, R17, c[0x0][0x30];
        /*0158*/         LD.E R5, [R4];
        /*0160*/         IMAD.U32.U32.HI.X R7, R20, R17, c[0x0][0x34];
        /*0168*/         MOV32I R4, 0xfffff800;
        /*0170*/         IMAD.U32.U32 R12.CC, R13, R17, c[0x0][0x30];
        /*0178*/         LD.E R6, [R6];
        /*0180*/         IMAD.U32.U32.HI.X R13, R13, R17, c[0x0][0x34];
        /*0188*/         IMAD.U32.U32 R10.CC, R20, R17, c[0x0][0x20];
        /*0190*/         LD.E R13, [R12];
        /*0198*/         IMAD.U32.U32.HI.X R11, R20, R17, c[0x0][0x24];
        /*01a0*/         LD.E R17, [R10];
        /*01a8*/         IMAD R0, R0, R16, R14;
        /*01b0*/         IMAD R7, R18, R19, R21;
        /*01b8*/         IMAD R12, R17, R5, R6;
        /*01c0*/         IMAD R8, R8, R9, R13;
        /*01c8*/         IMAD R0, R0, R16, R14;
        /*01d0*/         IMAD R7, R7, R19, R21;
        /*01d8*/         IMAD R12, R12, R5, R6;
        /*01e0*/         IMAD R8, R8, R9, R13;
        /*01e8*/         IMAD R0, R0, R16, R14;
        /*01f0*/         IMAD R7, R7, R19, R21;
        /*01f8*/         IMAD R12, R12, R5, R6;
        /*0200*/         IMAD R8, R8, R9, R13;
        /*0208*/         IMAD R0, R0, R16, R14;
        /*0210*/         IMAD R7, R7, R19, R21;
        /*0218*/         IMAD R12, R12, R5, R6;
        /*0220*/         IMAD R8, R8, R9, R13;
        /*0228*/         IMAD R0, R0, R16, R14;
        /*0230*/         IMAD R7, R7, R19, R21;
        /*0238*/         IMAD R12, R12, R5, R6;
        /*0240*/         IMAD R8, R8, R9, R13;
        /*0248*/         IMAD R0, R0, R16, R14;
        /*0250*/         IMAD R7, R7, R19, R21;
        /*0258*/         IMAD R12, R12, R5, R6;
        /*0260*/         IMAD R8, R8, R9, R13;
        /*0268*/         IMAD R0, R0, R16, R14;
        /*0270*/         IMAD R7, R7, R19, R21;
        /*0278*/         IMAD R12, R12, R5, R6;
        /*0280*/         IMAD R8, R8, R9, R13;
        /*0288*/         IMAD R0, R0, R16, R14;
        /*0290*/         IMAD R7, R7, R19, R21;
        /*0298*/         IMAD R12, R12, R5, R6;
        /*02a0*/         IMAD R8, R8, R9, R13;
        /*02a8*/         IMAD R0, R0, R16, R14;
        /*02b0*/         IMAD R7, R7, R19, R21;
        /*02b8*/         IMAD R12, R12, R5, R6;
        /*02c0*/         IMAD R8, R8, R9, R13;
        /*02c8*/         IMAD R0, R0, R16, R14;
        /*02d0*/         IMAD R7, R7, R19, R21;
        /*02d8*/         IMAD R12, R12, R5, R6;
        /*02e0*/         IMAD R8, R8, R9, R13;
        /*02e8*/         IMAD R0, R0, R16, R14;
        /*02f0*/         IMAD R7, R7, R19, R21;
        /*02f8*/         IMAD R12, R12, R5, R6;
        /*0300*/         IMAD R8, R8, R9, R13;
        /*0308*/         IMAD R0, R0, R16, R14;
        /*0310*/         IMAD R7, R7, R19, R21;
        /*0318*/         IMAD R12, R12, R5, R6;
        /*0320*/         IMAD R8, R8, R9, R13;
        /*0328*/         IMAD R0, R0, R16, R14;
        /*0330*/         IMAD R7, R7, R19, R21;
        /*0338*/         IMAD R12, R12, R5, R6;
        /*0340*/         IMAD R8, R8, R9, R13;
        /*0348*/         IMAD R0, R0, R16, R14;
        /*0350*/         IMAD R7, R7, R19, R21;
        /*0358*/         IMAD R12, R12, R5, R6;
        /*0360*/         IMAD R8, R8, R9, R13;
        /*0368*/         IMAD R0, R0, R16, R14;
        /*0370*/         IMAD R7, R7, R19, R21;
        /*0378*/         IMAD R12, R12, R5, R6;
        /*0380*/         IMAD R8, R8, R9, R13;
        /*0388*/         IMAD R0, R0, R16, R14;
        /*0390*/         IMAD R7, R7, R19, R21;
        /*0398*/         IMAD R12, R12, R5, R6;
        /*03a0*/         IMAD R8, R8, R9, R13;
        /*03a8*/         IMAD R0, R0, R16, R14;
        /*03b0*/         IMAD R7, R7, R19, R21;
        /*03b8*/         IMAD R12, R12, R5, R6;
        /*03c0*/         IMAD R8, R8, R9, R13;
        /*03c8*/         IMAD R0, R0, R16, R14;
        /*03d0*/         IMAD R7, R7, R19, R21;
        /*03d8*/         IMAD R12, R12, R5, R6;
        /*03e0*/         IMAD R8, R8, R9, R13;
        /*03e8*/         IMAD R0, R0, R16, R14;
        /*03f0*/         IMAD R7, R7, R19, R21;
        /*03f8*/         IMAD R12, R12, R5, R6;
        /*0400*/         IMAD R8, R8, R9, R13;
        /*0408*/         IMAD R0, R0, R16, R14;
        /*0410*/         IMAD R7, R7, R19, R21;
        /*0418*/         IMAD R12, R12, R5, R6;
        /*0420*/         IMAD R8, R8, R9, R13;
        /*0428*/         IMAD R0, R0, R16, R14;
        /*0430*/         IMAD R7, R7, R19, R21;
        /*0438*/         IMAD R12, R12, R5, R6;
        /*0440*/         IMAD R8, R8, R9, R13;
        /*0448*/         IMAD R0, R0, R16, R14;
        /*0450*/         IMAD R7, R7, R19, R21;
        /*0458*/         IMAD R12, R12, R5, R6;
        /*0460*/         IMAD R8, R8, R9, R13;
        /*0468*/         IMAD R0, R0, R16, R14;
        /*0470*/         IMAD R7, R7, R19, R21;
        /*0478*/         IMAD R12, R12, R5, R6;
        /*0480*/         IMAD R8, R8, R9, R13;
        /*0488*/         IMAD R0, R0, R16, R14;
        /*0490*/         IMAD R7, R7, R19, R21;
        /*0498*/         IMAD R12, R12, R5, R6;
        /*04a0*/         IMAD R8, R8, R9, R13;
        /*04a8*/         IMAD R0, R0, R16, R14;
        /*04b0*/         IMAD R7, R7, R19, R21;
        /*04b8*/         IMAD R12, R12, R5, R6;
        /*04c0*/         IMAD R8, R8, R9, R13;
        /*04c8*/         IMAD R0, R0, R16, R14;
        /*04d0*/         IMAD R7, R7, R19, R21;
        /*04d8*/         IMAD R12, R12, R5, R6;
        /*04e0*/         IMAD R8, R8, R9, R13;
        /*04e8*/         IMAD R0, R0, R16, R14;
        /*04f0*/         IMAD R7, R7, R19, R21;
        /*04f8*/         IMAD R12, R12, R5, R6;
        /*0500*/         IMAD R8, R8, R9, R13;
        /*0508*/         IMAD R0, R0, R16, R14;
        /*0510*/         IMAD R7, R7, R19, R21;
        /*0518*/         IMAD R12, R12, R5, R6;
        /*0520*/         IMAD R8, R8, R9, R13;
        /*0528*/         IMAD R0, R0, R16, R14;
        /*0530*/         IMAD R7, R7, R19, R21;
        /*0538*/         IMAD R12, R12, R5, R6;
        /*0540*/         IMAD R8, R8, R9, R13;
        /*0548*/         IADD R4, R4, 0x20;
        /*0550*/         IMAD R0, R0, R16, R14;
        /*0558*/         IMAD R7, R7, R19, R21;
        /*0560*/         IMAD R12, R12, R5, R6;
        /*0568*/         IMAD R8, R8, R9, R13;
        /*0570*/         ISETP.NE.AND P0, PT, R4, RZ, PT;
        /*0578*/         IMAD R0, R0, R16, R14;
        /*0580*/         IMAD R18, R7, R19, R21;
        /*0588*/         IMAD R17, R12, R5, R6;
        /*0590*/         IMAD R8, R8, R9, R13;
        /*0598*/         IMAD R0, R0, R16, R14;
        /*05a0*/         IMAD R18, R18, R19, R21;
        /*05a8*/         IMAD R17, R17, R5, R6;
        /*05b0*/         IMAD R8, R8, R9, R13;
        /*05b8*/     @P0 BRA 0x1a8;
        /*05c0*/         MOV32I R4, 0x40000000;
        /*05c8*/         MOV32I R9, 0x4;
        /*05d0*/         IMAD.U32.U32.HI R7, R4, c[0x0][0x38], R15;
        /*05d8*/         IMAD R4.CC, R15, R9, c[0x0][0x20];
        /*05e0*/         IMAD.HI.X R5, R15, R9, c[0x0][0x24];
        /*05e8*/         IMAD.U32.U32 R6.CC, R7, R9, c[0x0][0x20];
        /*05f0*/         IMAD.U32.U32.HI.X R7, R7, R9, c[0x0][0x24];
        /*05f8*/         ST.E [R4], R0;
        /*0600*/         ST.E [R6], R18;
        /*0608*/         ST.E [R10], R17;
        /*0610*/         ST.E [R2], R8;
        /*0618*/         EXIT;

CUDA 6.0

Function : _Z7kernel2PiS_S_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];
    /*0008*/         NOP;
    /*0010*/         MOV R3, c[0x0][0x38];
    /*0018*/         S2R R0, SR_CTAID.X;
    /*0020*/         SHR.U32 R3, R3, 0x2;
    /*0028*/         S2R R2, SR_TID.X;
    /*0030*/         IMAD R15, R0, c[0x0][0x8], R2;
    /*0038*/         ISETP.GE.U32.AND P0, PT, R15, R3, PT;
    /*0040*/     @P0 EXIT;
    /*0048*/         MOV32I R17, 0x4;
    /*0050*/         IADD R7, R15, R3;
    /*0058*/         IMAD R8.CC, R15, R17, c[0x0][0x20];
    /*0060*/         IMAD.HI.X R9, R15, R17, c[0x0][0x24];
    /*0068*/         IMAD R20.CC, R15, R17, c[0x0][0x28];
    /*0070*/         LD.E R0, [R8];
    /*0078*/         IMAD.HI.X R21, R15, R17, c[0x0][0x2c];
    /*0080*/         IMAD R2.CC, R15, R17, c[0x0][0x30];
    /*0088*/         MOV R12, c[0x0][0x38];
    /*0090*/         IMAD.HI.X R3, R15, R17, c[0x0][0x34];
    /*0098*/         IMUL R13, R12, 0x3;
    /*00a0*/         IMAD.U32.U32 R10.CC, R7, R17, c[0x0][0x30];
    /*00a8*/         LD.E R14, [R2];
    /*00b0*/         IMAD.U32.U32.HI.X R11, R7, R17, c[0x0][0x34];
    /*00b8*/         LD.E R16, [R20];
    /*00c0*/         IMAD.U32.U32 R4.CC, R7, R17, c[0x0][0x20];
    /*00c8*/         SHR.U32 R3, R12, 0x1;
    /*00d0*/         IMAD.U32.U32.HI.X R5, R7, R17, c[0x0][0x24];
    /*00d8*/         IADD R20, R15, R3;
    /*00e0*/         IMAD.U32.U32 R6.CC, R7, R17, c[0x0][0x28];
    /*00e8*/         LD.E R18, [R4];
    /*00f0*/         IMAD.U32.U32.HI R13, R13, c[0x10][0x0], R15;
    /*00f8*/         LD.E R21, [R10];
    /*0100*/         IMAD.U32.U32.HI.X R7, R7, R17, c[0x0][0x2c];
    /*0108*/         IMAD.U32.U32 R8.CC, R13, R17, c[0x0][0x28];
    /*0110*/         LD.E R19, [R6];
    /*0118*/         IMAD.U32.U32.HI.X R9, R13, R17, c[0x0][0x2c];
    /*0120*/         IMAD.U32.U32 R2.CC, R13, R17, c[0x0][0x20];
    /*0128*/         LD.E R9, [R8];
    /*0130*/         IMAD.U32.U32.HI.X R3, R13, R17, c[0x0][0x24];
    /*0138*/         IMAD.U32.U32 R4.CC, R20, R17, c[0x0][0x28];
    /*0140*/         IMAD.U32.U32.HI.X R5, R20, R17, c[0x0][0x2c];
    /*0148*/         LD.E R8, [R2];
    /*0150*/         IMAD.U32.U32 R6.CC, R20, R17, c[0x0][0x30];
    /*0158*/         LD.E R5, [R4];
    /*0160*/         IMAD.U32.U32.HI.X R7, R20, R17, c[0x0][0x34];
    /*0168*/         MOV32I R4, 0xfffff800;
    /*0170*/         IMAD.U32.U32 R12.CC, R13, R17, c[0x0][0x30];
    /*0178*/         LD.E R6, [R6];
    /*0180*/         IMAD.U32.U32.HI.X R13, R13, R17, c[0x0][0x34];
    /*0188*/         IMAD.U32.U32 R10.CC, R20, R17, c[0x0][0x20];
    /*0190*/         LD.E R13, [R12];
    /*0198*/         IMAD.U32.U32.HI.X R11, R20, R17, c[0x0][0x24];
    /*01a0*/         LD.E R17, [R10];
    /*01a8*/         IMAD R0, R0, R16, R14;
    /*01b0*/         IMAD R7, R18, R19, R21;
    /*01b8*/         IMAD R12, R17, R5, R6;
    /*01c0*/         IMAD R8, R8, R9, R13;
    /*01c8*/         IMAD R0, R0, R16, R14;
    /*01d0*/         IMAD R7, R7, R19, R21;
    /*01d8*/         IMAD R12, R12, R5, R6;
    /*01e0*/         IMAD R8, R8, R9, R13;
    /*01e8*/         IMAD R0, R0, R16, R14;
    /*01f0*/         IMAD R7, R7, R19, R21;
    /*01f8*/         IMAD R12, R12, R5, R6;
    /*0200*/         IMAD R8, R8, R9, R13;
    /*0208*/         IMAD R0, R0, R16, R14;
    /*0210*/         IMAD R7, R7, R19, R21;
    /*0218*/         IMAD R12, R12, R5, R6;
    /*0220*/         IMAD R8, R8, R9, R13;
    /*0228*/         IMAD R0, R0, R16, R14;
    /*0230*/         IMAD R7, R7, R19, R21;
    /*0238*/         IMAD R12, R12, R5, R6;
    /*0240*/         IMAD R8, R8, R9, R13;
    /*0248*/         IMAD R0, R0, R16, R14;
    /*0250*/         IMAD R7, R7, R19, R21;
    /*0258*/         IMAD R12, R12, R5, R6;
    /*0260*/         IMAD R8, R8, R9, R13;
    /*0268*/         IMAD R0, R0, R16, R14;
    /*0270*/         IMAD R7, R7, R19, R21;
    /*0278*/         IMAD R12, R12, R5, R6;
    /*0280*/         IMAD R8, R8, R9, R13;
    /*0288*/         IMAD R0, R0, R16, R14;
    /*0290*/         IMAD R7, R7, R19, R21;
    /*0298*/         IMAD R12, R12, R5, R6;
    /*02a0*/         IMAD R8, R8, R9, R13;
    /*02a8*/         IMAD R0, R0, R16, R14;
    /*02b0*/         IMAD R7, R7, R19, R21;
    /*02b8*/         IMAD R12, R12, R5, R6;
    /*02c0*/         IMAD R8, R8, R9, R13;
    /*02c8*/         IMAD R0, R0, R16, R14;
    /*02d0*/         IMAD R7, R7, R19, R21;
    /*02d8*/         IMAD R12, R12, R5, R6;
    /*02e0*/         IMAD R8, R8, R9, R13;
    /*02e8*/         IMAD R0, R0, R16, R14;
    /*02f0*/         IMAD R7, R7, R19, R21;
    /*02f8*/         IMAD R12, R12, R5, R6;
    /*0300*/         IMAD R8, R8, R9, R13;
    /*0308*/         IMAD R0, R0, R16, R14;
    /*0310*/         IMAD R7, R7, R19, R21;
    /*0318*/         IMAD R12, R12, R5, R6;
    /*0320*/         IMAD R8, R8, R9, R13;
    /*0328*/         IMAD R0, R0, R16, R14;
    /*0330*/         IMAD R7, R7, R19, R21;
    /*0338*/         IMAD R12, R12, R5, R6;
    /*0340*/         IMAD R8, R8, R9, R13;
    /*0348*/         IMAD R0, R0, R16, R14;
    /*0350*/         IMAD R7, R7, R19, R21;
    /*0358*/         IMAD R12, R12, R5, R6;
    /*0360*/         IMAD R8, R8, R9, R13;
    /*0368*/         IMAD R0, R0, R16, R14;
    /*0370*/         IMAD R7, R7, R19, R21;
    /*0378*/         IMAD R12, R12, R5, R6;
    /*0380*/         IMAD R8, R8, R9, R13;
    /*0388*/         IMAD R0, R0, R16, R14;
    /*0390*/         IMAD R7, R7, R19, R21;
    /*0398*/         IMAD R12, R12, R5, R6;
    /*03a0*/         IMAD R8, R8, R9, R13;
    /*03a8*/         IMAD R0, R0, R16, R14;
    /*03b0*/         IMAD R7, R7, R19, R21;
    /*03b8*/         IMAD R12, R12, R5, R6;
    /*03c0*/         IMAD R8, R8, R9, R13;
    /*03c8*/         IMAD R0, R0, R16, R14;
    /*03d0*/         IMAD R7, R7, R19, R21;
    /*03d8*/         IMAD R12, R12, R5, R6;
    /*03e0*/         IMAD R8, R8, R9, R13;
    /*03e8*/         IMAD R0, R0, R16, R14;
    /*03f0*/         IMAD R7, R7, R19, R21;
    /*03f8*/         IMAD R12, R12, R5, R6;
    /*0400*/         IMAD R8, R8, R9, R13;
    /*0408*/         IMAD R0, R0, R16, R14;
    /*0410*/         IMAD R7, R7, R19, R21;
    /*0418*/         IMAD R12, R12, R5, R6;
    /*0420*/         IMAD R8, R8, R9, R13;
    /*0428*/         IMAD R0, R0, R16, R14;
    /*0430*/         IMAD R7, R7, R19, R21;
    /*0438*/         IMAD R12, R12, R5, R6;
    /*0440*/         IMAD R8, R8, R9, R13;
    /*0448*/         IMAD R0, R0, R16, R14;
    /*0450*/         IMAD R7, R7, R19, R21;
    /*0458*/         IMAD R12, R12, R5, R6;
    /*0460*/         IMAD R8, R8, R9, R13;
    /*0468*/         IMAD R0, R0, R16, R14;
    /*0470*/         IMAD R7, R7, R19, R21;
    /*0478*/         IMAD R12, R12, R5, R6;
    /*0480*/         IMAD R8, R8, R9, R13;
    /*0488*/         IMAD R0, R0, R16, R14;
    /*0490*/         IMAD R7, R7, R19, R21;
    /*0498*/         IMAD R12, R12, R5, R6;
    /*04a0*/         IMAD R8, R8, R9, R13;
    /*04a8*/         IMAD R0, R0, R16, R14;
    /*04b0*/         IMAD R7, R7, R19, R21;
    /*04b8*/         IMAD R12, R12, R5, R6;
    /*04c0*/         IMAD R8, R8, R9, R13;
    /*04c8*/         IMAD R0, R0, R16, R14;
    /*04d0*/         IMAD R7, R7, R19, R21;
    /*04d8*/         IMAD R12, R12, R5, R6;
    /*04e0*/         IMAD R8, R8, R9, R13;
    /*04e8*/         IMAD R0, R0, R16, R14;
    /*04f0*/         IMAD R7, R7, R19, R21;
    /*04f8*/         IMAD R12, R12, R5, R6;
    /*0500*/         IMAD R8, R8, R9, R13;
    /*0508*/         IMAD R0, R0, R16, R14;
    /*0510*/         IMAD R7, R7, R19, R21;
    /*0518*/         IMAD R12, R12, R5, R6;
    /*0520*/         IMAD R8, R8, R9, R13;
    /*0528*/         IMAD R0, R0, R16, R14;
    /*0530*/         IMAD R7, R7, R19, R21;
    /*0538*/         IMAD R12, R12, R5, R6;
    /*0540*/         IMAD R8, R8, R9, R13;
    /*0548*/         IADD R4, R4, 0x20;
    /*0550*/         IMAD R0, R0, R16, R14;
    /*0558*/         IMAD R7, R7, R19, R21;
    /*0560*/         IMAD R12, R12, R5, R6;
    /*0568*/         IMAD R8, R8, R9, R13;
    /*0570*/         ISETP.NE.AND P0, PT, R4, RZ, PT;
    /*0578*/         IMAD R0, R0, R16, R14;
    /*0580*/         IMAD R18, R7, R19, R21;
    /*0588*/         IMAD R17, R12, R5, R6;
    /*0590*/         IMAD R8, R8, R9, R13;
    /*0598*/         IMAD R0, R0, R16, R14;
    /*05a0*/         IMAD R18, R18, R19, R21;
    /*05a8*/         IMAD R17, R17, R5, R6;
    /*05b0*/         IMAD R8, R8, R9, R13;
    /*05b8*/     @P0 BRA 0x1a8;
    /*05c0*/         MOV32I R4, 0x40000000;
    /*05c8*/         MOV32I R9, 0x4;
    /*05d0*/         IMAD.U32.U32.HI R7, R4, c[0x0][0x38], R15;
    /*05d8*/         IMAD R4.CC, R15, R9, c[0x0][0x20];
    /*05e0*/         IMAD.HI.X R5, R15, R9, c[0x0][0x24];
    /*05e8*/         IMAD.U32.U32 R6.CC, R7, R9, c[0x0][0x20];
    /*05f0*/         IMAD.U32.U32.HI.X R7, R7, R9, c[0x0][0x24];
    /*05f8*/         ST.E [R4], R0;
    /*0600*/         ST.E [R6], R18;
    /*0608*/         ST.E [R10], R17;
    /*0610*/         ST.E [R2], R8;
    /*0618*/         EXIT;

CUDA 6.5

Function : _Z7kernel2PiS_S_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];
    /*0008*/         NOP;
    /*0010*/         MOV R3, c[0x0][0x38];
    /*0018*/         S2R R0, SR_CTAID.X;
    /*0020*/         SHR.U32 R3, R3, 0x2;
    /*0028*/         S2R R2, SR_TID.X;
    /*0030*/         IMAD R0, R0, c[0x0][0x8], R2;
    /*0038*/         ISETP.GE.U32.AND P0, PT, R0, R3, PT;
    /*0040*/     @P0 EXIT;
    /*0048*/         MOV32I R14, 0x4;
    /*0050*/         IADD R21, R0, R3;
    /*0058*/         IMAD R18.CC, R0, R14, c[0x0][0x20];
    /*0060*/         IMAD.HI.X R19, R0, R14, c[0x0][0x24];
    /*0068*/         IMAD R10.CC, R0, R14, c[0x0][0x28];
    /*0070*/         LD.E R15, [R18];
    /*0078*/         IMAD.HI.X R11, R0, R14, c[0x0][0x2c];
    /*0080*/         IMAD R12.CC, R0, R14, c[0x0][0x30];
    /*0088*/         MOV R22, c[0x0][0x38];
    /*0090*/         IMAD.HI.X R13, R0, R14, c[0x0][0x34];
    /*0098*/         IMUL R2, R22, 0x3;
    /*00a0*/         IMAD.U32.U32 R8.CC, R21, R14, c[0x0][0x20];
    /*00a8*/         LD.E R17, [R10];
    /*00b0*/         IMAD.U32.U32.HI.X R9, R21, R14, c[0x0][0x24];
    /*00b8*/         LD.E R20, [R12];
    /*00c0*/         IMAD.U32.U32 R4.CC, R21, R14, c[0x0][0x28];
    /*00c8*/         SHR.U32 R13, R22, 0x1;
    /*00d0*/         IMAD.U32.U32.HI R16, R2, c[0x10][0x0], R0;
    /*00d8*/         LD.E R23, [R8];
    /*00e0*/         IMAD.U32.U32.HI.X R5, R21, R14, c[0x0][0x2c];
    /*00e8*/         IADD R19, R0, R13;
    /*00f0*/         IMAD.U32.U32 R2.CC, R16, R14, c[0x0][0x28];
    /*00f8*/         LD.E R22, [R4];
    /*0100*/         IMAD.U32.U32.HI.X R3, R16, R14, c[0x0][0x2c];
    /*0108*/         IMAD.U32.U32 R6.CC, R16, R14, c[0x0][0x20];
    /*0110*/         LD.E R2, [R2];
    /*0118*/         IMAD.U32.U32.HI.X R7, R16, R14, c[0x0][0x24];
    /*0120*/         MOV32I R3, 0xfffff800;
    /*0128*/         IMAD.U32.U32 R10.CC, R21, R14, c[0x0][0x30];
    /*0130*/         IMAD.U32.U32.HI.X R11, R21, R14, c[0x0][0x34];
    /*0138*/         IMAD.U32.U32 R12.CC, R16, R14, c[0x0][0x30];
    /*0140*/         IMAD.U32.U32.HI.X R13, R16, R14, c[0x0][0x34];
    /*0148*/         LD.E R10, [R10];
    /*0150*/         IMAD.U32.U32 R8.CC, R19, R14, c[0x0][0x28];
    /*0158*/         LD.E R16, [R6];
    /*0160*/         IMAD.U32.U32.HI.X R9, R19, R14, c[0x0][0x2c];
    /*0168*/         LD.E R12, [R12];
    /*0170*/         IMAD.U32.U32 R4.CC, R19, R14, c[0x0][0x30];
    /*0178*/         LD.E R8, [R8];
    /*0180*/         IMAD.U32.U32.HI.X R5, R19, R14, c[0x0][0x34];
    /*0188*/         IMAD.U32.U32 R18.CC, R19, R14, c[0x0][0x20];
    /*0190*/         LD.E R4, [R4];
    /*0198*/         IMAD.U32.U32.HI.X R19, R19, R14, c[0x0][0x24];
    /*01a0*/         LD.E R14, [R18];
    /*01a8*/         IMAD R5, R15, R17, R20;
    /*01b0*/         IMAD R9, R23, R22, R10;
    /*01b8*/         IMAD R11, R14, R8, R4;
    /*01c0*/         IMAD R13, R16, R2, R12;
    /*01c8*/         IMAD R15, R5, R17, R20;
    /*01d0*/         IMAD R21, R9, R22, R10;
    /*01d8*/         IMAD R14, R11, R8, R4;
    /*01e0*/         IMAD R16, R13, R2, R12;
    /*01e8*/         IMAD R15, R15, R17, R20;
    /*01f0*/         IMAD R21, R21, R22, R10;
    /*01f8*/         IMAD R14, R14, R8, R4;
    /*0200*/         IMAD R16, R16, R2, R12;
    /*0208*/         IMAD R15, R15, R17, R20;
    /*0210*/         IMAD R21, R21, R22, R10;
    /*0218*/         IMAD R14, R14, R8, R4;
    /*0220*/         IMAD R16, R16, R2, R12;
    /*0228*/         IMAD R15, R15, R17, R20;
    /*0230*/         IMAD R21, R21, R22, R10;
    /*0238*/         IMAD R14, R14, R8, R4;
    /*0240*/         IMAD R16, R16, R2, R12;
    /*0248*/         IMAD R15, R15, R17, R20;
    /*0250*/         IMAD R21, R21, R22, R10;
    /*0258*/         IMAD R14, R14, R8, R4;
    /*0260*/         IMAD R16, R16, R2, R12;
    /*0268*/         IMAD R15, R15, R17, R20;
    /*0270*/         IMAD R21, R21, R22, R10;
    /*0278*/         IMAD R14, R14, R8, R4;
    /*0280*/         IMAD R16, R16, R2, R12;
    /*0288*/         IMAD R15, R15, R17, R20;
    /*0290*/         IMAD R21, R21, R22, R10;
    /*0298*/         IMAD R14, R14, R8, R4;
    /*02a0*/         IMAD R16, R16, R2, R12;
    /*02a8*/         IMAD R15, R15, R17, R20;
    /*02b0*/         IMAD R21, R21, R22, R10;
    /*02b8*/         IMAD R14, R14, R8, R4;
    /*02c0*/         IMAD R16, R16, R2, R12;
    /*02c8*/         IMAD R15, R15, R17, R20;
    /*02d0*/         IMAD R21, R21, R22, R10;
    /*02d8*/         IMAD R14, R14, R8, R4;
    /*02e0*/         IMAD R16, R16, R2, R12;
    /*02e8*/         IMAD R15, R15, R17, R20;
    /*02f0*/         IMAD R21, R21, R22, R10;
    /*02f8*/         IMAD R14, R14, R8, R4;
    /*0300*/         IMAD R16, R16, R2, R12;
    /*0308*/         IMAD R15, R15, R17, R20;
    /*0310*/         IMAD R21, R21, R22, R10;
    /*0318*/         IMAD R14, R14, R8, R4;
    /*0320*/         IMAD R16, R16, R2, R12;
    /*0328*/         IMAD R15, R15, R17, R20;
    /*0330*/         IMAD R21, R21, R22, R10;
    /*0338*/         IMAD R14, R14, R8, R4;
    /*0340*/         IMAD R16, R16, R2, R12;
    /*0348*/         IMAD R15, R15, R17, R20;
    /*0350*/         IMAD R21, R21, R22, R10;
    /*0358*/         IMAD R14, R14, R8, R4;
    /*0360*/         IMAD R16, R16, R2, R12;
    /*0368*/         IMAD R15, R15, R17, R20;
    /*0370*/         IMAD R21, R21, R22, R10;
    /*0378*/         IMAD R14, R14, R8, R4;
    /*0380*/         IMAD R16, R16, R2, R12;
    /*0388*/         IMAD R15, R15, R17, R20;
    /*0390*/         IMAD R21, R21, R22, R10;
    /*0398*/         IMAD R14, R14, R8, R4;
    /*03a0*/         IMAD R16, R16, R2, R12;
    /*03a8*/         IMAD R15, R15, R17, R20;
    /*03b0*/         IMAD R21, R21, R22, R10;
    /*03b8*/         IMAD R14, R14, R8, R4;
    /*03c0*/         IMAD R16, R16, R2, R12;
    /*03c8*/         IMAD R15, R15, R17, R20;
    /*03d0*/         IMAD R21, R21, R22, R10;
    /*03d8*/         IMAD R14, R14, R8, R4;
    /*03e0*/         IMAD R16, R16, R2, R12;
    /*03e8*/         IMAD R15, R15, R17, R20;
    /*03f0*/         IMAD R21, R21, R22, R10;
    /*03f8*/         IMAD R14, R14, R8, R4;
    /*0400*/         IMAD R16, R16, R2, R12;
    /*0408*/         IMAD R15, R15, R17, R20;
    /*0410*/         IMAD R21, R21, R22, R10;
    /*0418*/         IMAD R14, R14, R8, R4;
    /*0420*/         IMAD R16, R16, R2, R12;
    /*0428*/         IMAD R15, R15, R17, R20;
    /*0430*/         IMAD R21, R21, R22, R10;
    /*0438*/         IMAD R14, R14, R8, R4;
    /*0440*/         IMAD R16, R16, R2, R12;
    /*0448*/         IMAD R15, R15, R17, R20;
    /*0450*/         IMAD R21, R21, R22, R10;
    /*0458*/         IMAD R14, R14, R8, R4;
    /*0460*/         IMAD R16, R16, R2, R12;
    /*0468*/         IMAD R15, R15, R17, R20;
    /*0470*/         IMAD R21, R21, R22, R10;
    /*0478*/         IMAD R14, R14, R8, R4;
    /*0480*/         IMAD R16, R16, R2, R12;
    /*0488*/         IMAD R15, R15, R17, R20;
    /*0490*/         IMAD R21, R21, R22, R10;
    /*0498*/         IMAD R14, R14, R8, R4;
    /*04a0*/         IMAD R16, R16, R2, R12;
    /*04a8*/         IMAD R15, R15, R17, R20;
    /*04b0*/         IMAD R21, R21, R22, R10;
    /*04b8*/         IMAD R14, R14, R8, R4;
    /*04c0*/         IMAD R16, R16, R2, R12;
    /*04c8*/         IMAD R15, R15, R17, R20;
    /*04d0*/         IMAD R21, R21, R22, R10;
    /*04d8*/         IMAD R14, R14, R8, R4;
    /*04e0*/         IMAD R16, R16, R2, R12;
    /*04e8*/         IMAD R15, R15, R17, R20;
    /*04f0*/         IMAD R21, R21, R22, R10;
    /*04f8*/         IMAD R14, R14, R8, R4;
    /*0500*/         IMAD R16, R16, R2, R12;
    /*0508*/         IMAD R15, R15, R17, R20;
    /*0510*/         IMAD R21, R21, R22, R10;
    /*0518*/         IMAD R14, R14, R8, R4;
    /*0520*/         IMAD R16, R16, R2, R12;
    /*0528*/         IMAD R15, R15, R17, R20;
    /*0530*/         IMAD R21, R21, R22, R10;
    /*0538*/         IMAD R14, R14, R8, R4;
    /*0540*/         IMAD R16, R16, R2, R12;
    /*0548*/         IADD R3, R3, 0x20;
    /*0550*/         IMAD R15, R15, R17, R20;
    /*0558*/         IMAD R21, R21, R22, R10;
    /*0560*/         IMAD R14, R14, R8, R4;
    /*0568*/         IMAD R16, R16, R2, R12;
    /*0570*/         ISETP.NE.AND P0, PT, R3, RZ, PT;
    /*0578*/         IMAD R15, R15, R17, R20;
    /*0580*/         IMAD R23, R21, R22, R10;
    /*0588*/         IMAD R14, R14, R8, R4;
    /*0590*/         IMAD R16, R16, R2, R12;
    /*0598*/         IMAD R15, R15, R17, R20;
    /*05a0*/         IMAD R23, R23, R22, R10;
    /*05a8*/         IMAD R14, R14, R8, R4;
    /*05b0*/         IMAD R16, R16, R2, R12;
    /*05b8*/     @P0 BRA 0x1a8;
    /*05c0*/         MOV32I R2, 0x40000000;
    /*05c8*/         MOV32I R8, 0x4;
    /*05d0*/         IMAD.U32.U32.HI R4, R2, c[0x0][0x38], R0;
    /*05d8*/         IMAD R2.CC, R0, R8, c[0x0][0x20];
    /*05e0*/         IMAD.HI.X R3, R0, R8, c[0x0][0x24];
    /*05e8*/         IMAD.U32.U32 R14.CC, R4, R8, c[0x0][0x20];
    /*05f0*/         IMAD.U32.U32.HI.X R15, R4, R8, c[0x0][0x24];
    /*05f8*/         ST.E [R2], R5;
    /*0600*/         ST.E [R14], R9;
    /*0608*/         ST.E [R18], R11;
    /*0610*/         ST.E [R6], R13;
    /*0618*/         EXIT;

The ST operations for both CUDA 5.5 and 6.0 operate on R10, R18, R17 and R8 which are updated by the last IMAD’s, while for CUDA 6.5 they operate on R5, R9, R11 and R13 which are not updated by the last IMAD’s.

What I’m doing wrong?

My set up: core i7 laptop with Windows 7, compiling a release project, either 32-bit or 64-bit (same problem). The above refers to 64-bit.

Command line:

# (Approximate command-line.  Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)

# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" --use-local-env --cl-  version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"       --  keep --keep-dir x64\Release -maxrregcount=0 --ptxas-options=-v --machine 32 --compile -cudart static    -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" --use-local-env --cl-  version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\x86_amd64"       --  keep --keep-dir x64\Release -maxrregcount=0 --ptxas-options=-v --machine 32 --compile -cudart static        -Xcompiler "/EHsc  /nologo  /Zi    " -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"

OTHER INFORMATION

The problem only occurs when the two additional kernels are present. If I comment them out, everything works. The disassembled codes for the working and non-working cases are reported here http://pastebin.com/9ukPyZCe and here http://pastebin.com/vWUR4cnS, respectively.

I have also dropped the code above into the vectorAdd CUDA Sample project and, if I compile under the release mode instead of the debug one, I still experience the same issue.

A bug (1552767) has been filed by Robert Crovella.

how did you manage so many different cuda versions - are you using multiple machines…?

personally, i think such a non-reliance on a single version is the way to go when it comes to upgrades; i have not upgraded to 6.5 yet, and truthfully i am reluctant

I have many CUDA version on my laptop. They can simply co-exist.

I do not think you have to worry too much about this possible bug. It appears that it occurs only under very limited conditions, see [url]Wrong output with loop unroll under CUDA 6.5 - Stack Overflow.