Problem with accessing member variables using struct in release mode.

Hi there, i have been having some problems in a case where a local variable of struct type is loaded from global memory and then the members of the struct are accessed based on a conditional. The following code is a minimal example:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>

void checkCudaError(cudaError_t result)
{
    if (result != cudaSuccess)
    {
        std::cout << "ERROR: " << result << std::endl;
    }
}

std::ostream& operator << (std::ostream & out, const float3 & v)
{
    out << "(" << v.x << "," << v.y << "," << v.z << ")";
    return out;
}

struct StructCombined
{
    float3 m_a;
    float3 m_b;
};

struct StructSplit
{
    float3 m_singleValue;
};

__global__
void splitStructKernel(StructSplit* splitOut,
                       const StructCombined* combinedIn)
{
    int index = threadIdx.x;

    float3 result;
    StructCombined combinedStruct = combinedIn[0];

    //based on this logic the first thread should get (1.,1.,1.)
    //and the second should get (2.,2.,2.) but they both seem to get (2.,2.,2.) in release mode!
    if (index == 0)
    {
        float3 result1 = combinedStruct.m_a;
        result = result1;
    }
    else
    {
        float3 result2 = combinedStruct.m_b;
        result = result2;
    }
    splitOut[index].m_singleValue = result;
}


int main()
{
    checkCudaError(cudaSetDevice(0));

    StructCombined hostCombinedStruct;
    hostCombinedStruct.m_a = { 1.,1.,1. };
    hostCombinedStruct.m_b = { 2.,2.,2. };

    StructCombined* deviceCombinedStruct;
    checkCudaError(cudaMalloc(&deviceCombinedStruct, sizeof(StructCombined)));
    checkCudaError(cudaMemcpy(deviceCombinedStruct, &hostCombinedStruct, sizeof(StructCombined), cudaMemcpyHostToDevice));

    StructSplit hostSplitStruct[2];
    StructSplit* deviceSplitStruct;
    checkCudaError(cudaMalloc(&deviceSplitStruct, 2 * sizeof(StructSplit)));

    splitStructKernel << <1, 2>>> (deviceSplitStruct, deviceCombinedStruct);
    checkCudaError(cudaDeviceSynchronize());

    checkCudaError(cudaMemcpy(hostSplitStruct, deviceSplitStruct, 2 * sizeof(StructSplit), cudaMemcpyDeviceToHost));

    std::cout << hostSplitStruct[0].m_singleValue << std::endl;
    std::cout << hostSplitStruct[1].m_singleValue << std::endl;
    
    checkCudaError(cudaFree(deviceCombinedStruct));
    checkCudaError(cudaFree(deviceSplitStruct));

    return 0;
}

The code should take the input struct ‘StructCombined’ and then split it up into 2 separate ‘StructSplit’ variables, by assigning m_a and m_b to 2 separate instances of StructSplit which are in an array of length 2. The kernel ‘splitStructKernel’ is run with 2 threads and the first thread should handle m_a and the second should handle m_b. The code works fine when compiled in debug mode and produces the expected output ‘(1,1,1) (2,2,2)’ but in release mode it seems that both threads take m_b and the output is ‘(2,2,2) (2,2,2)’

Any help with this would be much appreciated.

system:

windows 10
microsoft visual studio 2019

nvcc version:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:12:52_Pacific_Daylight_Time_2019
Cuda compilation tools, release 10.1, V10.1.243

Hardware:

GeForce RTX 2070

Hi there,
first of all I observe the same behaviour with sm_61 and cuda 10.2.

Judging from the generated ptx assembly ( Compiler Explorer ) I assume that the initial load of combinedIn[0] is incorrect.

ld.param.u64    %rd1, [_Z17splitStructKernelP11StructSplitPK14StructCombined_param_0];
        ld.param.u64    %rd2, [_Z17splitStructKernelP11StructSplitPK14StructCombined_param_1];
        cvta.to.global.u64      %rd3, %rd1;
        cvta.to.global.u64      %rd4, %rd2;
        mov.u32         %r1, %tid.x;
        ld.global.f32   %f1, [%rd4+12];
        ld.global.f32   %f2, [%rd4+16];
        ld.global.f32   %f3, [%rd4+20];
        mul.wide.s32    %rd5, %r1, 12;
        add.s64         %rd6, %rd3, %rd5;
        st.global.f32   [%rd6], %f1;
        st.global.f32   [%rd6+4], %f2;
        st.global.f32   [%rd6+8], %f3;
        ret;

Out of the 6 floats, only the last 3 (offset 12, 16, 20) are fetched from memory.

If the code is only slightly modified, the correct loads are performed.
For example when loading result2 directly from memory float3 result2 = combinedIn[0].m_b; both parts are loaded correctly. (0,4,8,12,16,20)

ld.param.u64    %rd3, [_Z17splitStructKernelP11StructSplitPK14StructCombined_param_0];
        ld.param.u64    %rd4, [_Z17splitStructKernelP11StructSplitPK14StructCombined_param_1];
        cvta.to.global.u64      %rd1, %rd3;
        cvta.to.global.u64      %rd2, %rd4;
        ld.global.f32   %f16, [%rd2];
        ld.global.f32   %f17, [%rd2+4];
        ld.global.f32   %f18, [%rd2+8];
        mov.u32         %r1, %tid.x;
        setp.eq.s32     %p1, %r1, 0;
        @%p1 bra        BB0_2;

        ld.global.f32   %f16, [%rd2+12];
        ld.global.f32   %f17, [%rd2+16];
        ld.global.f32   %f18, [%rd2+20];

        mul.wide.s32    %rd5, %r1, 12;
        add.s64         %rd6, %rd1, %rd5;
        st.global.f32   [%rd6], %f16;
        st.global.f32   [%rd6+4], %f17;
        st.global.f32   [%rd6+8], %f18;
        ret;

What’s also funny: enclosing the code with if(true) also solves the problem.

if(true){
    if (index == 0)
    {
        float3 result1 = combinedStruct.m_a;
        result = result1;
    }
    else
    {
        float3 result2 = combinedStruct.m_b;
        result = result2;
    }
    splitOut[index].m_singleValue = result;
    }

compiles to

ld.global.f32   %f1, [%rd4];
        ld.global.f32   %f2, [%rd4+4];
        ld.global.f32   %f3, [%rd4+8];
        ld.global.f32   %f4, [%rd4+12];
        ld.global.f32   %f5, [%rd4+16];
        ld.global.f32   %f6, [%rd4+20];
        setp.eq.s32     %p1, %r1, 0;
        selp.f32        %f7, %f1, %f4, %p1;
        selp.f32        %f8, %f2, %f5, %p1;
        selp.f32        %f9, %f3, %f6, %p1;
        mul.wide.s32    %rd5, %r1, 12;
        add.s64         %rd6, %rd3, %rd5;
        st.global.f32   [%rd6], %f7;
        st.global.f32   [%rd6+4], %f8;
        st.global.f32   [%rd6+8], %f9;

I suggest filing a bug. The instructions are linked in a sticky post at the top of this forum.

Thank you both for your replies and investigating - i have now filed a bug.