Hi there, I have been having a problem in a case where a local variable of struct type is loaded from global memory and the members of the struct are then accessed based on a conditional. The following code is a minimal example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// Reports a failed CUDA runtime call on standard error.
//
// result: status code returned by a CUDA runtime API call.
//
// On any status other than cudaSuccess, prints the symbolic error name, the
// numeric code, and the runtime's description. Execution is not aborted, so
// callers keep running after a failure exactly as before.
void checkCudaError(cudaError_t result)
{
    if (result == cudaSuccess)
    {
        return;
    }

    // The bare enum value is hard to interpret; include the name and the
    // human-readable description supplied by the runtime.
    std::cerr << "ERROR: " << cudaGetErrorName(result)
              << " (" << static_cast<int>(result) << "): "
              << cudaGetErrorString(result) << std::endl;
}
// Streams a float3 as "(x,y,z)" and hands the stream back so that further
// insertions can be chained.
std::ostream& operator<<(std::ostream& out, const float3& v)
{
    return out << "(" << v.x << "," << v.y << "," << v.z << ")";
}
// Input record for splitStructKernel: holds the two float3 values that the
// kernel hands out to different threads.
struct StructCombined
{
float3 m_a; // taken by the thread whose threadIdx.x is 0
float3 m_b; // taken by every other thread
};
// Output record for splitStructKernel: one float3 per thread.
struct StructSplit
{
float3 m_singleValue; // the member of StructCombined chosen by that thread
};
// Copies one member of combinedIn[0] into splitOut[threadIdx.x]:
// thread 0 takes m_a, every other thread takes m_b.
//
// splitOut:   output array; needs at least blockDim.x elements because the
//             write index is threadIdx.x and there is no bounds check.
// combinedIn: input array; only element 0 is read.
//
// NOTE(review): blockIdx is not part of the index, so with more than one
// block every block writes to the same output slots.
__global__
void splitStructKernel(StructSplit* splitOut,
const StructCombined* combinedIn)
{
int index = threadIdx.x;
float3 result;
// Every thread first copies the whole input struct into a local variable.
StructCombined combinedStruct = combinedIn[0];
// Expected with the inputs set up in main: thread 0 stores (1,1,1) and
// thread 1 stores (2,2,2).
// Reported behaviour: debug builds match this, but in release builds both
// threads appear to store (2,2,2).
if (index == 0)
{
float3 result1 = combinedStruct.m_a;
result = result1;
}
else
{
float3 result2 = combinedStruct.m_b;
result = result2;
}
splitOut[index].m_singleValue = result;
}
// Host driver for the reproducer: uploads one StructCombined, runs
// splitStructKernel with a single block of two threads, copies the two
// StructSplit results back and prints them, one per line.
//
// Returns 0 unconditionally; CUDA failures are only reported through
// checkCudaError.
int main()
{
    // One thread per output element; the kernel indexes by threadIdx.x only.
    constexpr int kThreadCount = 2;

    checkCudaError(cudaSetDevice(0));

    // float literals avoid an implicit double -> float conversion.
    StructCombined hostCombinedStruct;
    hostCombinedStruct.m_a = { 1.0f, 1.0f, 1.0f };
    hostCombinedStruct.m_b = { 2.0f, 2.0f, 2.0f };

    StructCombined* deviceCombinedStruct = nullptr;
    checkCudaError(cudaMalloc(&deviceCombinedStruct, sizeof(StructCombined)));
    checkCudaError(cudaMemcpy(deviceCombinedStruct, &hostCombinedStruct,
                              sizeof(StructCombined), cudaMemcpyHostToDevice));

    // Zero-initialised so that a failed launch or copy prints zeros rather
    // than indeterminate values.
    StructSplit hostSplitStruct[kThreadCount] = {};
    StructSplit* deviceSplitStruct = nullptr;
    checkCudaError(cudaMalloc(&deviceSplitStruct, kThreadCount * sizeof(StructSplit)));

    splitStructKernel<<<1, kThreadCount>>>(deviceSplitStruct, deviceCombinedStruct);
    // A kernel launch returns no status itself: launch-configuration errors
    // are picked up here, execution errors at the synchronize below.
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaDeviceSynchronize());

    checkCudaError(cudaMemcpy(hostSplitStruct, deviceSplitStruct,
                              kThreadCount * sizeof(StructSplit), cudaMemcpyDeviceToHost));

    for (int i = 0; i < kThreadCount; ++i)
    {
        std::cout << hostSplitStruct[i].m_singleValue << std::endl;
    }

    checkCudaError(cudaFree(deviceCombinedStruct));
    checkCudaError(cudaFree(deviceSplitStruct));
    return 0;
}
The code should take the input struct ‘StructCombined’ and split it into 2 separate ‘StructSplit’ variables by assigning m_a and m_b to 2 separate instances of StructSplit, which are stored in an array of length 2. The kernel ‘splitStructKernel’ is run with 2 threads; the first thread should handle m_a and the second should handle m_b. The code works fine when compiled in debug mode and produces the expected output ‘(1,1,1) (2,2,2)’, but in release mode it seems that both threads take m_b and the output is ‘(2,2,2) (2,2,2)’.
Any help with this would be much appreciated.
system:
windows 10
microsoft visual studio 2019
nvcc version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:12:52_Pacific_Daylight_Time_2019
Cuda compilation tools, release 10.1, V10.1.243
Hardware:
GeForce RTX 2070