Hi,
I have discovered a problem with the cudaMemcpyToSymbol function and separable compilation when an executable is defined as a .cu file. I am using CMake to build my program.
Our application has a static initialized object which initializes the CUDA library and copies a constant to the constant memory cache as a cuComplex. This is occurring in the “Constants.cu” file listed below.
However, when the executable is defined as a .cu file, I am getting a CUDA error which states “invalid device symbol”. This issue does NOT occur if I make the executable a .cpp file. As every single file in our application goes through this startup process, I would like to allow .cu executables to be built, but at the moment I cannot allow this due to this problem.
The static initialization code is defined in a static library which is linked into a shared library via separable compilation. The non-functioning code has been placed inside either the static library or the shared library, and works properly in neither case.
Why is this not working when an executable is defined as a .cu file in CMake?
The code with non-relevant portions stripped out:
Constants.hpp
#ifndef DEWM_Constants_INC
#define DEWM_Constants_INC

#include <cmath>
#include <complex>

// NOTE(review): this must test __CUDACC__ (defined by nvcc for BOTH the host
// and device compilation passes), not __CUDA_ARCH__ (defined ONLY during the
// device pass). With __CUDA_ARCH__, the host pass of nvcc never saw
// cuComplex.h even though the extern declaration below references cuComplex.
#ifdef __CUDACC__
#include <cuComplex.h>
#endif

#include <DEWM/cuda/CudaInterface.hpp>

namespace dewm
{
#ifdef __CUDACC__
/*!< Average dielectric constant for polarization computation. Obtained from "Radar Reflectivity of Land and Sea".
Currently using the value stated for 'average land' of 10 for epsilon prime. Epsilon double prime isn't given, so a value was used in between moist and dry land in the table.*/
// cuComplex doesn't provide a constructor so this has to be stored in the constant memory cache.
// Also, this constant must be defined in a single .cu file, to avoid multiple definition errors. This is due to how
// the CUDA file linking works.
// Guarded by __CUDACC__: the __device__ __constant__ qualifiers and cuComplex
// only exist when compiling with nvcc.
extern __device__ __constant__ cuComplex CU_AVERAGE_LAND_DIELECTRIC_CONSTANT;
#endif

/*!
 * \brief Structs that are defined as const in this header are not usable in CUDA kernels. CUDA can only use constants defined in the host
 * if they are of floating or integral type. To use these structs in CUDA kernels, we must copy them to the constant memory cache on each
 * cuda card.
 */
CUDA_HOST void writeCudaStructsToConstantMemory( );
}
#endif
Constants.cu ( in libDEWM.a or libDEWM.so, doesn’t work in either case )
#include <DEWM/Constants.hpp>
#include <iostream>

namespace dewm
{
#ifdef DEWM_CUDA_ENABLED

// Single definition of the constant-memory symbol for the CUDA library.
__device__ __constant__ cuComplex CU_AVERAGE_LAND_DIELECTRIC_CONSTANT;

/*!
 * \brief Copies the average-land dielectric constant into the constant memory
 * cache of the currently selected CUDA device.
 *
 * NOTE(review): under separable compilation a __constant__ symbol belongs to
 * one device-link unit. Issuing cudaMemcpyToSymbol from a *different*
 * device-link unit (e.g. an executable that is itself a .cu file and therefore
 * receives its own device link from CMake) can fail with "invalid device
 * symbol" — presumably the cause of the reported problem; verify against the
 * build setup (CUDA_SEPARABLE_COMPILATION / CUDA_RESOLVE_DEVICE_SYMBOLS and
 * where the final device link happens).
 */
CUDA_HOST void writeCudaStructsToConstantMemory( )
{
    // cuComplex has no user-declared constructor, so build the value with the
    // factory helper. Value per "Radar Reflectivity of Land and Sea".
    static const cuComplex averageLandDielectric = make_cuFloatComplex( 4.0f, -0.006f );
    checkCudaError( cudaMemcpyToSymbol( CU_AVERAGE_LAND_DIELECTRIC_CONSTANT,
                                        &averageLandDielectric,
                                        sizeof( cuComplex ),
                                        0,
                                        cudaMemcpyHostToDevice ) );
}

#endif
}
CudaInterface.hpp
#ifndef DEWM_SignalCudaInterface_INC
#define DEWM_SignalCudaInterface_INC 1

#include <cuda.h>
#include <cuda_profiler_api.h>

// Standard-library headers used directly in this file (fprintf,
// std::runtime_error, std::vector, uint32_t). Previously these were only
// available transitively, which breaks with different toolchain versions.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

//#define DEWM_CUDA_DEBUG_LITE 1
// #define DEWM_CUDA_ERROR_OUTPUT 1
#define DEWM_CUDA_DEVICE_PROP_OUTPUT 1

// Define CUDA Host device macros for compatibility with NON-CUDA builds.
#ifdef __CUDACC__
#define CUDA_HOST_DEVICE __host__ __device__
#define CUDA_KERNEL __global__
#define CUDA_DEVICE __device__
#define CUDA_HOST __host__
#define CUDA_CONSTANT_CACHE __constant__
#else
#define CUDA_HOST_DEVICE
#define CUDA_KERNEL
#define CUDA_CONSTANT_CACHE
#define CUDA_DEVICE
#define CUDA_HOST
#endif

#ifdef DEWM_CUDA_ENABLED
/**
 * \def checkCudaError( cudaError )
 * \brief Macro to call the cuda check function. Simply wrap this macro around any cuda runtime api library calls being made.
 * \param cudaError result from a cuda runtime api function call.
 */
#define checkCudaError( cudaError ) dewmCheckCudaErrorImpl( cudaError, __FILE__, __LINE__ )

/**
 * \brief Checks the return value of any cuda function for errors.
 * If an error occurs, the line number and error type is displayed for debugging purposes. Additionally,
 * the model will be terminated when an error is encountered. This method is preferred to the old method of
 * checking cuda errors through looking at the last error as it allows the error to be isolated to a single line
 * and file.
 * NOTE: This function is not explicitly called. It can only be called properly via the use of the macro checkCudaError
 * above.
 *
 * This can be used with kernel launches as well, by calling cudaPeekAtLastError and cudaDeviceSynchronize and
 * wrapping both functions with the macro above.
 * \param result_t result from a cuda runtime api library function
 * \param file File that the cuda check was used in
 * \param line Line number of the cuda check
 * \throws std::runtime_error when result_t is not cudaSuccess.
 */
// Renamed from __checkCudaError: identifiers containing a double underscore
// are reserved for the implementation in C++ ([lex.name]). The public macro
// name checkCudaError is unchanged, so call sites are unaffected.
inline void dewmCheckCudaErrorImpl ( cudaError_t result_t, const char * file, const int line )
{
    if ( cudaSuccess != result_t )
    {
        // Removed an unused local std::string that was declared here.
        fprintf ( stderr, "\x1B[31m CUDA error encountered in file '%s', line %d\n Error %d: %s\n Terminating DEWM!\n \x1B[0m", file, line, result_t,
                  cudaGetErrorString ( result_t ) );
        throw std::runtime_error ( "checkCUDAError : ERROR: CUDA Error" );
    }
}

// Forward declare cudaDeviceProp
struct cudaDeviceProp;

namespace dewm
{
namespace cuda
{
// Returns the ids of the CUDA devices recorded at library initialization.
const std::vector<uint32_t> & CudaGetAvailableDevices();

/*!
 * \brief Constructor initializes the CUDA library and per-device data;
 * destructor releases it. A static instance is created in CudaInterface.cpp.
 */
class SignalCudaInterface
{
public:
    SignalCudaInterface();
    ~SignalCudaInterface();
    void initializeDeviceData();
};
}
}
#endif
#endif
CudaInterface.cpp (in libDEWM_CUDA.a )
// Standard library
#include <cstdint>
#include <iostream>
#include <map>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

// Project
#include <DEWM/cuda/CudaInterface.hpp>

// CUDA
#include <cuComplex.h>
void checkCUDAError(const std::string & message)
{
cudaError_t error = cudaGetLastError();
if (cudaSuccess != error)
{
std::cerr << "CUDA Error: " << message << ": " << cudaGetErrorString( error) << std::endl;
throw std::runtime_error( "checkCUDAError : ERROR: CUDA Error");
}
}
// Guards all reads/writes of the CUDA device bookkeeping globals below.
std::mutex setCudaDeviceMutex;
// Properties of each device accepted by CudaLibInitUnlocked (major >= 3).
std::vector<cudaDeviceProp> availableDevices;
std::vector<uint32_t> availableDevicesId; /*!< Vector of available CUDA devices */
int currentDeviceIndex; /*!< Iterator to currently selected CUDA device (global) */
std::map<std::thread::id, uint32_t> m_threadMappedCurDevIndex; /*!< The device that each thread is currently using. */
// NOTE(review): iterator into availableDevices — invalidated whenever that
// vector reallocates or is cleared (e.g. in CudaLibRelease).
std::vector<cudaDeviceProp>::iterator currentDeviceProperties;
namespace dewm
{
namespace cuda
{
/**
* Performs CUDA library initialization functions, distinct from
* API locking.
*/
void CudaLibInitUnlocked()
{
if (availableDevices.empty())
{
checkCudaError( cudaProfilerStart());
int deviceCount;
cudaDeviceProp deviceProperties;
checkCudaError( cudaGetDeviceCount( &deviceCount));
for (int device( 0); device < deviceCount; ++device)
{
checkCudaError( cudaGetDeviceProperties( &deviceProperties, device));
#ifdef DEWM_CUDA_DEVICE_PROP_OUTPUT
std::cout << "Device : " << device << " has compute capability " << deviceProperties.major << "." << deviceProperties.minor << std::endl;
std::cout << "Device Name : " << deviceProperties.name << std::endl;
std::cout << "Number of MultiProcessors : " << deviceProperties.multiProcessorCount << std::endl;
std::cout << "Number of Threads Per MultiProcessor : " << deviceProperties.maxThreadsPerMultiProcessor << std::endl;
std::cout << "Number of Threads Per Block : " << deviceProperties.maxThreadsPerBlock << std::endl;
std::cout << "Clock Rate in kHz : " << deviceProperties.clockRate << std::endl;
std::cout << "Memory Clock in kHz : " << deviceProperties.memoryClockRate << std::endl;
std::cout << "Concurrent Kernels : " << deviceProperties.concurrentKernels << std::endl;
std::cout << std::endl;
#endif
// If this is the only CUDA device then try to use it even if there are monitors attached to it
#ifndef _WIN32 // On Windows, deviceProperties.kernelExecTimeoutEnabled is equal to 1 without monitors attached
//if (deviceCount == 1 || deviceProperties.kernelExecTimeoutEnabled == 0)
#endif
//{
// Check the CUDA compute version. We need greater than 3.0
if (deviceProperties.major >= 3)
{
//#ifdef DEWM_CUDA_DEBUG_LITE
std::cout << "Added cuda device: " << device << std::endl;
//#endif
availableDevices.push_back( deviceProperties);
availableDevicesId.push_back( device);
}
//}
}
// Set the first device as our first device
currentDeviceIndex = 0;
currentDeviceProperties = availableDevices.begin();
}
}
void CudaLibInit()
{
std::lock_guard < std::mutex > setCudaDeviceLock( setCudaDeviceMutex);
CudaLibInitUnlocked();
}
void CudaLibRelease()
{
// NOTE: CUDA Library releases at DEWM destruction.
std::lock_guard< std::mutex > setCudaDeviceLock ( setCudaDeviceMutex );
std::vector < cudaDeviceProp >::reverse_iterator propIter = availableDevices.rbegin();
std::vector < uint32_t >::reverse_iterator idIter = availableDevicesId.rbegin();
// Clear available ids and devices.
availableDevicesId.clear();
availableDevices.clear();
// Reset allocations on the current device.
checkCudaError( cudaDeviceReset());
checkCudaError( cudaProfilerStop() );
}
SignalCudaInterface::SignalCudaInterface()
{
dewm::cuda::CudaLibInit();
initializeDeviceData();
}
SignalCudaInterface::~SignalCudaInterface()
{
dewm::cuda::CudaLibRelease();
}
void SignalCudaInterface::initializeDeviceData()
{
// NOTE: CUDA Library releases at DEWM destruction.
std::lock_guard< std::mutex > setCudaDeviceLock ( setCudaDeviceMutex );
uint32_t index(0);
for ( uint32_t currentDevice : CudaGetAvailableDevices())
{
std::cout << "Writing struct to device: " << currentDevice << std::endl;
checkCudaError( cudaDeviceSetCacheConfig( cudaFuncCachePreferNone));
checkCudaError( cudaSetDevice(currentDevice));
// Write any necessary structures to the constant memory cache...
writeCudaStructsToConstantMemory();
std::cout << "Finishing writing initial device data" << std::endl;
}
}
static SignalCudaInterface m_interface; /*!< CUDA Library object created at runtime in DEWM. Initializes and releases the CUDA libraries. >*/
}
}
Main: compiled executable
/*
 * testCompilation.cu
 *
 * Created on: Feb 22, 2019
 * Author: tstrutz
 */
#include <DEWM/cuda/CudaInterface.hpp>

// Linking this translation unit pulls in the static SignalCudaInterface
// object defined in CudaInterface.cpp; its constructor performs all of the
// CUDA setup. main() itself intentionally does nothing.
int main( int argc, char ** argv )
{
    return 0;
}