How can you correctly declare and use a global device variable across different CUDA translation units?

wusehn · March 27, 2025, 9:42am

I tried defining a __device__ variable in one source file, assigning it an address and initializing it. Then, I used extern to reference this variable in another source file. Before printing, I ensured the variable was initialized. However, when printing the variable in both files, the one that wasn’t defined and initialized appeared to be empty.

How should I correctly use a __device__ variable across multiple source files?

__device__ float *sx_begin;
cudaMalloc(sx_begin,sizeof(int));
cudaMemset(sx_begin,INT_MAX,sizeof(int));

In another file:
extern __device__ float *sx_begin;

striker159 · March 27, 2025, 10:15am

First of all, I think you need to use cudaMemcpyToSymbol to copy the pointer of allocated memory into the device variable sx_begin, i.e. cudaMalloc(ptr...); cudaMemset(ptr....); cudaMemcpyToSymbol(sx_begin, ptr....).

Second, you did not show how you compile your example. To use device variables across multiple translation units, one needs to compile with relocatable device code -rdc=true.

(To share a pointer to a cudaMalloc allocation, you do not need to use a __device__ variable)

wusehn · March 27, 2025, 10:37am

Here is my code,I use this to init my data,and I want to use in another file.

__constant__ RawImageParameter d_rawImageParameter;
__constant__ DisparityParameter d_disparityParameter;
__constant__ FilterParameterDevice d_filterPatameterDevice; 
__device__ MicroImageParameterDevice d_microImageParameter; 
__device__ float* d_costVol;
__device__ float* d_rawDisp;
__device__ float* d_ppLensMeanDisp;
__device__ float* d_renderCache;
__device__ float* d_inputImg;
__device__ float* d_inputImgRec;
__device__ RanderMapPatch *d_ppRanderMapPatch;
//__device__ float *d_randerImg;
__device__ float *d_tmp;
__device__ float *d_simg;
__device__ int *sx_begin, *sy_begin, *sx_end, *sy_end;
__device__ int *d_randerMapWidth, *d_randerMapHeight;

__constant__ float *d_fltMax;
void DataParameter::mapToGPU()
{
    CUDA_CHECK(cudaMemcpyToSymbol(d_rawImageParameter, &m_rawImageParameter, sizeof(RawImageParameter)));
    CUDA_CHECK(cudaMemcpyToSymbol(d_disparityParameter, &m_disparityParameter, sizeof(DisparityParameter)));

    float fltMax = FLT_MAX;
    CUDA_CHECK(cudaMemcpyToSymbol(d_fltMax, &fltMax, sizeof(float)));

    int* d_validNeighborPixelsNum;
    int* d_validPixelsMask;
    float* d_filterKernel;

    CUDA_CHECK(cudaMalloc((void**)&d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->data, 
                          m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->data, 
                          m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_filterKernel, m_filterPatameter.m_filterKnernel.total() * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_filterKernel, m_filterPatameter.m_filterKnernel.data, 
                          m_filterPatameter.m_filterKnernel.total() * sizeof(float), cudaMemcpyHostToDevice));

    FilterParameterDevice filterParamDevice = { d_validNeighborPixelsNum, d_validPixelsMask, d_filterKernel };
    CUDA_CHECK(cudaMemcpyToSymbol(d_filterPatameterDevice, &filterParamDevice, sizeof(FilterParameterDevice)));

    int rows = m_rawImageParameter.m_yLensNum;
    int cols = m_rawImageParameter.m_xLensNum;
    int srcImgHeight = m_rawImageParameter.m_srcImgHeight;
    int srcImgWidth = m_rawImageParameter.m_srcImgWidth;
    int neighborNum = NEIGHBOR_MATCH_LENS_NUM;


	CUDA_CHECK(cudaMalloc((void**)&d_microImageParameter, sizeof(MicroImageParameterDevice)));


    MicroImageParameterDevice h_microImageParameterDevice;

  
    h_microImageParameterDevice.m_circleDiameter = m_microImageParameter.m_circleDiameter;
    h_microImageParameterDevice.m_circleNarrow = m_microImageParameter.m_circleNarrow;
    h_microImageParameterDevice.m_radiusDisEqu = m_microImageParameter.m_radiusDisEqu;


    int lensCenterPointsSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(cv::Point2d);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppLensCenterPoints, lensCenterPointsSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppLensCenterPoints, m_microImageParameter.m_ppLensCenterPoints[0], lensCenterPointsSize, cudaMemcpyHostToDevice));

  
    int pixelsMappingSetSize = m_rawImageParameter.m_srcImgHeight * m_rawImageParameter.m_srcImgWidth * sizeof(int);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppPixelsMappingSet, pixelsMappingSetSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppPixelsMappingSet, m_microImageParameter.m_ppPixelsMappingSet[0], pixelsMappingSetSize, cudaMemcpyHostToDevice));

  
    int matchNeighborLensSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * NEIGHBOR_MATCH_LENS_NUM * sizeof(MatchNeighborLens);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppMatchNeighborLens, matchNeighborLensSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppMatchNeighborLens, m_microImageParameter.m_ppMatchNeighborLens[0][0], matchNeighborLensSize, cudaMemcpyHostToDevice));


	CUDA_CHECK(cudaMemcpyToSymbol(d_microImageParameter, &h_microImageParameterDevice, sizeof(MicroImageParameterDevice)));

    CUDA_CHECK(cudaMalloc((void**)&d_costVol, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_costVol, 0, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_rawDisp, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_rawDisp, 15, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppLensMeanDisp, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppRanderMapPatch, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(RanderMapPatch)));
    CUDA_CHECK(cudaMalloc(&d_tmp, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_simg, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&sx_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sx_end, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_end, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapWidth, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapHeight, sizeof(int)));

	printf("sx_begin pointer: %p\n", d_inputImg);
}

another file:

extern __constant__ RawImageParameter d_rawImageParameter;
extern __constant__ DisparityParameter d_disparityParameter;
extern __constant__ FilterParameterDevice d_filterPatameterDevice; 
extern __device__ MicroImageParameterDevice d_microImageParameter; 
extern __device__ float* d_costVol;
extern __device__ float* d_rawDisp;
extern __device__ float* d_inputImg;
extern __device__ float* d_ppLensMeanDisp;
extern __device__ RanderMapPatch *d_ppRanderMapPatch;
extern __device__ float *d_tmp;
extern __device__ float *d_simg;
extern __device__ float* d_inputImgRec;
extern __device__ int *sx_begin, *sy_begin, *sx_end, *sy_end;
extern __device__ int *d_randerMapWidth, *d_randerMapHeight;

I try to print some paramter

printf("sx_begin pointer: %p\n", d_inputImg);

they are null,I just started learning CUDA, and I’m not very familiar with it.

wusehn · March 27, 2025, 10:40am

I have already initialized the data in the file where it is defined before using it in other files. They are in the correct order. This means that the data is initialized before I call the logic part.

wusehn · March 27, 2025, 10:44am

Or is there any good alternative solution? I want the GPU variables to be usable across different files. Otherwise, if all the variables are in one file, the project will be too bloated.

wusehn · March 27, 2025, 10:46am

I use vscode to build my program
here is my configure json

{
    "tasks": [
        {
            "type": "cppbuild",
            "label": "C/C++: nvcc 编译 OpenCV + CUDA 项目",
            "command": "/usr/local/cuda/bin/nvcc",
            "args": [
                "-std=c++17",
                "-g",
                "-o",
                "${workspaceFolder}/bin/main",
                "${workspaceFolder}/main.cu",
                //"${workspaceFolder}/ConfidenceCompute.cpp",
                //"${workspaceFolder}/ToolOneTestDemo.cpp",
                "${workspaceFolder}/ToolTwoTestDemo.cpp",
                "${workspaceFolder}/CostVolCompute.cpp", // 
                "${workspaceFolder}/DataDeal.cu",
                //"${workspaceFolder}/DepthComputeToolOne.cpp",
                "${workspaceFolder}/DepthComputeToolTwo.cu",
                "${workspaceFolder}/GlobalOptimization.cpp",
                "${workspaceFolder}/ImageRander.cu",
                //"${workspaceFolder}/SceneDepthCompute.cpp",
                //"${workspaceFolder}/VirtualDepthCompute.cpp",
                "${workspaceFolder}/CostVolFilter.cu",
                "${workspaceFolder}/DataParameter.cu",
                "${workspaceFolder}/CAST/STCA.cpp",
                "${workspaceFolder}/CAST/StereoDisparity.cpp",
                "${workspaceFolder}/CAST/StereoHelper.cpp",
                "${workspaceFolder}/CAST/SegmentTree.cpp",
                "${workspaceFolder}/CAST/Toolkit.cpp",
                "${workspaceFolder}/CAST/ctmf.o",
                "-I/usr/local/include/opencv4",
                "-I${workspaceFolder}/gcov3",
                "-I${workspaceFolder}/CAST",
                "-I/usr/local/cuda/include",
                "-L/usr/local/lib",
                "-L/usr/local/cuda/lib64",
                "-L${workspaceFolder}/gcov3",
                "-lopencv_core",
                "-lopencv_imgcodecs",
                "-lopencv_highgui",
                "-lopencv_imgproc",
                "-lopencv_cudaimgproc",
                "-lopencv_cudafilters",
                "-lopencv_cudaimgproc",  
                "-lopencv_cudafilters",  
                "-lopencv_cudaarithm", 
                "-lcudart",
                "-lcublas",
                "-lcudnn", 
                "-lGCoptimization"
            ],
            "options": {
                "cwd": "${workspaceFolder}"
            },
            "problemMatcher": [
                "$gcc"
            ],
            "group": {
                "kind": "build",
                "isDefault": true
            },
            "detail": "OpenCV + CUDA 项目编译任务"
        }
    ],
    "version": "2.0.0"
}

Curefab · March 27, 2025, 11:40am

You never use or set d_inputImg to a value. You just define it in the beginning and print it at the end of the program. E.g. you could call cudaMalloc() for it?

wusehn · March 27, 2025, 11:58am

I use them like this :

void ImageRander::imageRander(const RawImageParameter &rawImageParameter, 
    const MicroImageParameter &microImageParameter, float *d_randerImg)
{
   
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Step 1: Process patch kernel
    cudaEventRecord(start); 
    dim3 blockSize(32, 32);
    dim3 gridSize((rawImageParameter.m_xLensNum + blockSize.x - 1) / blockSize.x, (rawImageParameter.m_yLensNum + blockSize.y - 1) / blockSize.y);
    processPatchKernel<<<gridSize, blockSize>>>(d_randerImg, d_tmp, d_simg, 
                                                d_ppLensMeanDisp, 0, 0, DEST_WIDTH, DEST_HEIGHT, 3, d_ppRanderMapPatch); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Process Patch Kernel Time: %f ms\n", ms); 



    // Step 2: Compute boundary kernel
    cudaEventRecord(start); 
    computeBoundaryKernel<<<gridSize, blockSize>>>(d_ppRanderMapPatch,  sx_begin, sy_begin, 
                                                   sx_end, sy_end, DEST_WIDTH, DEST_HEIGHT);
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Compute Boundary Kernel Time: %f ms\n", ms); 


    // Step 3: Compute width and height kernel

    printf("sx_begin pointer: %p\n", d_inputImg);

    //printf("sx_begin pointer: %p\n", sx_begin);
    //printf("sy_begin pointer: %p\n", sy_begin);
    //printf("sx_end pointer: %p\n", sx_end);
    //printf("sy_end pointer: %p\n", sy_end);
    
    int h_sx_begin, h_sy_begin, h_sx_end, h_sy_end;
    CUDA_CHECK(cudaMemcpy(&h_sx_begin, sx_begin, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sy_begin, sy_begin, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sx_end, sx_end, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sy_end, sy_end, sizeof(int), cudaMemcpyDeviceToHost));

    int h_randerMapWidth = h_sx_end - h_sx_begin + 1;
    int h_randerMapHeight = h_sy_end - h_sy_begin + 1;

    size_t randerMapSize = h_randerMapWidth * h_randerMapHeight * 3 * sizeof(float); 
    size_t randerCountSize = h_randerMapWidth * h_randerMapHeight * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_randerMapHeight,sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapWidth,sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_randerMapWidth, &h_randerMapWidth, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_randerMapHeight, &h_randerMapHeight, sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc(&d_randerMap, randerMapSize));
    CUDA_CHECK(cudaMalloc(&d_randerCount, randerCountSize));
    CUDA_CHECK(cudaMemset(d_randerMap, 0, randerMapSize));
    CUDA_CHECK(cudaMemset(d_randerCount, 0, randerCountSize)); 

    // Step 5: Accumulate kernel
    cudaEventRecord(start); // 记录开始时间
    gridSize.x = (rawImageParameter.m_xLensNum + blockSize.x - 1) / blockSize.x;
    gridSize.y = (rawImageParameter.m_yLensNum + blockSize.y - 1) / blockSize.y;

    accumulateKernel<<<gridSize, blockSize>>>(d_randerMap, d_randerCount, d_ppRanderMapPatch,  d_randerMapWidth, d_randerMapHeight,
                                              DEST_WIDTH, DEST_HEIGHT, 3); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Accumulate Kernel Time: %f ms\n", ms); 


    // Step 6: Normalize kernel
    cudaEventRecord(start); 
    gridSize.x = (h_randerMapWidth + blockSize.x - 1) / blockSize.x;
    gridSize.y = (h_randerMapHeight + blockSize.y - 1) / blockSize.y;
    normalizeKernel<<<gridSize, blockSize>>>(d_randerMap, d_randerCount, d_randerMapWidth, d_randerMapHeight, 3); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Normalize Kernel Time: %f ms\n", ms); 


   
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

I hope these data using an init file to init them,and I hope to use them in another file. I use cudaMalloc for these data in initial file. They won’t take effects on another file?
How can I achieve such an effect?

Curefab · March 27, 2025, 12:15pm

You have not shown any code, where you initialize the pointer and allocate memory for it.

striker159 · March 27, 2025, 12:30pm

Here is a minimal example:

File main.cu

#include <cstdio>

#include "sharedvariables.cuh"

__global__
void kernel(){
    printf("printing pointer value from kernel: %p\n", global_device_pointer);
}

int main(){
    cudaSetDevice(0);
    kernel<<<1,1>>>();
    cudaDeviceSynchronize();

    int* d_ptr;
    cudaMalloc(&d_ptr, sizeof(int));
    cudaMemset(d_ptr, 0, sizeof(int));
    set_global_device_pointer(d_ptr);

    kernel<<<1,1>>>();
    cudaDeviceSynchronize();
}

File sharedvariables.cuh

#ifndef SHAREDVARIABLES_CUH
#define SHAREDVARIABLES_CUH
extern __device__ int* global_device_pointer;

cudaError_t set_global_device_pointer(int* value);

#endif

File sharedvariables.cu

#include <iostream>

__device__ int* global_device_pointer;

cudaError_t set_global_device_pointer(int* value){
    std::cout << "set_global_device_pointer(" << value << ")" << "\n";
    return cudaMemcpyToSymbol(global_device_pointer, &value, sizeof(int*));
}

Compilation: nvcc -rdc=true main.cu sharedvariables.cu -o main

Program output:

printing pointer value from kernel: (nil)
set_global_device_pointer(0x7f62efa00000)
printing pointer value from kernel: 0x7f62efa00000

wusehn · March 27, 2025, 12:46pm

Thank you! I will try to wriet code in .cuh.

system · April 10, 2025, 12:46pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
Different __device__ variables behind one name CUDA Programming and Performance	7	1522	June 29, 2012
Different __device__ variables behind the one name CUDA Programming and Performance	0	1035	June 26, 2012
__device__ variariable doubt CUDA Programming and Performance	1	3665	June 6, 2008
how to use global device struct variables in device functions CUDA Programming and Performance	4	9290	May 19, 2011
extern __device__ unsigned char deviceArray[] impossible? CUDA Programming and Performance	5	1637	December 29, 2016
CUDA header input file Declaring device variables in a separate .h or .cuh file CUDA Programming and Performance	7	8696	June 11, 2011
__device__ variables __device__ variables required to be allocated using cudaMalloc?? CUDA Programming and Performance	5	6425	October 22, 2010
Global device pointer access using cudaMemcpyToSymbol and cudaMemcpyFromSymbol CUDA Programming and Performance	4	3612	November 21, 2013
Invalid Device Pointer CUDA Programming and Performance	9	24678	January 15, 2009
On which device are __device__ variables allocated? CUDA Programming and Performance	21	6750	March 13, 2009

How can you correctly declare and use a global __device__ variable across different CUDA translation units?

Related topics

How can you correctly declare and use a global device variable across different CUDA translation units?