How can you correctly declare and use a global __device__ variable across different CUDA translation units?

I tried defining a __device__ variable in one source file, assigning it an address and initializing it. Then, I used extern to reference this variable in another source file. Before printing, I ensured the variable was initialized. However, when printing the variable in both files, the one that wasn’t defined and initialized appeared to be empty.

How should I correctly use a __device__ variable across multiple source files?

__device__ float *sx_begin;
cudaMalloc(sx_begin,sizeof(int));
cudaMemset(sx_begin,INT_MAX,sizeof(int));

In another file:
extern __device__ float *sx_begin;

First of all, I think you need to use cudaMemcpyToSymbol to copy the pointer of allocated memory into the device variable sx_begin, i.e. cudaMalloc(ptr...); cudaMemset(ptr....); cudaMemcpyToSymbol(sx_begin, ptr....).

Second, you did not show how you compile your example. To use device variables across multiple translation units, one needs to compile with relocatable device code -rdc=true.

(To share a pointer to a cudaMalloc allocation, you do not need to use a __device__ variable)

Here is my code,I use this to init my data,and I want to use in another file.

__constant__ RawImageParameter d_rawImageParameter;
__constant__ DisparityParameter d_disparityParameter;
__constant__ FilterParameterDevice d_filterPatameterDevice; 
__device__ MicroImageParameterDevice d_microImageParameter; 
__device__ float* d_costVol;
__device__ float* d_rawDisp;
__device__ float* d_ppLensMeanDisp;
__device__ float* d_renderCache;
__device__ float* d_inputImg;
__device__ float* d_inputImgRec;
__device__ RanderMapPatch *d_ppRanderMapPatch;
//__device__ float *d_randerImg;
__device__ float *d_tmp;
__device__ float *d_simg;
__device__ int *sx_begin, *sy_begin, *sx_end, *sy_end;
__device__ int *d_randerMapWidth, *d_randerMapHeight;

__constant__ float *d_fltMax;
void DataParameter::mapToGPU()
{
    CUDA_CHECK(cudaMemcpyToSymbol(d_rawImageParameter, &m_rawImageParameter, sizeof(RawImageParameter)));
    CUDA_CHECK(cudaMemcpyToSymbol(d_disparityParameter, &m_disparityParameter, sizeof(DisparityParameter)));

    float fltMax = FLT_MAX;
    CUDA_CHECK(cudaMemcpyToSymbol(d_fltMax, &fltMax, sizeof(float)));

    int* d_validNeighborPixelsNum;
    int* d_validPixelsMask;
    float* d_filterKernel;

    CUDA_CHECK(cudaMalloc((void**)&d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validNeighborPixelsNum, m_filterPatameter.m_pValidNeighborPixelsNum->data, 
                          m_filterPatameter.m_pValidNeighborPixelsNum->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_validPixelsMask, m_filterPatameter.m_pValidPixelsMask->data, 
                          m_filterPatameter.m_pValidPixelsMask->total() * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**)&d_filterKernel, m_filterPatameter.m_filterKnernel.total() * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_filterKernel, m_filterPatameter.m_filterKnernel.data, 
                          m_filterPatameter.m_filterKnernel.total() * sizeof(float), cudaMemcpyHostToDevice));

    FilterParameterDevice filterParamDevice = { d_validNeighborPixelsNum, d_validPixelsMask, d_filterKernel };
    CUDA_CHECK(cudaMemcpyToSymbol(d_filterPatameterDevice, &filterParamDevice, sizeof(FilterParameterDevice)));

    int rows = m_rawImageParameter.m_yLensNum;
    int cols = m_rawImageParameter.m_xLensNum;
    int srcImgHeight = m_rawImageParameter.m_srcImgHeight;
    int srcImgWidth = m_rawImageParameter.m_srcImgWidth;
    int neighborNum = NEIGHBOR_MATCH_LENS_NUM;


	CUDA_CHECK(cudaMalloc((void**)&d_microImageParameter, sizeof(MicroImageParameterDevice)));


    MicroImageParameterDevice h_microImageParameterDevice;

  
    h_microImageParameterDevice.m_circleDiameter = m_microImageParameter.m_circleDiameter;
    h_microImageParameterDevice.m_circleNarrow = m_microImageParameter.m_circleNarrow;
    h_microImageParameterDevice.m_radiusDisEqu = m_microImageParameter.m_radiusDisEqu;


    int lensCenterPointsSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(cv::Point2d);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppLensCenterPoints, lensCenterPointsSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppLensCenterPoints, m_microImageParameter.m_ppLensCenterPoints[0], lensCenterPointsSize, cudaMemcpyHostToDevice));

  
    int pixelsMappingSetSize = m_rawImageParameter.m_srcImgHeight * m_rawImageParameter.m_srcImgWidth * sizeof(int);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppPixelsMappingSet, pixelsMappingSetSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppPixelsMappingSet, m_microImageParameter.m_ppPixelsMappingSet[0], pixelsMappingSetSize, cudaMemcpyHostToDevice));

  
    int matchNeighborLensSize = m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * NEIGHBOR_MATCH_LENS_NUM * sizeof(MatchNeighborLens);
    CUDA_CHECK(cudaMalloc((void**)&h_microImageParameterDevice.m_ppMatchNeighborLens, matchNeighborLensSize));
    CUDA_CHECK(cudaMemcpy(h_microImageParameterDevice.m_ppMatchNeighborLens, m_microImageParameter.m_ppMatchNeighborLens[0][0], matchNeighborLensSize, cudaMemcpyHostToDevice));


	CUDA_CHECK(cudaMemcpyToSymbol(d_microImageParameter, &h_microImageParameterDevice, sizeof(MicroImageParameterDevice)));

    CUDA_CHECK(cudaMalloc((void**)&d_costVol, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_costVol, 0, m_disparityParameter.m_disNum * m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_rawDisp, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_rawDisp, 15, m_rawImageParameter.m_recImgHeight * m_rawImageParameter.m_recImgWidth * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppLensMeanDisp, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_ppRanderMapPatch, m_rawImageParameter.m_yLensNum * m_rawImageParameter.m_xLensNum * sizeof(RanderMapPatch)));
    CUDA_CHECK(cudaMalloc(&d_tmp, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_simg, DEST_WIDTH * DEST_HEIGHT * 3 * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&sx_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_begin, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sx_end, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&sy_end, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_begin, INT_MAX, sizeof(int)));
    CUDA_CHECK(cudaMemset(sx_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMemset(sy_end, INT_MIN, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapWidth, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapHeight, sizeof(int)));

	printf("sx_begin pointer: %p\n", d_inputImg);
}

another file:

extern __constant__ RawImageParameter d_rawImageParameter;
extern __constant__ DisparityParameter d_disparityParameter;
extern __constant__ FilterParameterDevice d_filterPatameterDevice; 
extern __device__ MicroImageParameterDevice d_microImageParameter; 
extern __device__ float* d_costVol;
extern __device__ float* d_rawDisp;
extern __device__ float* d_inputImg;
extern __device__ float* d_ppLensMeanDisp;
extern __device__ RanderMapPatch *d_ppRanderMapPatch;
extern __device__ float *d_tmp;
extern __device__ float *d_simg;
extern __device__ float* d_inputImgRec;
extern __device__ int *sx_begin, *sy_begin, *sx_end, *sy_end;
extern __device__ int *d_randerMapWidth, *d_randerMapHeight;

I try to print some paramter

printf("sx_begin pointer: %p\n", d_inputImg);

they are null,I just started learning CUDA, and I’m not very familiar with it.

I have already initialized the data in the file where it is defined before using it in other files. They are in the correct order. This means that the data is initialized before I call the logic part.

Or is there any good alternative solution? I want the GPU variables to be usable across different files. Otherwise, if all the variables are in one file, the project will be too bloated.

I use vscode to build my program
here is my configure json

{
    "tasks": [
        {
            "type": "cppbuild",
            "label": "C/C++: nvcc 编译 OpenCV + CUDA 项目",
            "command": "/usr/local/cuda/bin/nvcc",
            "args": [
                "-std=c++17",
                "-g",
                "-o",
                "${workspaceFolder}/bin/main",
                "${workspaceFolder}/main.cu",
                //"${workspaceFolder}/ConfidenceCompute.cpp",
                //"${workspaceFolder}/ToolOneTestDemo.cpp",
                "${workspaceFolder}/ToolTwoTestDemo.cpp",
                "${workspaceFolder}/CostVolCompute.cpp", // 
                "${workspaceFolder}/DataDeal.cu",
                //"${workspaceFolder}/DepthComputeToolOne.cpp",
                "${workspaceFolder}/DepthComputeToolTwo.cu",
                "${workspaceFolder}/GlobalOptimization.cpp",
                "${workspaceFolder}/ImageRander.cu",
                //"${workspaceFolder}/SceneDepthCompute.cpp",
                //"${workspaceFolder}/VirtualDepthCompute.cpp",
                "${workspaceFolder}/CostVolFilter.cu",
                "${workspaceFolder}/DataParameter.cu",
                "${workspaceFolder}/CAST/STCA.cpp",
                "${workspaceFolder}/CAST/StereoDisparity.cpp",
                "${workspaceFolder}/CAST/StereoHelper.cpp",
                "${workspaceFolder}/CAST/SegmentTree.cpp",
                "${workspaceFolder}/CAST/Toolkit.cpp",
                "${workspaceFolder}/CAST/ctmf.o",
                "-I/usr/local/include/opencv4",
                "-I${workspaceFolder}/gcov3",
                "-I${workspaceFolder}/CAST",
                "-I/usr/local/cuda/include",
                "-L/usr/local/lib",
                "-L/usr/local/cuda/lib64",
                "-L${workspaceFolder}/gcov3",
                "-lopencv_core",
                "-lopencv_imgcodecs",
                "-lopencv_highgui",
                "-lopencv_imgproc",
                "-lopencv_cudaimgproc",
                "-lopencv_cudafilters",
                "-lopencv_cudaimgproc",  
                "-lopencv_cudafilters",  
                "-lopencv_cudaarithm", 
                "-lcudart",
                "-lcublas",
                "-lcudnn", 
                "-lGCoptimization"
            ],
            "options": {
                "cwd": "${workspaceFolder}"
            },
            "problemMatcher": [
                "$gcc"
            ],
            "group": {
                "kind": "build",
                "isDefault": true
            },
            "detail": "OpenCV + CUDA 项目编译任务"
        }
    ],
    "version": "2.0.0"
}

You never use or set d_inputImg to a value. You just define it in the beginning and print it at the end of the program. E.g. you could call cudaMalloc() for it?

I use them like this :

void ImageRander::imageRander(const RawImageParameter &rawImageParameter, 
    const MicroImageParameter &microImageParameter, float *d_randerImg)
{
   
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Step 1: Process patch kernel
    cudaEventRecord(start); 
    dim3 blockSize(32, 32);
    dim3 gridSize((rawImageParameter.m_xLensNum + blockSize.x - 1) / blockSize.x, (rawImageParameter.m_yLensNum + blockSize.y - 1) / blockSize.y);
    processPatchKernel<<<gridSize, blockSize>>>(d_randerImg, d_tmp, d_simg, 
                                                d_ppLensMeanDisp, 0, 0, DEST_WIDTH, DEST_HEIGHT, 3, d_ppRanderMapPatch); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Process Patch Kernel Time: %f ms\n", ms); 



    // Step 2: Compute boundary kernel
    cudaEventRecord(start); 
    computeBoundaryKernel<<<gridSize, blockSize>>>(d_ppRanderMapPatch,  sx_begin, sy_begin, 
                                                   sx_end, sy_end, DEST_WIDTH, DEST_HEIGHT);
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Compute Boundary Kernel Time: %f ms\n", ms); 


    // Step 3: Compute width and height kernel

    printf("sx_begin pointer: %p\n", d_inputImg);

    //printf("sx_begin pointer: %p\n", sx_begin);
    //printf("sy_begin pointer: %p\n", sy_begin);
    //printf("sx_end pointer: %p\n", sx_end);
    //printf("sy_end pointer: %p\n", sy_end);
    
    int h_sx_begin, h_sy_begin, h_sx_end, h_sy_end;
    CUDA_CHECK(cudaMemcpy(&h_sx_begin, sx_begin, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sy_begin, sy_begin, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sx_end, sx_end, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_sy_end, sy_end, sizeof(int), cudaMemcpyDeviceToHost));

    int h_randerMapWidth = h_sx_end - h_sx_begin + 1;
    int h_randerMapHeight = h_sy_end - h_sy_begin + 1;

    size_t randerMapSize = h_randerMapWidth * h_randerMapHeight * 3 * sizeof(float); 
    size_t randerCountSize = h_randerMapWidth * h_randerMapHeight * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_randerMapHeight,sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_randerMapWidth,sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_randerMapWidth, &h_randerMapWidth, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_randerMapHeight, &h_randerMapHeight, sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc(&d_randerMap, randerMapSize));
    CUDA_CHECK(cudaMalloc(&d_randerCount, randerCountSize));
    CUDA_CHECK(cudaMemset(d_randerMap, 0, randerMapSize));
    CUDA_CHECK(cudaMemset(d_randerCount, 0, randerCountSize)); 

    // Step 5: Accumulate kernel
    cudaEventRecord(start); // 记录开始时间
    gridSize.x = (rawImageParameter.m_xLensNum + blockSize.x - 1) / blockSize.x;
    gridSize.y = (rawImageParameter.m_yLensNum + blockSize.y - 1) / blockSize.y;

    accumulateKernel<<<gridSize, blockSize>>>(d_randerMap, d_randerCount, d_ppRanderMapPatch,  d_randerMapWidth, d_randerMapHeight,
                                              DEST_WIDTH, DEST_HEIGHT, 3); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Accumulate Kernel Time: %f ms\n", ms); 


    // Step 6: Normalize kernel
    cudaEventRecord(start); 
    gridSize.x = (h_randerMapWidth + blockSize.x - 1) / blockSize.x;
    gridSize.y = (h_randerMapHeight + blockSize.y - 1) / blockSize.y;
    normalizeKernel<<<gridSize, blockSize>>>(d_randerMap, d_randerCount, d_randerMapWidth, d_randerMapHeight, 3); // 3通道
    CUDA_CHECK(cudaGetLastError()); 
    CUDA_CHECK(cudaDeviceSynchronize());
    cudaEventRecord(stop); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&ms, start, stop);
    printf("Normalize Kernel Time: %f ms\n", ms); 


   
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

I hope these data using an init file to init them,and I hope to use them in another file. I use cudaMalloc for these data in initial file. They won’t take effects on another file?
How can I achieve such an effect?

You have not shown any code, where you initialize the pointer and allocate memory for it.

Here is a minimal example:

File main.cu

#include <cstdio>

#include "sharedvariables.cuh"

__global__
void kernel(){
    printf("printing pointer value from kernel: %p\n", global_device_pointer);
}

int main(){
    cudaSetDevice(0);
    kernel<<<1,1>>>();
    cudaDeviceSynchronize();

    int* d_ptr;
    cudaMalloc(&d_ptr, sizeof(int));
    cudaMemset(d_ptr, 0, sizeof(int));
    set_global_device_pointer(d_ptr);

    kernel<<<1,1>>>();
    cudaDeviceSynchronize();
}

File sharedvariables.cuh

#ifndef SHAREDVARIABLES_CUH
#define SHAREDVARIABLES_CUH
extern __device__ int* global_device_pointer;

cudaError_t set_global_device_pointer(int* value);

#endif

File sharedvariables.cu

#include <iostream>

__device__ int* global_device_pointer;

cudaError_t set_global_device_pointer(int* value){
    std::cout << "set_global_device_pointer(" << value << ")" << "\n";
    return cudaMemcpyToSymbol(global_device_pointer, &value, sizeof(int*));
}

Compilation: nvcc -rdc=true main.cu sharedvariables.cu -o main

Program output:

printing pointer value from kernel: (nil)
set_global_device_pointer(0x7f62efa00000)
printing pointer value from kernel: 0x7f62efa00000

Thank you! I will try to wriet code in .cuh.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.