Constant memory, cudaMemcpyToSymbol and compilation units

In my project there are three files: main.cu, where the main function sits; kernels.cuh, which declares the kernels and their host helper functions; and kernels.cu, which defines them. In particular, I would like to call cudaMemcpyToSymbol from main.cu.

//kernels.cuh

#include "cuda.h"
#include "cuda_runtime.h"


// Kernel defined in kernels.cu; takes a pointer to int storage that it writes to.
__global__ void kGetBary(int* ptr);
// Host helper defined in kernels.cu; takes a source int array and an element count n.
void copyBary(const int* src, size_t n);
// Host helper defined in kernels.cu; intended to copy n ints from src to the destination dst.
void copyToConstant(int* dst, const int* src, size_t n);
// Host helper defined in kernels.cu; returns an int pointer for the constant array.
int* getBaryPtr();

and

//kernels.cu

// Ten-int array declared __constant__; written from the host by copyBary and read by kGetBary.
__constant__ int bary[10];

// Uploads n ints from src into the constant array `bary`.
// The symbol is named directly, so this only works inside the file that
// defines `bary`. The CUDA status code is not inspected.
void copyBary(const int* src, size_t n)
{
    const size_t byteCount = sizeof(int) * n;
    cudaMemcpyToSymbol(bary, src, byteCount);
}

// Returns `bary` (array-to-pointer conversion) from host code.
// NOTE(review): the question reports that passing this pointer to
// cudaMemcpyToSymbol leaves the kernel reading zeros, and the reply suggests
// cudaGetSymbolAddress for obtaining the address instead — confirm before
// relying on the returned value as a copy destination.
int* getBaryPtr()
{
    return bary;
}

// Forwards dst, src and n * sizeof(int) to cudaMemcpyToSymbol.
// NOTE(review): dst is a pointer parameter, not the name of a __constant__
// variable; the question reports that this call path does not update `bary`.
// The CUDA status code is not inspected.
void copyToConstant(int* dst, const int* src, size_t n)
{
    cudaMemcpyToSymbol(dst, src, n * sizeof(int));
}

// Prints each element of the constant array `bary` and copies it into ptr.
//
// Launch layout: one thread per element, indexed by threadIdx.x only (the
// example launches <<<1, 10>>>). Threads whose index lies outside `bary`
// return without touching memory, so a larger block is harmless.
// Precondition: ptr points to device memory holding at least as many ints as
// `bary` has elements.
__global__ void kGetBary(int* ptr)
{
    const int count = static_cast<int>(sizeof(bary) / sizeof(bary[0]));
    const int i = threadIdx.x;
    if (i >= count)
        return;  // launched with more threads than array elements

    const int value = bary[i];
    printf("[%d]= %d\n", i, value);

    ptr[i] = value;
}

and

//main.cu

#include "kernels.cuh"

// Host entry point of the question's example: allocates a 10-int device
// buffer, tries to fill the constant array, then launches kGetBary with one
// block of 10 threads. Part of the body is elided ("...") in the post.
int main()
{

    int b[10] = { 0, 1, 2, 3, 4, 5, 6, 77, 8, 9 };  // host values destined for the constant array
    int res[10];  // not referenced in the visible part of main

    int* d_v;
    cudaMalloc((void**)&d_v, 10 * sizeof(int));  // device buffer passed to the kernel

    // Three attempted ways of filling the constant array; only one is active at a time.
    copyBary(b, 10); //does work
    //cudaMemcpyToSymbol(getBaryPtr(), b, 10 * sizeof(int));  // does not work
    //copyToConstant(getBaryPtr(), b, 10); // does not work

    kGetBary << <1, 10 >> > (d_v);

    ...

    return 0;
}

As you can see, in main.cu I tried three ways of writing the initial data to constant memory, but only the first one (the one using copyBary) worked. With either of the other two methods, the kernel finds the constant memory filled with zeros.
Now, I would like to avoid functions like copyBary, because they hard-code the bary variable; instead, I would like to pass the destination symbol as an argument.
How can I do this?

NOTE ADDED: I’m using cuda 11.8 and I activated the switch -rdc=true

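One method is cudaGetSymbolAddress (e.g. inside getBaryPtr()): it returns the symbol's device address, which can then be used as the destination of an ordinary cudaMemcpy.
Another method is to declare the symbol with extern in the header, so that main.cu can pass the symbol name itself to cudaMemcpyToSymbol.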

Here is an example that uses cudaGetSymbolAddress:

$ cat kernels.cuh
#include "cuda.h"
#include "cuda_runtime.h"


// Kernel defined in kernels.cu; takes a pointer to int storage that it writes to.
__global__ void kGetBary1(int* ptr);
// Host helper defined in kernels.cu; takes a source int array and an element count n.
void copyBary(const int* src, size_t n);
// Host helper defined in kernels.cu; intended to copy n ints from src to the destination dst.
void copyToConstant(int* dst, const int* src, size_t n);
// Host helper defined in kernels.cu; returns an untyped pointer selected by the index i.
void* getBaryPtr(int i);
$ cat kernels.cu
#include <cstdio>

// Two ten-int arrays declared __constant__. bary1 is written by copyBary and
// read by kGetBary1; getBaryPtr selects bary1 for index 0 and bary2 otherwise.
__constant__ int bary1[10];
__constant__ int bary2[10];

// Copies n ints from host memory at src into the constant array bary1.
//
// The symbol is named directly, so this only works in the file that defines
// bary1. Because the function returns void, failures are reported on stderr
// instead of being dropped silently:
//   - a request larger than bary1 is rejected before any copy is attempted;
//   - any CUDA error from the copy is printed with its description.
void copyBary(const int* src, size_t n)
{
    const size_t capacity = sizeof(bary1) / sizeof(bary1[0]);
    if (n > capacity) {
        fprintf(stderr, "copyBary: %lu ints requested but bary1 holds only %lu\n",
                static_cast<unsigned long>(n), static_cast<unsigned long>(capacity));
        return;
    }

    const cudaError_t status = cudaMemcpyToSymbol(bary1, src, n * sizeof(int));
    if (status != cudaSuccess)
        fprintf(stderr, "copyBary: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
}

// Returns the device address of one of the constant arrays, looked up with
// cudaGetSymbolAddress: bary1 when i == 0, bary2 for any other i.
//
// The returned pointer is a device address: use it as the destination of
// cudaMemcpy(..., cudaMemcpyHostToDevice), never dereference it on the host.
// Returns nullptr (after printing the CUDA error on stderr) when the lookup
// fails, so callers never receive an uninitialized pointer.
void *getBaryPtr(int i)
{
    void *p = nullptr;
    const cudaError_t status = (i == 0) ? cudaGetSymbolAddress(&p, bary1)
                                        : cudaGetSymbolAddress(&p, bary2);
    if (status != cudaSuccess) {
        fprintf(stderr, "getBaryPtr(%d): cudaGetSymbolAddress failed: %s\n",
                i, cudaGetErrorString(status));
        return nullptr;
    }
    return p;
}

// Copies n ints from host memory at src to the device address dst.
//
// dst must be a device address, e.g. the value returned by getBaryPtr() cast
// to int*. cudaMemcpyToSymbol cannot be used here: it expects the __constant__
// variable itself, and given a pointer parameter it treats that local
// variable as the symbol and fails. A plain host-to-device cudaMemcpy to the
// symbol's address performs the intended copy.
// Failures are reported on stderr because the function returns void.
void copyToConstant(int* dst, const int* src, size_t n)
{
    if (dst == nullptr || src == nullptr) {
        fprintf(stderr, "copyToConstant: null %s pointer\n", dst == nullptr ? "destination" : "source");
        return;
    }

    const cudaError_t status = cudaMemcpy(dst, src, n * sizeof(int), cudaMemcpyHostToDevice);
    if (status != cudaSuccess)
        fprintf(stderr, "copyToConstant: cudaMemcpy failed: %s\n", cudaGetErrorString(status));
}

// Prints each element of the constant array bary1 and copies it into ptr.
//
// Launch layout: one thread per element, indexed by threadIdx.x only (the
// example launches <<<1, 10>>>). Threads whose index lies outside bary1
// return without touching memory, so a larger block is harmless.
// Precondition: ptr points to device memory holding at least as many ints as
// bary1 has elements.
__global__ void kGetBary1(int* ptr)
{
    const int count = static_cast<int>(sizeof(bary1) / sizeof(bary1[0]));
    const int i = threadIdx.x;
    if (i >= count)
        return;  // launched with more threads than array elements

    const int value = bary1[i];
    printf("[%d]= %d\n", i, value);

    ptr[i] = value;
}
$ cat main.cu
#include <cstdio>
#include "kernels.cuh"

// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver for the cudaGetSymbolAddress approach.
//
// Copies `b` to the device address returned by getBaryPtr(0), launches
// kGetBary1 with one block of 10 threads, copies the kernel's output back
// into `res` and compares it with `b`.
// Returns 0 on success, 1 when any CUDA call fails or the values differ.
int main()
{
    const int count = 10;
    const size_t bytes = count * sizeof(int);

    int b[10] = { 0, 1, 2, 3, 4, 5, 6, 77, 8, 9 };
    int res[10] = {};

    int* d_v = nullptr;
    if (!cudaOk(cudaMalloc((void**)&d_v, bytes), "cudaMalloc"))
        return 1;

    bool ok = false;

    //copyBary(b, 10); //does work
    //copyToConstant(static_cast<int*>(getBaryPtr(0)), b, 10); // does not work as long as copyToConstant forwards dst to cudaMemcpyToSymbol
    void* d_bary = getBaryPtr(0);  // device address of the constant array
    if (d_bary == nullptr) {
        fprintf(stderr, "getBaryPtr(0) returned a null pointer\n");
    } else if (cudaOk(cudaMemcpy(d_bary, b, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy to constant memory")) {
        kGetBary1<<<1, count>>>(d_v);

        // A launch reports configuration errors through cudaGetLastError and
        // execution errors at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "kGetBary1 launch")
          && cudaOk(cudaDeviceSynchronize(), "kGetBary1 execution")
          && cudaOk(cudaMemcpy(res, d_v, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy from device");
    }

    if (ok) {
        for (int i = 0; i < count; ++i) {
            if (res[i] != b[i]) {
                fprintf(stderr, "mismatch at [%d]: expected %d, got %d\n", i, b[i], res[i]);
                ok = false;
            }
        }
    }

    // Release the device buffer on every path that allocated it.
    ok = cudaOk(cudaFree(d_v), "cudaFree") && ok;

    return ok ? 0 : 1;
}
$ nvcc -o test main.cu kernels.cu -rdc=true
main.cu(7): warning: variable "res" was declared but never referenced

$ ./test
[0]= 0
[1]= 1
[2]= 2
[3]= 3
[4]= 4
[5]= 5
[6]= 6
[7]= 77
[8]= 8
[9]= 9
$

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.