how to use global device struct variables in device functions

The problem is as title. The code is as below (tested). Compilation had no problem. The program gave wrong result (member ‘m’ is not assigned to 10). What’s the problem?

#include <stdio.h>

struct g{

  int m;

};

__device__ struct g *d;

__global__ void kernel()

{

  int tid=blockIdx.x * blockDim.x + threadIdx.x;

  d[tid].m=10;

}

int main()

{

  size_t size = 1 * sizeof(struct g);

  cudaMalloc(&d, size);

  kernel<<<1,1>>>();

  cudaDeviceSynchronize();

  struct g *h = (struct g*)malloc(size);

  cudaMemcpy(h, d, size, cudaMemcpyDeviceToHost);

  printf("Result: %d\n",h[0].m);

}

If you add some error checking (all of those API functions return error codes), things will become a lot clearer.

The basic problem is the cudaMalloc call. You cannot call cudaMalloc directly onto a device symbol. Instead, perform cudaMalloc onto a host pointer, then copy the value of that pointer onto the device variable using cudaMemcpyToSymbol.

I have error checking in my original code. To shorten the code here (for easy reading), I removed the checking code. But the checking code didn’t give the right error place. It told me kernel launch had unspecified error. My checking code is as below. Is it ok?

// check runtime call error

#define cudaSafeCall(call) {  \

  cudaError err = call;       \

  if(cudaSuccess != err){     \

    fprintf(stderr, "%s(%i) : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err));   \

    exit(EXIT_FAILURE);       \

}}

// check kernel launch error

#define cudaCheckErr(errorMessage) {    \

  cudaError_t err = cudaGetLastError(); \

  if(cudaSuccess != err){               \

    fprintf(stderr, "%s(%i) : %s : %s.\n", __FILE__, __LINE__, errorMessage, cudaGetErrorString(err)); \

    exit(EXIT_FAILURE);                 \

}}

So, I

  1. cudaMalloc a host pointer (so it’s in device memory)

  2. modify its value in kernel functions

  3. cudaMemcpyToSymbol its value to the device variable

This may work. However, what I intended to do is: I have tens of kernel functions. Most of them will use the device variable (to share infomation). So by setting the device variable as global, all kernel functions can see it (and hence use it). Following the way above, I can pass everything in parameters, but it needs to modify all definitions and calling of the kernel functions, looks not convenient. Any way can let me get what I wanted?

Which is completely correct, because the device symbol d is uninitialized in your current code.

No, you do this:

  1. cudaMalloc a host pointer (so it’s in device memory)

  2. cudaMemcpyToSymbol its value (ie. its device address) to the device variable

  3. modify its value in kernel functions using the canonical device symbol

which, if I understand you correctly, is what you are asking for.

Thanks, it worked! “Result: 10”! See new code below (note ‘sizep’ used in symbol copies). But I’m still not sure about the meaning of “copy symbol address” (not the content!), and why NVIDIA created such complex memory processing way for global variables (global here means only the scope of the variable - visible for whole application, not global memory). Could you please explain the process in detail and the reason if possible? It seems not in NVIDIA guides. And it’s not using double size of device memory (one for ‘d’, one for ‘ld’), right?

#include <stdio.h>

// check runtime call error

#define cudaSafeCall(call) {  \

  cudaError err = call;       \

  if(cudaSuccess != err){     \

    fprintf(stderr, "%s(%i) : %s.\n", __FILE__, __LINE__, cudaGetErrorString(err));   \

    exit(EXIT_FAILURE);       \

}}

// check kernel launch error

#define cudaCheckErr(errorMessage) {    \

  cudaError_t err = cudaGetLastError(); \

  if(cudaSuccess != err){               \

    fprintf(stderr, "%s(%i) : %s : %s.\n", __FILE__, __LINE__, errorMessage, cudaGetErrorString(err)); \

    exit(EXIT_FAILURE);                 \

}}

struct g{

  int m;

};

__device__ struct g *d; // device (global)

__global__ void kernel()

{

  int tid=blockIdx.x * blockDim.x + threadIdx.x;

  d[tid].m=10;

}

int main()

{

  size_t size = 1 * sizeof(struct g);

  size_t sizep = 1 * sizeof(struct g*);

  struct g *ld; // device (local)

  cudaSafeCall(cudaMalloc(&ld, size));

  cudaSafeCall(cudaMemcpyToSymbol(d,&ld,sizep));

  kernel<<<1,1>>>();

  cudaSafeCall(cudaDeviceSynchronize());

  cudaCheckErr("kernel error");

  struct g *h = (struct g*)malloc(size);

  if(h==NULL){

    fprintf(stderr, "%s(%i) : malloc error.\n", __FILE__, __LINE__);

    exit(EXIT_FAILURE);

  }

  //cudaSafeCall(cudaMemcpyFromSymbol(&ld,d,sizep)); // not necessary

  cudaSafeCall(cudaMemcpy(h, ld, size, cudaMemcpyDeviceToHost));

  printf("Result: %d\n",h[0].m);

}