Accessing pointer values inside struct copied to CUDA device

I have a code in CUDA as follows that is being compiled simply by doing “$ nvcc test.cu”

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

/* Example struct holding one scalar and one pointer member.
 * A plain cudaMemcpy of this struct copies the pointer VALUE, so on the
 * device somePointer still holds a host address until it is fixed up. */
typedef struct someStruct
{
  int someInteger;   /* plain value: safe to shallow-copy to the device */
  int *somePointer;  /* host heap pointer after malloc(); not dereferenceable on the device */
} someStruct;

/*
 * Allocates and initializes a someStruct on the host:
 * someInteger = 13 and somePointer = 10 ints holding 10..19.
 * Returns NULL if any allocation fails; on partial failure the
 * partially-built struct is released before returning (the original
 * code left the inner malloc unchecked, risking a NULL dereference
 * in the init loop and leaking myStruct).
 */
someStruct* start_struct()
{
  someStruct *myStruct = (someStruct*)malloc(sizeof(someStruct));
  if (myStruct == NULL)
    return NULL;

  myStruct->somePointer = (int*)malloc(10*sizeof(int));
  if (myStruct->somePointer == NULL)
    {
      free(myStruct);   /* avoid leaking the outer allocation */
      return NULL;
    }

  myStruct->someInteger = 13;
  for (int i = 0; i < 10; i++)
    myStruct->somePointer[i] = i + 10;

  return myStruct;
}

/*
 * Debug kernel: each thread sums the 10 ints in somePointer and prints
 * the struct's scalar plus every array element.
 * myStruct_d is a shallow device copy of the struct, so its embedded
 * somePointer member is NOT dereferenced here (it may still hold a host
 * address) — the int array is passed separately for that reason.
 * Launched as <<<2,10>>>; indexing is 1-D.
 */
__global__ void print_on_cuda(someStruct* myStruct_d, int* somePointer)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  int total = 0;
  for (int k = 0; k < 10; k++)
    total += somePointer[k];

  for (int k = 0; k < 10; k++)
    {
      printf("someInteger %d on thread %d with index = %d and sum = %d \n",
             myStruct_d->someInteger, tid, k, total/*, myStruct_d->somePointer[i]*/);
      /* Print the value of myStruct_d->somePointer[i] for debugging */
      printf("Value at somePointer[%d] = %d\n", k, somePointer[k]);
    }
}

/* Abort with a message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
}

int main(){
  /* myStruct is initialized on the host. */
  someStruct* myStruct = start_struct();
  if (myStruct == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
    }

  someStruct* myStruct_d;
  int* somePointer_d;

  /* NOTE: the struct is copied SHALLOWLY — on the device
     myStruct_d->somePointer still holds the HOST address, which is why
     the int array is also copied into a separate device buffer.
     cudaMalloc takes void**, so the cast is (void**) in both calls
     (the original cast the second one to int**). */
  check_cuda(cudaMalloc((void**)&myStruct_d, sizeof(someStruct)),
             "cudaMalloc myStruct_d");
  check_cuda(cudaMalloc((void**)&somePointer_d, 10*sizeof(int)),
             "cudaMalloc somePointer_d");
  check_cuda(cudaMemcpy(myStruct_d, myStruct, sizeof(someStruct),
                        cudaMemcpyHostToDevice),
             "cudaMemcpy myStruct_d");
  check_cuda(cudaMemcpy(somePointer_d, myStruct->somePointer, 10*sizeof(int),
                        cudaMemcpyHostToDevice),
             "cudaMemcpy somePointer_d");

  for(int counter = 0; counter < 3; counter++){
    print_on_cuda<<<2,10>>>(myStruct_d, somePointer_d);
    check_cuda(cudaGetLastError(), "kernel launch");          /* launch-config errors */
    check_cuda(cudaDeviceSynchronize(), "kernel execution");  /* async in-kernel errors */
    printf("Counter = %d\n", counter);
  }

  check_cuda(cudaFree(myStruct_d), "cudaFree myStruct_d");
  check_cuda(cudaFree(somePointer_d), "cudaFree somePointer_d");
  free(myStruct->somePointer);
  free(myStruct);
  return 0;
}

Here, my objective is to access myStruct_d->somePointer[i] inside the kernel without the need of copying it to somePointer_d using cudaMalloc. Notice that here I can print myStruct_d->someInteger inside my kernel call, but whenever I tried to print myStruct_d->somePointer[i] directly instead of doing the copy to somePointer_d I get nothing. Literally nothing: the execution does not stop or give me a segmentation fault. It simply ignores the printf line with this variable, my computer freezes for a few seconds, and then it ends the execution of the program, printing the rest. (The reason is that cudaMemcpy copies the struct shallowly, so on the device myStruct_d->somePointer still holds a host address, and dereferencing it in the kernel is an invalid memory access.)

I also should say that I tried to use myStruct_d->somePointer for the computing of the variable sum inside the kernel, but the kernel cannot access even the values inside this pointer, so I get a similar problem.

I am searching for a way to fix this problem without having the need to copy pointer by pointer inside the kernel separately, because in my original application my struct is bigger and it would demand too many changes in my functions and parameters. I was taking a look at the function cudaMemcpyToSymbol, for instance, but I am not quite sure I understand what it does and whether it would solve my problem; I am open to any suggestions.

Thanks

I found two solutions to my problem. One is using Unified Memory by means of the function cudaMallocManaged() (as mentioned in cuda - Unified memory and struct with arrays - Stack Overflow). The code is as follows and executes as I intended:

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

/* Example struct holding one scalar and one pointer member.
 * With Unified Memory the struct itself is host/device accessible, but the
 * somePointer member must still be set to a device-valid address. */
typedef struct someStruct
{
  int someInteger;   /* plain value: safe to copy anywhere */
  int *somePointer;  /* must point at device-accessible memory before kernel use */
} someStruct;

/*
 * Allocates and initializes a someStruct on the host:
 * someInteger = 13 and somePointer = 10 ints holding 10..19.
 * Returns NULL if any allocation fails; on partial failure the
 * partially-built struct is released before returning (the original
 * code left the inner malloc unchecked, risking a NULL dereference
 * in the init loop and leaking myStruct).
 */
someStruct* start_struct()
{
  someStruct *myStruct = (someStruct*)malloc(sizeof(someStruct));
  if (myStruct == NULL)
    return NULL;

  myStruct->somePointer = (int*)malloc(10*sizeof(int));
  if (myStruct->somePointer == NULL)
    {
      free(myStruct);   /* avoid leaking the outer allocation */
      return NULL;
    }

  myStruct->someInteger = 13;
  for (int i = 0; i < 10; i++)
    myStruct->somePointer[i] = i + 10;

  return myStruct;
}

/*
 * Debug kernel: each thread sums the 10 ints in somePointer, then prints
 * the struct scalar plus each element read both through the separate
 * array argument and through the struct's somePointer member.
 * Here myStruct_d lives in managed memory and its somePointer member was
 * assigned a DEVICE address on the host, so dereferencing it is valid.
 * Launched as <<<2,10>>>; indexing is 1-D.
 */
__global__ void print_on_cuda(someStruct* myStruct_d, int* somePointer)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  int total = 0;
  for (int k = 0; k < 10; k++)
    total += somePointer[k];

  for (int k = 0; k < 10; k++)
    {
      printf("someInteger %d on thread %d with index = %d and sum = %d \n",
             myStruct_d->someInteger, tid, k, total/*, myStruct_d->somePointer[i]*/);
      /* Print the value of myStruct_d->somePointer[i] for debugging */
      printf("Value at somePointer_d[%d] = %d and myStruct_d->somePointer[%d] = %d\n",
             k, somePointer[k], k, myStruct_d->somePointer[k]);
    }
}

/* Abort with a message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
}

int main(){
  /* myStruct is initialized on the CPU. */
  someStruct* myStruct = start_struct();
  if (myStruct == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
    }

  /* Device copy of the int array (cudaMalloc takes void**). */
  int* somePointer_d;
  check_cuda(cudaMalloc((void**)&somePointer_d, 10*sizeof(int)),
             "cudaMalloc somePointer_d");
  check_cuda(cudaMemcpy(somePointer_d, myStruct->somePointer, 10*sizeof(int),
                        cudaMemcpyHostToDevice),
             "cudaMemcpy somePointer_d");

  /* Unified Memory: the struct is reachable from host and device, so the
     host can store the DEVICE array pointer directly into the member. */
  someStruct* structTest;
  check_cuda(cudaMallocManaged(&structTest, sizeof(someStruct)),
             "cudaMallocManaged structTest");

  structTest->somePointer = somePointer_d;   /* device pointer: valid inside the kernel */
  structTest->someInteger = myStruct->someInteger;

  for(int counter = 0; counter < 3; counter++){
    print_on_cuda<<<2,10>>>(structTest, somePointer_d);
    check_cuda(cudaGetLastError(), "kernel launch");          /* launch-config errors */
    check_cuda(cudaDeviceSynchronize(), "kernel execution");  /* async in-kernel errors */
    printf("Counter = %d\n", counter);
  }

  /* BUG FIX: structTest was allocated with cudaMallocManaged but never
     released in the original code — free it here. */
  check_cuda(cudaFree(structTest), "cudaFree structTest");
  check_cuda(cudaFree(somePointer_d), "cudaFree somePointer_d");
  free(myStruct->somePointer);
  free(myStruct);
  return 0;
}

The other one is performing a deep copy (CUDA : How to allocate memory for data member of a class - Stack Overflow or even more explicitly stated in Copying a struct containing pointers to CUDA device - Stack Overflow). The code is as follows:

#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

/* Example struct holding one scalar and one pointer member.
 * In the deep-copy approach the struct is passed to the kernel by value
 * after its pointer member has been patched to a device address. */
typedef struct someStruct
{
  int someInteger;   /* plain value: safe to pass by value to a kernel */
  int *somePointer;  /* must be set to a device address before kernel use */
} someStruct;

/*
 * Allocates and initializes a someStruct on the host:
 * someInteger = 13 and somePointer = 10 ints holding 10..19.
 * Returns NULL if any allocation fails; on partial failure the
 * partially-built struct is released before returning (the original
 * code left the inner malloc unchecked, risking a NULL dereference
 * in the init loop and leaking myStruct).
 */
someStruct* start_struct()
{
  someStruct *myStruct = (someStruct*)malloc(sizeof(someStruct));
  if (myStruct == NULL)
    return NULL;

  myStruct->somePointer = (int*)malloc(10*sizeof(int));
  if (myStruct->somePointer == NULL)
    {
      free(myStruct);   /* avoid leaking the outer allocation */
      return NULL;
    }

  myStruct->someInteger = 13;
  for (int i = 0; i < 10; i++)
    myStruct->somePointer[i] = i + 10;

  return myStruct;
}

/*
 * Debug kernel: each thread sums the 10 ints in somePointer, then prints
 * the struct scalar plus each element read both through the separate
 * array argument and through the struct's somePointer member.
 * The struct is received BY VALUE; its somePointer member was set on the
 * host to a device address, so dereferencing it here is valid.
 * Launched as <<<2,10>>>; indexing is 1-D.
 */
__global__ void print_on_cuda(someStruct myStruct_d, int* somePointer)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  int total = 0;
  for (int k = 0; k < 10; k++)
    total += somePointer[k];

  for (int k = 0; k < 10; k++)
    {
      printf("someInteger %d on thread %d with index = %d and sum = %d \n",
             myStruct_d.someInteger, tid, k, total);
      /* Print the value of myStruct_d->somePointer[i] for debugging */
      printf("Value at somePointer_d[%d] = %d and myStruct_d->somePointer[%d] = %d\n",
             k, somePointer[k], k, myStruct_d.somePointer[k]);
    }
}

/* Abort with a message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
}

int main(){
  /* myStruct is initialized on the CPU. */
  someStruct* myStruct = start_struct();
  if (myStruct == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
    }

  /* Host-side struct that will be passed BY VALUE to the kernel. */
  someStruct myStruct_d;
  int* somePointer_d;

  /* Device copy of the int array (cudaMalloc takes void**). */
  check_cuda(cudaMalloc((void**)&somePointer_d, 10*sizeof(int)),
             "cudaMalloc somePointer_d");
  check_cuda(cudaMemcpy(somePointer_d, myStruct->somePointer, 10*sizeof(int),
                        cudaMemcpyHostToDevice),
             "cudaMemcpy somePointer_d");

  /* Manual deep copy: patch the member to the DEVICE address before the
     struct travels to the device as a by-value kernel argument. */
  myStruct_d.somePointer = somePointer_d;
  myStruct_d.someInteger = myStruct->someInteger;

  for(int counter = 0; counter < 3; counter++){
    print_on_cuda<<<2,10>>>(myStruct_d, somePointer_d);
    check_cuda(cudaGetLastError(), "kernel launch");          /* launch-config errors */
    check_cuda(cudaDeviceSynchronize(), "kernel execution");  /* async in-kernel errors */
    printf("Counter = %d\n", counter);
  }

  check_cuda(cudaFree(somePointer_d), "cudaFree somePointer_d");
  free(myStruct->somePointer);
  free(myStruct);
  return 0;
}

According to some references online these two versions might differ in performance: the one using Unified Memory tends to be a little less performant. Also notice that the kernel-calling syntax changes between the two versions, given the way we have to declare the variables structTest (as a pointer) and myStruct_d (not a pointer).

Other references: Clean way of copying a struct with pointers to the GPU , https://developer.nvidia.com/blog/unified-memory-in-cuda-6/ and Help me understand cuda memory management - why is cudaMemcpy (not) needed? .

Also, I am compiling both codes only by doing “$ nvcc myCudaFile.cu”. This might be hiding some errors or warnings that would appear if one or more flags were added to the compilation process.

One possible solution could be to create your ‘own pointers’.

You would be storing all the memory in a global buffer and all the pointers would just store an offset within that global buffer. You also could have a separate buffer for each type, e.g. for SomeStruct and for int. This is especially helpful for data structures, which are densely connected and where the pointer signifies identity; there a deep copy would create more objects and loose the identity of the objects.

You could dereference those objects manually, or by using a specific function for pointer dereference or by creating special smart pointers.

When calling kernels, you just have to copy the global buffer.