I found two solutions to my problem. One uses Unified Memory by means of the function cudaMallocManaged() (as mentioned in cuda - Unified memory and struct with arrays - Stack Overflow). The code is as follows and executes as I intended:
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
/* Test payload used by the example: one integer plus a pointer to int. */
typedef struct someStruct
{
    int  someInteger;  // plain scalar member
    int* somePointer;  // address of the first element of an int array
} someStruct;
/*
 * Builds the host-side test structure.
 *
 * Allocates a someStruct with malloc, sets someInteger to 13 and fills a
 * freshly allocated 10-element int array with the values 10..19.
 *
 * Returns the new struct, or NULL if either allocation fails. On success the
 * caller must free() both somePointer and the struct itself.
 */
someStruct* start_struct()
{
    someStruct *myStruct = (someStruct*)malloc(sizeof(someStruct));
    if (!myStruct)
        return NULL;

    myStruct->somePointer = (int*)malloc(10 * sizeof(int));
    if (!myStruct->somePointer)
    {
        // Never hand back a half-built struct: the fill loop below would
        // otherwise write through a NULL array pointer.
        free(myStruct);
        return NULL;
    }

    myStruct->someInteger = 13;
    for (int i = 0; i < 10; i++)
        myStruct->somePointer[i] = i + 10;

    return myStruct;
}
/*
 * Debug kernel: each thread that runs it prints the struct's integer together
 * with ten array values, read once through the separate pointer argument and
 * once through the pointer stored inside the struct.
 *
 * myStruct_d  - struct whose someInteger and somePointer[0..9] are printed
 * somePointer - array whose first ten elements are summed and printed
 */
__global__ void print_on_cuda(someStruct* myStruct_d, int* somePointer)
{
    const int globalThread = threadIdx.x + blockDim.x * blockIdx.x;

    // Total of the ten elements reachable through the pointer argument.
    int total = 0;
    int k = 0;
    while (k < 10)
    {
        total += somePointer[k];
        ++k;
    }

    for (int idx = 0; idx != 10; ++idx)
    {
        printf("someInteger %d on thread %d with index = %d and sum = %d \n", myStruct_d->someInteger, globalThread, idx, total);
        // Same element via both access paths, for comparison while debugging.
        printf("Value at somePointer_d[%d] = %d and myStruct_d->somePointer[%d] = %d\n", idx, somePointer[idx], idx, myStruct_d->somePointer[idx]);
    }
}
/* Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Unified Memory variant of the example.
 *
 * The int array is copied into a cudaMalloc'ed buffer, while the struct that
 * points at that buffer is allocated with cudaMallocManaged so it can be
 * filled in on the host and then dereferenced inside the kernel.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any host or CUDA error.
 */
int main(){
    // myStruct is initialized on CPU
    someStruct* myStruct = start_struct();
    if (!myStruct)
    {
        fprintf(stderr, "Host allocation of the test struct failed\n");
        return EXIT_FAILURE;
    }

    // Allocating and copying to some test pointer
    int* somePointer_d = NULL;
    check_cuda(cudaMalloc((void**)&somePointer_d, 10 * sizeof(int)), "cudaMalloc");
    check_cuda(cudaMemcpy(somePointer_d, myStruct->somePointer, 10 * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy");

    // Using Unified Memory property: the struct is written here on the host
    // and read by the kernel through the same pointer.
    someStruct* structTest = NULL;
    check_cuda(cudaMallocManaged(&structTest, sizeof(someStruct)), "cudaMallocManaged");
    structTest->somePointer = somePointer_d;
    structTest->someInteger = myStruct->someInteger;

    for(int counter = 0; counter < 3; counter++){
        print_on_cuda<<<2,10>>>(structTest, somePointer_d);
        // A launch does not return a status itself: query it explicitly, then
        // synchronize so that errors raised while the kernel runs surface here.
        check_cuda(cudaGetLastError(), "print_on_cuda launch");
        check_cuda(cudaDeviceSynchronize(), "print_on_cuda execution");
        printf("Counter = %d\n", counter);
    }

    // The managed struct must be released too; the original leaked it.
    check_cuda(cudaFree(structTest), "cudaFree(structTest)");
    check_cuda(cudaFree(somePointer_d), "cudaFree(somePointer_d)");
    free(myStruct->somePointer);
    free(myStruct);
    return 0;
}
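The other one performs a deep copy (see CUDA : How to allocate memory for data member of a class - Stack Overflow or, stated even more explicitly, Copying a struct containing pointers to CUDA device - Stack Overflow): the array is copied to the device explicitly, and a struct holding the resulting device pointer is passed to the kernel by value. The code is as follows: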
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
/* Test payload used by the example: one integer plus a pointer to int. */
typedef struct someStruct
{
    int  someInteger;  // plain scalar member
    int* somePointer;  // address of the first element of an int array
} someStruct;
/*
 * Builds the host-side test structure.
 *
 * Allocates a someStruct with malloc, sets someInteger to 13 and fills a
 * freshly allocated 10-element int array with the values 10..19.
 *
 * Returns the new struct, or NULL if either allocation fails. On success the
 * caller must free() both somePointer and the struct itself.
 */
someStruct* start_struct()
{
    someStruct *myStruct = (someStruct*)malloc(sizeof(someStruct));
    if (!myStruct)
        return NULL;

    myStruct->somePointer = (int*)malloc(10 * sizeof(int));
    if (!myStruct->somePointer)
    {
        // Never hand back a half-built struct: the fill loop below would
        // otherwise write through a NULL array pointer.
        free(myStruct);
        return NULL;
    }

    myStruct->someInteger = 13;
    for (int i = 0; i < 10; i++)
        myStruct->somePointer[i] = i + 10;

    return myStruct;
}
/*
 * Debug kernel: each thread that runs it prints the struct's integer together
 * with ten array values, read once through the separate pointer argument and
 * once through the pointer stored inside the by-value struct argument.
 *
 * myStruct_d  - struct copy whose someInteger and somePointer[0..9] are printed
 * somePointer - array whose first ten elements are summed and printed
 */
__global__ void print_on_cuda(someStruct myStruct_d, int* somePointer)
{
    const int globalThread = threadIdx.x + blockDim.x * blockIdx.x;

    // Total of the ten elements reachable through the pointer argument.
    int total = 0;
    int k = 0;
    while (k < 10)
    {
        total += somePointer[k];
        ++k;
    }

    for (int idx = 0; idx != 10; ++idx)
    {
        printf("someInteger %d on thread %d with index = %d and sum = %d \n", myStruct_d.someInteger, globalThread, idx, total);
        // Same element via both access paths, for comparison while debugging.
        printf("Value at somePointer_d[%d] = %d and myStruct_d->somePointer[%d] = %d\n", idx, somePointer[idx], idx, myStruct_d.somePointer[idx]);
    }
}
/* Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Pass-by-value variant of the example.
 *
 * The int array is copied into a cudaMalloc'ed buffer; a host-side someStruct
 * is then filled with that buffer's address and the integer, and handed to
 * the kernel by value as a launch argument.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any host or CUDA error.
 */
int main(){
    // myStruct is initialized on CPU
    someStruct* myStruct = start_struct();
    if (!myStruct)
    {
        fprintf(stderr, "Host allocation of the test struct failed\n");
        return EXIT_FAILURE;
    }

    // Allocating and copying to some test pointer
    int* somePointer_d = NULL;
    check_cuda(cudaMalloc((void**)&somePointer_d, 10 * sizeof(int)), "cudaMalloc");
    check_cuda(cudaMemcpy(somePointer_d, myStruct->somePointer, 10 * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy");

    // Struct that is passed to the kernel by value; its pointer member holds
    // the address returned by cudaMalloc above.
    someStruct myStruct_d;
    myStruct_d.somePointer = somePointer_d;
    myStruct_d.someInteger = myStruct->someInteger;

    for(int counter = 0; counter < 3; counter++){
        print_on_cuda<<<2,10>>>(myStruct_d, somePointer_d);
        // A launch does not return a status itself: query it explicitly, then
        // synchronize so that errors raised while the kernel runs surface here.
        check_cuda(cudaGetLastError(), "print_on_cuda launch");
        check_cuda(cudaDeviceSynchronize(), "print_on_cuda execution");
        printf("Counter = %d\n", counter);
    }

    check_cuda(cudaFree(somePointer_d), "cudaFree(somePointer_d)");
    free(myStruct->somePointer);
    free(myStruct);
    return 0;
}
According to some references online, these two versions might differ in performance: the one using Unified Memory tends to be a little less performant. Also notice that the kernel's signature, and the way it accesses the struct's members, differ between the two versions, because structTest is declared as a pointer (members accessed with ->) while myStruct_d is not a pointer (passed by value, members accessed with .).
Other references: Clean way of copying a struct with pointers to the GPU, https://developer.nvidia.com/blog/unified-memory-in-cuda-6/ and Help me understand cuda memory management - why is cudaMemcpy (not) needed?
Also, I am compiling both codes simply with “$ nvcc myCudaFile.cu”. Some errors or warnings that this plain invocation does not show might appear when more flags are added to the compilation.