How to copy Device Struct with pointers to Host?

Hi, I’m pretty new to CUDA programming and I’m having a little bit of a problem trying to use cudaMemCpy correctly to copy a device array back to the host array.

This is the struct that I have:

typedef struct {
   int width;
   int height;
   float* elements;
} Matrix;

So, the host allocates the Struct POINTER with “new” (it’s a C++ program).
Let’s say:
matrix.cpp

Matrix* C = NULL; // C = A * B
   C = new Matrix;
   C->height = A->height;
   C->width = B->width;
   C->elements = new float[C->height * C->width](); // initialize it all to '0'

in my matrix.cu file is where I’m having the problem. I think I got it to copy the Matrix to the Device correctly, but when the kernel execution is done, I want to store the result back in “C->elements”.

This is what I did in matrix.cu:

// Matrix* A, Matrix* B, Matrix* C is allocated in the .cpp file w/ new() and passed here

   Matrix* hostMatrix = C; // let's say we're copying C to the device and then back
   Matrix* deviceMatrix = NULL;
   float*  d_elements;

   // allocate the deviceMatrix and d_elements
   cudaMalloc(&deviceMatrix, sizeof(Matrix))
   int size = hostMatrix->width * hostMatrix->height * sizeof(float);
   cudaMalloc(&d_elements, size)

   // copy each piece of data separately                                        
   cudaMemcpy(deviceMatrix, hostMatrix, sizeof(Matrix), cudaMemcpyHostToDevice)
   cudaMemcpy(d_elements, hostMatrix->elements, size, cudaMemcpyHostToDevice)
   cudaMemcpy(&(deviceMatrix->elements), &d_elements, sizeof(float*), cudaMemcpyHostToDevice)

   // so far so good, no compilation errors from HOST -> DEVICE

   // call kernel (let's say it changes deviceMatrix->elements)
   ...

   // now I want to store the new elements to hostMatrix->elements (remember, this was allocated w/ new)
   cudaMemcpy((hostMatrix->elements), (deviceMatrix->elements), size, cudaMemcpyDeviceToHost);
   // SEGFAULT!!! Program received signal SIGSEGV, Segmentation fault.

   // don't forget to free the device pointers

In the given snippet of code, how can I fix line 23 such that I’ll use cudaMemcpy correctly to copy the deviceMatrix->elements correctly to hostMatrix->elements ?
Thanks

cudaMemcpy(hostMatrix->elements, d_elements, size, cudaMemcpyDeviceToHost);

and check CUDA API returns on every API function, or at least run your code with cuda-memcheck

thanks “txbob”! that has fixed it!

Ok, the original question was about copying a Host Struct allocated pointer to the Device, however, how do I do the opposite?

I keep getting an error when I try the following (snippet): SEE LINE 71

typedef struct {
   int width;
   int height;
   float* elements;
} Matrix;

template <int BLOCK_SIZE>
__global__ void matrixMulCUDA(Matrix* A, Matrix* B, Matrix* C);

__host__ void allocateDeviceMatrix(Matrix* hostMatrix, Matrix* deviceMatrix)
{
   float* d_elements;

   // allocate the Device Matrix                                                
   if (cudaMalloc(&deviceMatrix, sizeof(Matrix)) != cudaSuccess)
      printf("FAILED TO ALLOCATE MATRIX ON DEVICE!\n");

   // allocate the elements: deviceMatrix->elements                             
   int size = hostMatrix->width * hostMatrix->height * sizeof(float);
   if (cudaMalloc(&d_elements, size) != cudaSuccess)
      printf("FAILED TO ALLOCATE MATRIX ELEMENTS ON DEVICE!\n");

   // copy each piece of data separately                                        
   if (cudaMemcpy(deviceMatrix, hostMatrix, sizeof(Matrix), cudaMemcpyHostToDev\
ice) != cudaSuccess)
      printf("FAILED TO COPY MATRIX TO DEVICE!\n");
   if (cudaMemcpy(d_elements, hostMatrix->elements, size, cudaMemcpyHostToDevic\
e) != cudaSuccess)
      printf("FAILED TO COPY MATRIX ELEMENTS TO DEVICE!\n");
   if (cudaMemcpy(&(deviceMatrix->elements), &d_elements, sizeof(float*), cudaM\
emcpyHostToDevice) != cudaSuccess)
      printf("FAILED TO COPY MATRIX ELEMENTS POINTER TO DEVICE!\n");
}

void matrixMulOnDevice(Matrix* A, Matrix* B, Matrix* C)
{
   // query the Device and decide on the block size                             
   int devID = 0; // the default device ID                                      
   cudaError_t error;
   cudaDeviceProp deviceProp;
   error = cudaGetDevice(&devID);
   if (error != cudaSuccess)
   {
      printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE_\
_);
      exit(-1);
   }

   error = cudaGetDeviceProperties(&deviceProp, devID);

   if (deviceProp.computeMode == cudaComputeModeProhibited ||
       error != cudaSuccess)
   {
      // print error and return                                                 
   }

   // Use a larger block size for Fermi and above                               
   int block_size = (deviceProp.major < 2) ? 16 : 32;

   Matrix* d_A = NULL;
   Matrix* d_B = NULL;
   Matrix* d_C = NULL;

   // allocate the matrices on the device                                       
   allocateDeviceMatrix(A, d_A);
   allocateDeviceMatrix(B, d_B);
   allocateDeviceMatrix(C, d_C);

   // HERE IS WHEN I TRY TO COPY IT BACK (FOR TESTING PURPOSES...)
   int mem_size_C = C->height * C->width * sizeof(float);
   error = cudaMemcpy(C->elements, d_C->elements, mem_size_C, cudaMemcpyDeviceT\
oHost);
   // SEGFAULT!!!
   if (error != cudaSuccess)
   {
      printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, \
__LINE__);
      exit(EXIT_FAILURE);
   }
   displayTheMatrix(C);

   ...
}

So… on line 71, I get a segfault!! allocateDeviceMatrix() should have copied HostMatrix C to DeviceMatrix d_C. So on that line I try to copy the contents back and see if C->elements have changed, however I keep getting errors. Can anyone help me out with this problem??
Thanks

I think the problem is in

d_C->elements

d_C is a device object and you try to access its member variable on host. You could do the same as in allocateDeviceMatrix(). There you copy the pointer to elements to the device. Now you need to copy the address back to use it. I think it would be more convenient to keep the address of

float* d_elements;

on host instead of the pointer to the device matrix which is not needed in host code (unless your matrix object has more data than the matrix elements).

If you just copy the entire structure back to the host, you’ll get everything back including the data (just reverse line 13 of the code)? That’s how I handle my arrays within structures and it works fine. But going from host to device is an entirely different story.

so, if what blade613x says is correct, the cudaMemCpy from device to host does a deep copy? (in other words, the pointer inside the struct will also be copied?)

The thing is that the hostMatrix C is already pre-allocated in c++ with the “new” operator.

I tried what “blade613x” suggested, but cudaMemCpy returned error code 11.

cudaMemcpy(C, d_C, sizeof(Matrix), cudaMemcpyDeviceToHost)

Is my syntax correct?

@hadschi118, you’re right. I might be over complicating things by using structs on the device. I might go that route if I can’t figure how to use structs on the device and back.

Your syntax looks correct. No idea why it doesn’t work. What’s the output of the compilation?

I get no compilation errors. @MutantJohn, note that both “C” and “d_C” are pointers.
but when I ran the program:

...
   Matrix* C; // this is passed here, so is already allocated and etc...
   Matrix* d_C;

   // HOST TO DEVICE
   allocateDeviceMatrix(C, d_C); // allocates / sets d_C
   ...

   // DEVICE TO HOST
   error = cudaMemcpy(C, d_C, sizeof(Matrix), cudaMemcpyDeviceToHost);
   if (error != cudaSuccess)
   {
      printf("cudaMemcpy (C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
      exit(EXIT_FAILURE);
   }

it gives me this output error:

cudaMemcpy (C,d_C) returned error code 11, line(293)

Your deviceMatrix pointer is passed by value:

allocateDeviceMatrix(C, d_C);

and:

__host__ void allocateDeviceMatrix(Matrix* hostMatrix, Matrix* deviceMatrix)

Passing by value means a copy of the pointer is made for the function, separate from what exists in the calling environment.

If you then take the address of this copy:

if (cudaMalloc(&deviceMatrix, sizeof(Matrix)) != cudaSuccess)

that line of code will work, but the value of d_C in the calling environment will not be modified.

Later, when you return to the calling environment, and try to use d_C, you will get unexpected results:

error = cudaMemcpy(C, d_C, sizeof(Matrix), cudaMemcpyDeviceToHost);

You can fix this by either passing the address of the deviceMatrix (your function will have to be modified to handle a double-pointer accordingly), or else pass by reference instead of by value:

__host__ void allocateDeviceMatrix(Matrix* hostMatrix, Matrix& deviceMatrix)

Awesome! thanks a lot for your input guys! I was able to get it working after your comments :)

I did something like this:

// modified the kernel prototype ~ C = A * B
template <int BLOCK_SIZE>
__global__ void matrixMulCUDA(const Matrix A, const Matrix B, Matrix C);

// modified this function
__host__ void allocateDeviceMatrix(Matrix* hostMatrix, Matrix& deviceMatrix)
{
   deviceMatrix.width  = hostMatrix->width;
   deviceMatrix.height = hostMatrix->height;

   // allocate the elements: deviceMatrix.elements                             
   int size = hostMatrix->width * hostMatrix->height * sizeof(float);
   if (cudaMalloc(&deviceMatrix.elements, size) != cudaSuccess)
      printf("FAILED TO ALLOCATE MATRIX ELEMENTS ON DEVICE!\n");

   // copy matrix elements
   if (cudaMemcpy(deviceMatrix.elements, hostMatrix->elements, size, cudaMemcpyHostToDevice) != cudaSuccess)
      printf("FAILED TO COPY MATRIX TO DEVICE!\n");
}

// the pointers passed in here are coming from a C++ file (allocated w/ new)
void matrixMulOnDevice(Matrix* A, Matrix* B, Matrix* C)
{
   ...
   Matrix d_A;
   Matrix d_B;
   Matrix d_C;

   // allocate the matrices on the device                                       
   allocateDeviceMatrix(A, d_A);
   allocateDeviceMatrix(B, d_B);
   allocateDeviceMatrix(C, d_C);

   // setup the execution configuration                                         
   dim3 dimGrid(1, 1);
   dim3 dimBlock(A->width, A->width);

   matrixMulCUDA<32><<< dimGrid, dimBlock >>>(d_A, d_B, d_C);

   // copy the result from device to host                                       
   error = cudaMemcpy(C->elements, d_C.elements, mem_size_C, cudaMemcpyDeviceToHost);
   if (error != cudaSuccess)
   {
      printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
      exit(EXIT_FAILURE);
   }

   ...
}