C++ class functions for management of data transfer

Hi,
I am working on porting a large CPU code to GPU using CUDA C++. The code extensively uses OOP and we want to retain the paradigm. The structure of the code is such that there is a top level class which has object of other classes as data members. These 2nd-level classes then use objects of other classes as data members. The hierarchy runs 6-7 steps. Now, to transfer the object of the top level class, i have to move down towards the basic class which does not use any other class’ object as data member. For easy data management, i want to put the declarations of the device objects within the class functions. But I am unsure how will I free those objects using cudaFree. Most of the examples I have seen use public data members in the classes and perform data management in main.
I have produced a small example for demonstration. Although, here I have kept some of class data members as public, but in real code, I have accessor functions.

Top level class definition (mixture.cuh)

#ifndef MIXTURE_H
#define MIXTURE_H

#include <cstdint>
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
        
class Mixture
{
    private:
        int* N{nullptr};
	public:
		Compound* eg{nullptr};
		int SIZE{0};
		Mixture() = default;
		Mixture(int sz, int* arr);
		void destroy_Mixture();
		void transfer_to_device(Mixture& dh_xs, Mixture*& d_xs);
		void free_device(Mixture& dh_xs, Mixture*& d_xs);
};

}  // End of namespace

#endif  // MIXTURE_H Closing

Top level class Implementation (mixture.cu)

#include "compound.cuh"
#include "mixture.cuh"
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
		Mixture::Mixture(int sz, int* arr)
        {
			SIZE = sz;
			eg = new Compound(SIZE);
			N = new int [SIZE];
			for(int i = 0; i < SIZE; i++)
				N[i] = arr[i];
		}

		void Mixture::destroy_Mixture()
		{
			if(SIZE > 0)
			{
				delete [] N;
				eg->destroy_Compound();
			}
		}

		void Mixture::transfer_to_device(Mixture& dh_xs, Mixture*& d_xs)
		{	
			Compound dh_eg;  
			cudaMalloc(&d_xs, sizeof(Mixture));
			dh_xs.SIZE = SIZE;
			cudaMalloc(&dh_xs.N, SIZE*sizeof(int));
			cudaMalloc(&dh_xs.eg, sizeof(Compound));
			cudaMemcpy(dh_xs.N, N, SIZE*sizeof(int), cudaMemcpyHostToDevice);
			eg->transfer_to_device(dh_eg);
			cudaMemcpy((dh_xs.eg), &dh_eg, sizeof(Compound), cudaMemcpyHostToDevice);
			cudaMemcpy(d_xs, &dh_xs, sizeof(Mixture), cudaMemcpyHostToDevice);
		}

		void Mixture::free_device(Mixture& dh_xs, Mixture*& d_xs)
		{
			(dh_xs.eg)->free_device();
			cudaFree(dh_xs.eg);
			cudaFree(dh_xs.N);
			cudaFree(d_xs);
		}

}  // End of namespace

==================================
Base level class Definition (compound.cuh)

#ifndef COMPOUND_H
#define COMPOUND_H

#include <cstdint>
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
class Compound
{
    private:
        int N_POINTS{0};
        double urr_start_energy{0.0};
		double* energy{nullptr};     
    public:
		Compound() = default;
		void set_Compound(int size);
        Compound(int size);
		void print_Compound();
        void destroy_Compound();
		void transfer_to_device(Compound& dh_eg);
		void free_device();
        __host__ __device__ double get_energy(int i);		
};
}  // End of namespace GNDL
#endif  // COMPOUND_H Closing

Base level class Implementation (compound.cu)

#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

namespace chem
{
    Compound::Compound(int size) 
	{	set_Compound(size); }


	void Compound::set_Compound(int size)
	{
		energy = new double[size];
		N_POINTS = size;
		for(int i = 0; i < N_POINTS; i++)
			energy[i] = double(size*i);
	}
	
	
	void Compound::print_Compound()
	{
		for(int i = 0; i < N_POINTS; i++)
			std::cout << energy[i] << " ";
		std::cout << std::endl;
	}

	void Compound::destroy_Compound()
	{
		if(N_POINTS > 0)
			delete [] energy;
	}

	void Compound::transfer_to_device(Compound& dh_eg)
	{
		dh_eg.N_POINTS = N_POINTS;
		dh_eg.urr_start_energy = urr_start_energy;
		cudaMalloc(&dh_eg.energy, N_POINTS*sizeof(double));
		cudaMemcpy(dh_eg.energy, energy, N_POINTS*sizeof(double), cudaMemcpyHostToDevice);
	}


	__host__ __device__ double Compound::get_energy(int i)
	{	return energy[i];   }


	void Compound::free_device()
	{	cudaFree(energy);	}
          // Above line creates Seg fault
}  // End of namespace

=========
main

#include "mixture.cuh"
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

using namespace chem;


__global__ void calculate_XS(int* i, int* j, Mixture* XSS, double* result)
{
	int idx=threadIdx.x + blockDim.x*blockIdx.x;
	if(idx >= XSS->SIZE)
		return;
	if(i[idx] >= XSS->SIZE)
		result[idx] = -444;
	else
	{
		double val = 0.0;
		val = (XSS->eg)->get_energy(j[idx]);
		result[idx] = val;
	}
}

int main()
{
	int SZ = 10;
	int kk;
    int a[SZ];
    int b[SZ];
	double result[SZ];
	int arr[SZ] = {3, 4, 5, 7, 3, 6, 8, 9, 3, 2};
	int* d_a;
	int* d_b;
	double* d_result;
	Mixture dh_xs;
	Mixture* d_xs;

	Mixture XS(SZ, arr);
	XS.transfer_to_device(dh_xs, d_xs);
    for(kk = 0; kk < SZ; kk++)
	{
		a[kk] = kk;
		b[kk] = arr[kk]-1;
	}
	// trasnfer a and b
	cudaMalloc( &d_a ,SZ*sizeof(int));
	cudaMalloc( &d_b ,SZ*sizeof(int));
	cudaMemcpy(d_a, a, SZ*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SZ*sizeof(int), cudaMemcpyHostToDevice);
	// transfer result
	cudaMalloc( &d_result ,SZ*sizeof(double));
	cudaMemcpy(d_result, result, SZ*sizeof(double), cudaMemcpyHostToDevice);
    calculate_XS<<<1, 32>>>(d_a, d_b, d_xs, d_result);
	cudaDeviceSynchronize();
	cudaMemcpy(result, d_result, SZ*sizeof(double), cudaMemcpyDeviceToHost);
	std::cout << "Printing in CPU after returning\n";
	for(kk = 0; kk < SZ; kk++)
		std::cout << result[kk] << std::endl;
	
	XS.free_device(dh_xs, d_xs);
	cudaFree(d_result);
	cudaFree(d_a);
	cudaFree(d_b);
	std::cout << "Successful termination\n";
	return 0;
}

I perform compilation using following
nvcc -o test compound.cu mixture.cu main.cu -arch=sm_86 -lcudart -rdc=true

When I run the above program, the result is correct but i get a segmentation fault while freeing the memory.
Any help would be appreciated.
Thank you!

A seg fault always originates from host code activity, and can be isolated to a single line of code that you wrote, that does not involve calling a function or class member that you wrote. As a matter of courtesy to others that you are asking help from, my suggestion is, if you have not isolated this line, do so. Once you have isolated this line, share it with others.

The proximal line for the seg fault is here:

cudaFree requires a pointer in host memory. Yes, that pointer should point to device memory, but the storage location of that pointer is in host memory. Similar syntax to cudaMalloc in terms of the address/location of the pointer you pass to it. That address must be an address in host memory (or you will likewise get a seg fault there.)

I would say the trouble starts here:

That is passing a reference to dh_xs, a basically uninitialized Mixture object, to the transfer_to_device method. That method initializes dh_xs with a pointer to device memory here:

That is creating storage for a Compound object in device memory, and the storage for the pointer that points to that object is in host memory at dh_xs.eg.

That Compound object has a pointer called energy. The location of the address of energy is in device memory, not host memory.

When you go to free it with cudaFree, you are passing a location that is in device memory. Not only does the pointer value point to device memory, but the address of the pointer is in device memory also.

This doesn’t work either for cudaMalloc or cudaFree, and you get a seg fault, when host code (cudaFree library routine, executing on the CPU) attempts to directly access a location in device memory, to retrieve the pointer value.

There are at least 2 simplistic methods to solve this.

  1. When you allocate for such a device pointer stored in device memory using cudaMalloc, you must have had a host variable copy of it, that you copied to device memory. Use that host variable to free the allocated memory.

  2. Copy the energy pointer value back to the host, before freeing it.

You happen to initialize that energy pointer here:

In particular, in the last line:

So the energy there, that you copy to dh_eg.energy, is your host copy of the value you would want to free (method 1).

As a simple test/proof point (method 2), you could do this:

void Compound::free_device()
{	double *temp; 
    cudaMemcpy(&temp, &this->energy, sizeof (double *), cudaMemcpyDeviceToHost);
    cudaFree(temp);	}
1 Like

Thank you for your reply. I understand the problem now and the solution designated as Method 2.
Can you please clarify one more thing about the Method 1.
What I understood from the Method 1 is that in the free_device function in mixture.cu, If i comment the first line, it will be sufficient and I have nothing else to do as I am already clearing the host memory in destroy_Mixture() that i copied to dh_eg.energy
So, the only change I need to do for Method 1 is as below.

void Mixture::free_device(Mixture& dh_xs, Mixture*& d_xs)
		{
			//(dh_xs.eg)->free_device();
			cudaFree(dh_xs.eg);
			cudaFree(dh_xs.N);
			cudaFree(d_xs);
		}

The line cudaFree(dh_xs.eg); will be sufficient for clearing the object memory of the Compound object?
Is my understanding for the Method 1 correct?

No, not correct.

This is no different than if you allocate a top level object with malloc() and then allocate an embedded pointer with malloc(). Calling free() on the top level object will not free the memory allocated for the embedded pointer. cudaMalloc is the same in this respect.

Method 1 refers to the idea that when you created the allocation for an embedded pointer, you used a cudaMalloc call on a pointer variable stored in host memory (and then copied that pointer from host memory to the device object). So when you use cudaMalloc on the pointer stored in host memory, save that value in a host variable, and then use cudaFree on it, when you are ready.

1 Like

Thank you, again. I got the point.

In succession to the problem I had, I modified the function as below.
In main.cu, I am calling the free_device( ) as below.

dh_xs.free_device();
	cudaFree(d_xs);

The function free_device( ) in the file mixture.cu is now modified as below.

void Mixture::free_device()
{
	eg->free_device();
	cudaFree(eg);
	cudaFree(N);
}

I did not get an error. Just want to confirm that this method is logically correct for freeing the memory?