C++ class functions for management of data transfer

Ismkhan · February 12, 2024, 1:05am

Hi,
I am working on porting a large CPU code to GPU using CUDA C++. The code extensively uses OOP and we want to retain the paradigm. The structure of the code is such that there is a top level class which has object of other classes as data members. These 2nd-level classes then use objects of other classes as data members. The hierarchy runs 6-7 steps. Now, to transfer the object of the top level class, i have to move down towards the basic class which does not use any other class’ object as data member. For easy data management, i want to put the declarations of the device objects within the class functions. But I am unsure how will I free those objects using cudaFree. Most of the examples I have seen use public data members in the classes and perform data management in main.
I have produced a small example for demonstration. Although, here I have kept some of class data members as public, but in real code, I have accessor functions.

Top level class definition (mixture.cuh)

#ifndef MIXTURE_H
#define MIXTURE_H

#include <cstdint>
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
        
class Mixture
{
    private:
        int* N{nullptr};
	public:
		Compound* eg{nullptr};
		int SIZE{0};
		Mixture() = default;
		Mixture(int sz, int* arr);
		void destroy_Mixture();
		void transfer_to_device(Mixture& dh_xs, Mixture*& d_xs);
		void free_device(Mixture& dh_xs, Mixture*& d_xs);
};

}  // End of namespace

#endif  // MIXTURE_H Closing

Top level class Implementation (mixture.cu)

#include "compound.cuh"
#include "mixture.cuh"
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
		Mixture::Mixture(int sz, int* arr)
        {
			SIZE = sz;
			eg = new Compound(SIZE);
			N = new int [SIZE];
			for(int i = 0; i < SIZE; i++)
				N[i] = arr[i];
		}

		void Mixture::destroy_Mixture()
		{
			if(SIZE > 0)
			{
				delete [] N;
				eg->destroy_Compound();
			}
		}

		void Mixture::transfer_to_device(Mixture& dh_xs, Mixture*& d_xs)
		{	
			Compound dh_eg;  
			cudaMalloc(&d_xs, sizeof(Mixture));
			dh_xs.SIZE = SIZE;
			cudaMalloc(&dh_xs.N, SIZE*sizeof(int));
			cudaMalloc(&dh_xs.eg, sizeof(Compound));
			cudaMemcpy(dh_xs.N, N, SIZE*sizeof(int), cudaMemcpyHostToDevice);
			eg->transfer_to_device(dh_eg);
			cudaMemcpy((dh_xs.eg), &dh_eg, sizeof(Compound), cudaMemcpyHostToDevice);
			cudaMemcpy(d_xs, &dh_xs, sizeof(Mixture), cudaMemcpyHostToDevice);
		}

		void Mixture::free_device(Mixture& dh_xs, Mixture*& d_xs)
		{
			(dh_xs.eg)->free_device();
			cudaFree(dh_xs.eg);
			cudaFree(dh_xs.N);
			cudaFree(d_xs);
		}

}  // End of namespace

==================================
Base level class Definition (compound.cuh)

#ifndef COMPOUND_H
#define COMPOUND_H

#include <cstdint>
#include <cuda.h>
#include <cuda_runtime.h>

namespace chem
{
class Compound
{
    private:
        int N_POINTS{0};
        double urr_start_energy{0.0};
		double* energy{nullptr};     
    public:
		Compound() = default;
		void set_Compound(int size);
        Compound(int size);
		void print_Compound();
        void destroy_Compound();
		void transfer_to_device(Compound& dh_eg);
		void free_device();
        __host__ __device__ double get_energy(int i);		
};
}  // End of namespace GNDL
#endif  // COMPOUND_H Closing

Base level class Implementation (compound.cu)

#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

namespace chem
{
    Compound::Compound(int size) 
	{	set_Compound(size); }


	void Compound::set_Compound(int size)
	{
		energy = new double[size];
		N_POINTS = size;
		for(int i = 0; i < N_POINTS; i++)
			energy[i] = double(size*i);
	}
	
	
	void Compound::print_Compound()
	{
		for(int i = 0; i < N_POINTS; i++)
			std::cout << energy[i] << " ";
		std::cout << std::endl;
	}

	void Compound::destroy_Compound()
	{
		if(N_POINTS > 0)
			delete [] energy;
	}

	void Compound::transfer_to_device(Compound& dh_eg)
	{
		dh_eg.N_POINTS = N_POINTS;
		dh_eg.urr_start_energy = urr_start_energy;
		cudaMalloc(&dh_eg.energy, N_POINTS*sizeof(double));
		cudaMemcpy(dh_eg.energy, energy, N_POINTS*sizeof(double), cudaMemcpyHostToDevice);
	}


	__host__ __device__ double Compound::get_energy(int i)
	{	return energy[i];   }


	void Compound::free_device()
	{	cudaFree(energy);	}
          // Above line creates Seg fault
}  // End of namespace

=========
main

#include "mixture.cuh"
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

using namespace chem;


__global__ void calculate_XS(int* i, int* j, Mixture* XSS, double* result)
{
	int idx=threadIdx.x + blockDim.x*blockIdx.x;
	if(idx >= XSS->SIZE)
		return;
	if(i[idx] >= XSS->SIZE)
		result[idx] = -444;
	else
	{
		double val = 0.0;
		val = (XSS->eg)->get_energy(j[idx]);
		result[idx] = val;
	}
}

int main()
{
	int SZ = 10;
	int kk;
    int a[SZ];
    int b[SZ];
	double result[SZ];
	int arr[SZ] = {3, 4, 5, 7, 3, 6, 8, 9, 3, 2};
	int* d_a;
	int* d_b;
	double* d_result;
	Mixture dh_xs;
	Mixture* d_xs;

	Mixture XS(SZ, arr);
	XS.transfer_to_device(dh_xs, d_xs);
    for(kk = 0; kk < SZ; kk++)
	{
		a[kk] = kk;
		b[kk] = arr[kk]-1;
	}
	// trasnfer a and b
	cudaMalloc( &d_a ,SZ*sizeof(int));
	cudaMalloc( &d_b ,SZ*sizeof(int));
	cudaMemcpy(d_a, a, SZ*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SZ*sizeof(int), cudaMemcpyHostToDevice);
	// transfer result
	cudaMalloc( &d_result ,SZ*sizeof(double));
	cudaMemcpy(d_result, result, SZ*sizeof(double), cudaMemcpyHostToDevice);
    calculate_XS<<<1, 32>>>(d_a, d_b, d_xs, d_result);
	cudaDeviceSynchronize();
	cudaMemcpy(result, d_result, SZ*sizeof(double), cudaMemcpyDeviceToHost);
	std::cout << "Printing in CPU after returning\n";
	for(kk = 0; kk < SZ; kk++)
		std::cout << result[kk] << std::endl;
	
	XS.free_device(dh_xs, d_xs);
	cudaFree(d_result);
	cudaFree(d_a);
	cudaFree(d_b);
	std::cout << "Successful termination\n";
	return 0;
}

I perform compilation using following
nvcc -o test compound.cu mixture.cu main.cu -arch=sm_86 -lcudart -rdc=true

When I run the above program, the result is correct but i get a segmentation fault while freeing the memory.
Any help would be appreciated.
Thank you!

Robert_Crovella · February 12, 2024, 4:40am

A seg fault always originates from host code activity, and can be isolated to a single line of code that you wrote, that does not involve calling a function or class member that you wrote. As a matter of courtesy to others that you are asking help from, my suggestion is, if you have not isolated this line, do so. Once you have isolated this line, share it with others.

The proximal line for the seg fault is here:

cudaFree requires a pointer in host memory. Yes, that pointer should point to device memory, but the storage location of that pointer is in host memory. Similar syntax to cudaMalloc in terms of the address/location of the pointer you pass to it. That address must be an address in host memory (or you will likewise get a seg fault there.)

I would say the trouble starts here:

That is passing a reference to dh_xs, a basically uninitialized Mixture object, to the transfer_to_device method. That method initializes dh_xs with a pointer to device memory here:

That is creating storage for a Compound object in device memory, and the storage for the pointer that points to that object is in host memory at dh_xs.eg.

That Compound object has a pointer called energy. The location of the address of energy is in device memory, not host memory.

When you go to free it with cudaFree, you are passing a location that is in device memory. Not only does the pointer value point to device memory, but the address of the pointer is in device memory also.

This doesn’t work either for cudaMalloc or cudaFree, and you get a seg fault, when host code (cudaFree library routine, executing on the CPU) attempts to directly access a location in device memory, to retrieve the pointer value.

There are at least 2 simplistic methods to solve this.

When you allocate for such a device pointer stored in device memory using cudaMalloc, you must have had a host variable copy of it, that you copied to device memory. Use that host variable to free the allocated memory.
Copy the energy pointer value back to the host, before freeing it.

You happen to initialize that energy pointer here:

Ismkhan:

	void Compound::transfer_to_device(Compound& dh_eg)
	{
		dh_eg.N_POINTS = N_POINTS;
		dh_eg.urr_start_energy = urr_start_energy;
		cudaMalloc(&dh_eg.energy, N_POINTS*sizeof(double));
		cudaMemcpy(dh_eg.energy, energy, N_POINTS*sizeof(double), cudaMemcpyHostToDevice);
	}

In particular, in the last line:

So the energy there, that you copy to dh_eg.energy, is your host copy of the value you would want to free (method 1).

As a simple test/proof point (method 2), you could do this:

void Compound::free_device()
{	double *temp; 
    cudaMemcpy(&temp, &this->energy, sizeof (double *), cudaMemcpyDeviceToHost);
    cudaFree(temp);	}

Ismkhan · February 12, 2024, 6:12am

Thank you for your reply. I understand the problem now and the solution designated as Method 2.
Can you please clarify one more thing about the Method 1.
What I understood from the Method 1 is that in the free_device function in mixture.cu, If i comment the first line, it will be sufficient and I have nothing else to do as I am already clearing the host memory in destroy_Mixture() that i copied to dh_eg.energy
So, the only change I need to do for Method 1 is as below.

void Mixture::free_device(Mixture& dh_xs, Mixture*& d_xs)
		{
			//(dh_xs.eg)->free_device();
			cudaFree(dh_xs.eg);
			cudaFree(dh_xs.N);
			cudaFree(d_xs);
		}

The line cudaFree(dh_xs.eg); will be sufficient for clearing the object memory of the Compound object?
Is my understanding for the Method 1 correct?

Robert_Crovella · February 12, 2024, 7:57am

No, not correct.

This is no different than if you allocate a top level object with malloc() and then allocate an embedded pointer with malloc(). Calling free() on the top level object will not free the memory allocated for the embedded pointer. cudaMalloc is the same in this respect.

Method 1 refers to the idea that when you created the allocation for an embedded pointer, you used a cudaMalloc call on a pointer variable stored in host memory (and then copied that pointer from host memory to the device object). So when you use cudaMalloc on the pointer stored in host memory, save that value in a host variable, and then use cudaFree on it, when you are ready.

Ismkhan · February 12, 2024, 8:12am

Thank you, again. I got the point.

Ismkhan · February 16, 2024, 1:56am

In succession to the problem I had, I modified the function as below.
In main.cu, I am calling the free_device( ) as below.

dh_xs.free_device();
	cudaFree(d_xs);

The function free_device( ) in the file mixture.cu is now modified as below.

void Mixture::free_device()
{
	eg->free_device();
	cudaFree(eg);
	cudaFree(N);
}

I did not get an error. Just want to confirm that this method is logically correct for freeing the memory?

system · March 1, 2024, 1:56am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
How to cudaMalloc two-dimensional array ? CUDA Programming and Performance	46	66110	September 7, 2023
How to initialize a Class on the device CUDA Programming and Performance	6	2887	February 15, 2023
How can I use __device__ function pointer in CUDA ? CUDA Programming and Performance	34	59900	June 3, 2020
Constant memory when having more than one file external does not work CUDA Programming and Performance	24	3191	August 27, 2010
2d array testing in very simple code using CUDA CUDA Programming and Performance	29	30400	November 15, 2010
Kernel lunch overhead increases significantly (10x) when using unified memory on TK1 and TX1 Jetson TK1	5	3259	August 31, 2018
Is it possible to process multidimensional arrays inside the kernel? CUDA Programming and Performance	13	9032	March 31, 2015
cudaMalloc and Structs and Pointers problem CUDA Programming and Performance	7	7079	January 17, 2014
Another Device Memory Question CUDA Programming and Performance	7	2305	February 9, 2010
How to improve the performance of using CUDA IPC shared memory? CUDA Programming and Performance cuda	5	117	October 23, 2024

C++ class functions for management of data transfer

Related topics