Hi,
I am working on porting a large CPU code to GPU using CUDA C++. The code extensively uses OOP and we want to retain the paradigm. The structure of the code is such that there is a top level class which has object of other classes as data members. These 2nd-level classes then use objects of other classes as data members. The hierarchy runs 6-7 steps. Now, to transfer the object of the top level class, i have to move down towards the basic class which does not use any other class’ object as data member. For easy data management, i want to put the declarations of the device objects within the class functions. But I am unsure how will I free those objects using cudaFree. Most of the examples I have seen use public data members in the classes and perform data management in main.
I have produced a small example for demonstration. Although, here I have kept some of class data members as public, but in real code, I have accessor functions.
Top level class definition (mixture.cuh)
#ifndef MIXTURE_H
#define MIXTURE_H
#include <cstdint>
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
namespace chem
{
class Mixture
{
private:
int* N{nullptr};
public:
Compound* eg{nullptr};
int SIZE{0};
Mixture() = default;
Mixture(int sz, int* arr);
void destroy_Mixture();
void transfer_to_device(Mixture& dh_xs, Mixture*& d_xs);
void free_device(Mixture& dh_xs, Mixture*& d_xs);
};
} // End of namespace
#endif // MIXTURE_H Closing
Top level class Implementation (mixture.cu)
#include "compound.cuh"
#include "mixture.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
namespace chem
{
Mixture::Mixture(int sz, int* arr)
{
SIZE = sz;
eg = new Compound(SIZE);
N = new int [SIZE];
for(int i = 0; i < SIZE; i++)
N[i] = arr[i];
}
void Mixture::destroy_Mixture()
{
if(SIZE > 0)
{
delete [] N;
eg->destroy_Compound();
}
}
void Mixture::transfer_to_device(Mixture& dh_xs, Mixture*& d_xs)
{
Compound dh_eg;
cudaMalloc(&d_xs, sizeof(Mixture));
dh_xs.SIZE = SIZE;
cudaMalloc(&dh_xs.N, SIZE*sizeof(int));
cudaMalloc(&dh_xs.eg, sizeof(Compound));
cudaMemcpy(dh_xs.N, N, SIZE*sizeof(int), cudaMemcpyHostToDevice);
eg->transfer_to_device(dh_eg);
cudaMemcpy((dh_xs.eg), &dh_eg, sizeof(Compound), cudaMemcpyHostToDevice);
cudaMemcpy(d_xs, &dh_xs, sizeof(Mixture), cudaMemcpyHostToDevice);
}
void Mixture::free_device(Mixture& dh_xs, Mixture*& d_xs)
{
(dh_xs.eg)->free_device();
cudaFree(dh_xs.eg);
cudaFree(dh_xs.N);
cudaFree(d_xs);
}
} // End of namespace
==================================
Base level class Definition (compound.cuh)
#ifndef COMPOUND_H
#define COMPOUND_H
#include <cstdint>
#include <cuda.h>
#include <cuda_runtime.h>
namespace chem
{
class Compound
{
private:
int N_POINTS{0};
double urr_start_energy{0.0};
double* energy{nullptr};
public:
Compound() = default;
void set_Compound(int size);
Compound(int size);
void print_Compound();
void destroy_Compound();
void transfer_to_device(Compound& dh_eg);
void free_device();
__host__ __device__ double get_energy(int i);
};
} // End of namespace GNDL
#endif // COMPOUND_H Closing
Base level class Implementation (compound.cu)
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
namespace chem
{
Compound::Compound(int size)
{ set_Compound(size); }
void Compound::set_Compound(int size)
{
energy = new double[size];
N_POINTS = size;
for(int i = 0; i < N_POINTS; i++)
energy[i] = double(size*i);
}
void Compound::print_Compound()
{
for(int i = 0; i < N_POINTS; i++)
std::cout << energy[i] << " ";
std::cout << std::endl;
}
void Compound::destroy_Compound()
{
if(N_POINTS > 0)
delete [] energy;
}
void Compound::transfer_to_device(Compound& dh_eg)
{
dh_eg.N_POINTS = N_POINTS;
dh_eg.urr_start_energy = urr_start_energy;
cudaMalloc(&dh_eg.energy, N_POINTS*sizeof(double));
cudaMemcpy(dh_eg.energy, energy, N_POINTS*sizeof(double), cudaMemcpyHostToDevice);
}
__host__ __device__ double Compound::get_energy(int i)
{ return energy[i]; }
void Compound::free_device()
{ cudaFree(energy); }
// Above line creates Seg fault
} // End of namespace
=========
main
#include "mixture.cuh"
#include "compound.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace chem;
__global__ void calculate_XS(int* i, int* j, Mixture* XSS, double* result)
{
int idx=threadIdx.x + blockDim.x*blockIdx.x;
if(idx >= XSS->SIZE)
return;
if(i[idx] >= XSS->SIZE)
result[idx] = -444;
else
{
double val = 0.0;
val = (XSS->eg)->get_energy(j[idx]);
result[idx] = val;
}
}
int main()
{
int SZ = 10;
int kk;
int a[SZ];
int b[SZ];
double result[SZ];
int arr[SZ] = {3, 4, 5, 7, 3, 6, 8, 9, 3, 2};
int* d_a;
int* d_b;
double* d_result;
Mixture dh_xs;
Mixture* d_xs;
Mixture XS(SZ, arr);
XS.transfer_to_device(dh_xs, d_xs);
for(kk = 0; kk < SZ; kk++)
{
a[kk] = kk;
b[kk] = arr[kk]-1;
}
// trasnfer a and b
cudaMalloc( &d_a ,SZ*sizeof(int));
cudaMalloc( &d_b ,SZ*sizeof(int));
cudaMemcpy(d_a, a, SZ*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SZ*sizeof(int), cudaMemcpyHostToDevice);
// transfer result
cudaMalloc( &d_result ,SZ*sizeof(double));
cudaMemcpy(d_result, result, SZ*sizeof(double), cudaMemcpyHostToDevice);
calculate_XS<<<1, 32>>>(d_a, d_b, d_xs, d_result);
cudaDeviceSynchronize();
cudaMemcpy(result, d_result, SZ*sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "Printing in CPU after returning\n";
for(kk = 0; kk < SZ; kk++)
std::cout << result[kk] << std::endl;
XS.free_device(dh_xs, d_xs);
cudaFree(d_result);
cudaFree(d_a);
cudaFree(d_b);
std::cout << "Successful termination\n";
return 0;
}
I perform compilation using following
nvcc -o test compound.cu mixture.cu main.cu -arch=sm_86 -lcudart -rdc=true
When I run the above program, the result is correct but i get a segmentation fault while freeing the memory.
Any help would be appreciated.
Thank you!