Thank you all for your help, @striker159, @Robert_Crovella and @njuffa.
Let me try to demonstrate it using a simple case.
Assume we have the following class A, which represents the main data type, together with a helper function that creates a plan for batched 1D FFTs and a function whose only job is to execute that plan on the object's device data.
#include <iostream>
#include "cufft.h"
#include "thrust/complex.h"
cufftHandle plan_batched_1d_ffts(int Nbatch, int Nx) {
    int dimN[1] = {Nx};     // signal's size (number of spatial points)
    int inembed[1] = {Nx};  // storage is same as dimN - no-padding!
    int onembed[1] = {Nx};
    int inputStride = 1;    // dist. between successive input elements
    int outputStride = inputStride;
    int inputDist = Nx;     // dist. between 1st elem. in successive input signals
    int outputDist = inputDist;
    cufftHandle plan;
    cufftPlanMany(&plan, 1, dimN, inembed, inputStride, inputDist, onembed,
                  outputStride, outputDist, CUFFT_C2C, Nbatch);
    return plan;
}
/* in-place forward C2C FFT on the object's device data */
void fft_execute_forward(thrust::complex<float> *data, cufftHandle *handle) {
    cufftExecC2C(*handle, (cufftComplex *)data, (cufftComplex *)data,
                 CUFFT_FORWARD);
}
class A {
  public:
    cufftHandle _handle;
    int _Nbatch;
    int _Nx;
    thrust::complex<float> *_data;

    A() = default;
    A(int Nbatch, int Nx);
    A(const A &other);
    A &operator=(const A &other);
    ~A();
};
/* Constructor */
A::A(int Nbatch, int Nx) : _Nbatch(Nbatch), _Nx(Nx) {
    cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
    _handle = plan_batched_1d_ffts(_Nbatch, _Nx);
}
/* copy-Constructor */
A::A(const A &other) {
    _Nbatch = other._Nbatch;
    _Nx = other._Nx;
    cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
    cudaMemcpy(_data, other._data, sizeof(thrust::complex<float>) * _Nbatch * _Nx,
               cudaMemcpyDeviceToDevice);
    _handle = other._handle; // the handle value itself is copied
}
/* copy-assignment operator */
A &A::operator=(const A &other) {
    if (this != &other) {
        _Nbatch = other._Nbatch;
        _Nx = other._Nx;
        cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
        cudaMemcpy(_data, other._data,
                   sizeof(thrust::complex<float>) * _Nbatch * _Nx,
                   cudaMemcpyDeviceToDevice);
        _handle = other._handle; // the handle value itself is copied
    }
    return *this;
}
/* Destructor */
A::~A() {
    cudaFree(_data);
    cufftDestroy(_handle);
}
Let's see a first example where everything works fine:
int main() {
    const int Nbatch = 10;
    const int Nx = 5;
    A obj1(Nbatch, Nx); /* create obj1 using constructor */
    A obj2;
    obj2 = obj1;        /* obj2 is a copy of obj1 */
    /* perform a forward FFT on the data of obj2 */
    fft_execute_forward(obj2._data, &obj2._handle);
    return 0;
}
If I compile and run this application using nvprof, I obtain the following output:
(base) ahadji05@ahadji05:~/Downloads$ sudo nvprof ./a.out
==17355== NVPROF is profiling process 17355, command: ./a.out
==17355== Profiling application: ./a.out
==17355== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 63.98% 4.8320us 1 4.8320us 4.8320us 4.8320us void vector_fft<unsigned int=5, unsigned int=5, unsigned int=16, padding_t=0, twiddle_t=0, loadstore_modifier_t=2, layout_t=0, unsigned int, float>(kernel_arguments_t<unsigned int>)
36.02% 2.7200us 1 2.7200us 2.7200us 2.7200us [CUDA memcpy DtoD]
API calls: 55.22% 139.29ms 4 34.822ms 3.7240us 139.16ms cudaFree
44.37% 111.92ms 3 37.307ms 7.0520us 111.90ms cudaMalloc
0.20% 508.75us 2 254.38us 238.84us 269.91us cuDeviceTotalMem
0.11% 284.22us 191 1.4880us 103ns 65.726us cuDeviceGetAttribute
0.05% 137.33us 1 137.33us 137.33us 137.33us cudaGetDeviceProperties
0.01% 36.938us 2 18.469us 17.829us 19.109us cuDeviceGetName
0.01% 25.024us 1 25.024us 25.024us 25.024us cudaMemcpy
0.01% 14.022us 1 14.022us 14.022us 14.022us cudaLaunchKernel
0.00% 10.974us 1 10.974us 10.974us 10.974us cuDeviceGetPCIBusId
0.00% 4.7250us 7 675ns 257ns 1.5590us cudaGetDevice
0.00% 2.0420us 4 510ns 121ns 1.1190us cuDeviceGetCount
0.00% 1.5960us 1 1.5960us 1.5960us 1.5960us cuInit
0.00% 1.0160us 3 338ns 174ns 567ns cuDeviceGet
0.00% 427ns 2 213ns 208ns 219ns cuDeviceGetUuid
0.00% 391ns 1 391ns 391ns 391ns cuDriverGetVersion
0.00% 351ns 1 351ns 351ns 351ns cudaGetErrorString
Note the kernel void vector_fft, which appears once in the GPU activities section.
Now let's see the second example:
int main() {
    const int Nbatch = 10;
    const int Nx = 5;
    A obj2;
    {
        A obj1(Nbatch, Nx);
        obj2 = obj1;
    } /* here obj1 goes out of scope */
    fft_execute_forward(obj2._data, &obj2._handle);
    return 0;
}
Here, obj1 goes out of scope before the FFT is performed on the data of obj2.
The output of nvprof for this program is:
(base) ahadji05@ahadji05:~/Downloads$ sudo nvprof ./a.out
==18394== NVPROF is profiling process 18394, command: ./a.out
==18394== Profiling application: ./a.out
==18394== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 3.1040us 1 3.1040us 3.1040us 3.1040us [CUDA memcpy DtoD]
API calls: 56.06% 140.70ms 4 35.176ms 4.1090us 140.58ms cudaFree
43.55% 109.30ms 3 36.433ms 4.0100us 109.28ms cudaMalloc
0.19% 476.65us 2 238.32us 234.50us 242.15us cuDeviceTotalMem
0.10% 244.44us 191 1.2790us 103ns 57.878us cuDeviceGetAttribute
0.06% 160.48us 1 160.48us 160.48us 160.48us cudaGetDeviceProperties
0.01% 35.673us 2 17.836us 17.332us 18.341us cuDeviceGetName
0.01% 23.878us 1 23.878us 23.878us 23.878us cudaMemcpy
0.00% 10.829us 1 10.829us 10.829us 10.829us cuDeviceGetPCIBusId
0.00% 4.4690us 7 638ns 260ns 1.5210us cudaGetDevice
0.00% 2.2000us 4 550ns 115ns 1.0350us cuDeviceGetCount
0.00% 1.4250us 1 1.4250us 1.4250us 1.4250us cuInit
0.00% 1.1820us 3 394ns 198ns 748ns cuDeviceGet
0.00% 399ns 2 199ns 189ns 210ns cuDeviceGetUuid
0.00% 361ns 1 361ns 361ns 361ns cuDriverGetVersion
This time the vector_fft kernel does not appear in the GPU activities at all: the call fft_execute_forward(obj2._data, &obj2._handle); fails silently. The _data of obj2 is still accessible (I checked this by copying it back to the host with a cudaMemcpy that is not shown here)! Thus, I strongly believe that the silent failure of the FFT happens because the _handle of obj2 was invalidated when obj1 went out of scope: the destructor of obj1 called cufftDestroy on the very handle that obj2 still holds.
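By the way, the failure can be made visible by checking the return code. Below is a minimal sketch (not what I actually ran) of fft_execute_forward with the cufftResult checked; I would expect it to report an error, presumably CUFFT_INVALID_PLAN, in the second example:

/* sketch only: same call as above, but the cufftResult is not ignored */
void fft_execute_forward_checked(thrust::complex<float> *data, cufftHandle *handle) {
    cufftResult status = cufftExecC2C(*handle, (cufftComplex *)data,
                                      (cufftComplex *)data, CUFFT_FORWARD);
    if (status != CUFFT_SUCCESS) {
        std::cerr << "cufftExecC2C failed, cufftResult = " << status << std::endl;
    }
}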
@njuffa This is what I would expect the copy operation to do: produce a separate cufftHandle with its own copy of the data that describe the plan, in this case batched 1D FFTs.
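In other words, something like the following copy-constructor (and similarly for the assignment operator) is what I have in mind. This is only a sketch of the workaround I can apply on my side: it does not copy the existing plan's internal data, it simply re-creates an equivalent plan for the new object, using the same plan_batched_1d_ffts helper as the constructor:

/* sketch: copy-constructor that builds its own plan instead of sharing the handle */
A::A(const A &other) {
    _Nbatch = other._Nbatch;
    _Nx = other._Nx;
    cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
    cudaMemcpy(_data, other._data, sizeof(thrust::complex<float>) * _Nbatch * _Nx,
               cudaMemcpyDeviceToDevice);
    _handle = plan_batched_1d_ffts(_Nbatch, _Nx); // new, independent plan for this copy
}

With this version, destroying obj1 no longer invalidates the plan that obj2 uses.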
@Robert_Crovella the cufftHandle itself is copied to a different memory location, but the plan descriptor for the FFTs, I guess, is not, right?
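For what it's worth, my current understanding (please correct me if this is wrong) is that a cufftHandle is just an opaque identifier for plan state that lives inside the cuFFT library, so assigning it around behaves roughly like this:

cufftHandle h1;
cufftPlan1d(&h1, Nx, CUFFT_C2C, Nbatch); // plan state is created inside cuFFT, referenced by h1
cufftHandle h2 = h1;                     // only the identifier is copied, NOT the plan state
cufftDestroy(h1);                        // the shared plan state is released...
/* ...so executing through h2 now refers to a destroyed plan, as in my second example */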
Again, thank you for your help, and I hope my example is clear! Please let me know if I misunderstood or misinterpreted anything.