What is the correct way to copy a cufftHandle?

I am developing a CUDA application, where some of the objects that I use in my simulation perform multiple FFT operations on their member data.

In order to avoid creating and destroying my FFT-plans over and over again (imposing a cudaMalloc/cudaFree every time), I added a member variable of type cufftHandle in the necessary classes, and now I can directly call fft_forward and fft_inverse on the member data whenever necessary!

However, I face a bug because I do not handle properly the copy semantics of this member variable “handle” when I need to copy the entire object. What is the correct way to copy a cufftHandle type?

Let me provide an example from another library that I use as backend for my host FFTs, the Intel MKL library. In that case, the library provides a function called DftiCopyDescriptor, which allows me to handle properly the copy of their equivalent handle type.

Is there an equivalent routine in cuFFT?

Thank you in advance,

Andreas

Why do you want to create a deep copy of the handle instead of using only one handle?

There are times where my objects are passed or returned by value, and this is where the copy needs to be implemented properly!

At the moment my code is not safe when an object is initialized using the copy constructor or assigned using the copy assignment operator, because the handle is not copied correctly.

I do not think there is a function to deep copy a handle. At least I could not find one in the documentation. (I have not used cuFFT before.)

But let me rephrase my question.

Is there a point in the program when there are multiple copies of your object and each object needs its own cufftHandle which it can modify individually?

If yes, you need to create a new handle for each copy. If you also need the same configurations, you have to somehow replay the previous API calls with the new handle.

However, if all you require is to preserve the handle inside the copy when the original object is destroyed, you could for example wrap the handle in a std::shared_ptr with a custom deleter.

I’m fairly certain a cufftHandle can be copied like:

cufftHandle a,b;
cufftCreate(&a);
b = a;

A handle is by its nature opaque: basically an abstract name a programmer uses to identify a particular data object when talking to an API.

As the name says, DftiCopyDescriptor does not copy a handle. It makes a copy of a descriptor which is identified by a source handle and returns a destination handle that identifies the copy. It is not clear to me what the use case for this would be. Dynamic modification of FFT parameters, e.g. by creating a “parent plan” and then changing just a few properties of copies of it to create various “child plans”?

What does your use case look like?

Thank you all for your help @striker159 , @Robert_Crovella and @njuffa

Let me try to demonstrate it using a simple case.

Assume we have the following class A, which represents the main data-type and some basic functions for creating a plan for batched 1D FFTs and a function that all it does is to execute the plan using the object’s device-data.

#include <iostream>

#include "cufft.h"
#include "thrust/complex.h"

/* Create a cuFFT plan for Nbatch 1D complex-to-complex FFTs of length Nx,
 * packed contiguously (unit stride, no padding between batch elements).
 * Returns the new plan handle; the caller is responsible for cufftDestroy.
 * NOTE(review): the cufftPlanMany status is discarded, so a failed plan
 * creation goes undetected here. */
cufftHandle plan_batched_1d_ffts(int Nbatch, int Nx) {
  int dimN[1] = {Nx};  // signal's size (number of spatial points)

  int inembed[1] = {Nx};  // storage is same as dimN - no-padding!
  int onembed[1] = {Nx};

  int inputStride = 1;  // dist. between successive input elements
  int outputStride = inputStride;

  int inputDist = Nx;  // dist. between 1st elem. in successive input signals
  int outputDist = inputDist;

  cufftHandle plan;

  // Return value (cufftResult) is ignored.
  cufftPlanMany(&plan, 1, dimN, inembed, inputStride, inputDist, onembed,
                outputStride, outputDist, CUFFT_C2C, Nbatch);

  return plan;
}

/* Execute an in-place forward C2C FFT on device data with the given plan.
 * NOTE(review): the cufftExecC2C status is discarded — a failure such as
 * CUFFT_INVALID_PLAN (after the plan behind the handle was destroyed) is
 * completely silent, as demonstrated later in this thread. */
void fft_execute_forward(thrust::complex<float> *data, cufftHandle *handle) {
  cufftExecC2C(*handle, (cufftComplex *)data, (cufftComplex *)data,
               CUFFT_FORWARD);
}

/* Example type: owns a device buffer of Nbatch x Nx complex samples plus the
 * cuFFT plan used to transform it. This version has BROKEN copy semantics:
 * copies share the plan named by _handle (see copy ctor/assignment below). */
class A {
 public:
  cufftHandle _handle;  // cuFFT plan handle (an opaque name for the plan)
  int _Nbatch;          // number of batched 1D signals
  int _Nx;              // samples per signal
  thrust::complex<float> *_data;  // device pointer (cudaMalloc'd)

  A() = default;  // NOTE(review): leaves every member uninitialized
  A(int Nbatch, int Nx);
  A(const A &other);
  A &operator=(const A &other);
  ~A();
};

/* Constructor */
/* Allocates the device buffer and builds this object's own plan.
 * NOTE(review): neither the cudaMalloc status nor the plan creation is
 * checked for errors. */
A::A(int Nbatch, int Nx) : _Nbatch(Nbatch), _Nx(Nx) {
  cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
  _handle = plan_batched_1d_ffts(_Nbatch, _Nx);
}

/* copy-Constructor */
/* BUG: `_handle = other._handle` copies only the integer handle, not the
 * plan it names. Both objects then refer to the SAME plan, and whichever
 * destructor runs first (calling cufftDestroy) invalidates the other
 * object's handle. */
A::A(const A &other) {
  _Nbatch = other._Nbatch;
  _Nx = other._Nx;

  // The device data IS deep-copied: fresh allocation + DtoD memcpy.
  cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
  cudaMemcpy(_data, other._data, sizeof(thrust::complex<float>) * _Nbatch * _Nx,
             cudaMemcpyDeviceToDevice);

  _handle = other._handle;  // shallow copy of the plan handle (the bug)
}

/* copy-assignment operator */
/* BUG 1: any previous _data allocation is overwritten by a new cudaMalloc
 * without a cudaFree — a device-memory leak.
 * BUG 2: the previous _handle is overwritten without cufftDestroy, and the
 * new value is a shallow copy shared with `other` (same underlying plan). */
A &A::operator=(const A &other) {
  if (this != &other) {
    _Nbatch = other._Nbatch;
    _Nx = other._Nx;
    cudaMalloc((void **)&_data, sizeof(thrust::complex<float>) * _Nbatch * _Nx);
    cudaMemcpy(_data, other._data,
               sizeof(thrust::complex<float>) * _Nbatch * _Nx,
               cudaMemcpyDeviceToDevice);
    _handle = other._handle;  // shallow handle copy (the bug)
  }
  return *this;
}

/* Destructor */
/* Frees the device buffer and destroys the plan named by _handle. Because
 * copies share _handle, destroying one object leaves every other copy with
 * an invalid plan — using it afterwards is undefined behavior (this is what
 * example 2 below demonstrates). */
A::~A() {
  cudaFree(_data);
  cufftDestroy(_handle);
}

Let's see a first example where everything works fine:

/* Example 1: obj1 (whose constructor created the plan) is still alive when
 * the copy obj2 executes the FFT, so the shared handle is still valid and
 * the FFT kernel launches (see the vector_fft entry in the nvprof output). */
int main() {
  const int Nbatch = 10;
  const int Nx = 5;

  A obj1(Nbatch, Nx); /* create obj1 using constructor */
  A obj2;
  obj2 = obj1; /*  obj2 is a copy of obj1 */

  /* perform a forward FFT on the data of obj2 */
  fft_execute_forward(obj2._data, &obj2._handle);

  return 0;
}

If I compile and run this application using nvprof I obtain the following output:

(base) ahadji05@ahadji05:~/Downloads$ sudo nvprof ./a.out 
==17355== NVPROF is profiling process 17355, command: ./a.out
==17355== Profiling application: ./a.out
==17355== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   63.98%  4.8320us         1  4.8320us  4.8320us  4.8320us  void vector_fft<unsigned int=5, unsigned int=5, unsigned int=16, padding_t=0, twiddle_t=0, loadstore_modifier_t=2, layout_t=0, unsigned int, float>(kernel_arguments_t<unsigned int>)
                   36.02%  2.7200us         1  2.7200us  2.7200us  2.7200us  [CUDA memcpy DtoD]
      API calls:   55.22%  139.29ms         4  34.822ms  3.7240us  139.16ms  cudaFree
                   44.37%  111.92ms         3  37.307ms  7.0520us  111.90ms  cudaMalloc
                    0.20%  508.75us         2  254.38us  238.84us  269.91us  cuDeviceTotalMem
                    0.11%  284.22us       191  1.4880us     103ns  65.726us  cuDeviceGetAttribute
                    0.05%  137.33us         1  137.33us  137.33us  137.33us  cudaGetDeviceProperties
                    0.01%  36.938us         2  18.469us  17.829us  19.109us  cuDeviceGetName
                    0.01%  25.024us         1  25.024us  25.024us  25.024us  cudaMemcpy
                    0.01%  14.022us         1  14.022us  14.022us  14.022us  cudaLaunchKernel
                    0.00%  10.974us         1  10.974us  10.974us  10.974us  cuDeviceGetPCIBusId
                    0.00%  4.7250us         7     675ns     257ns  1.5590us  cudaGetDevice
                    0.00%  2.0420us         4     510ns     121ns  1.1190us  cuDeviceGetCount
                    0.00%  1.5960us         1  1.5960us  1.5960us  1.5960us  cuInit
                    0.00%  1.0160us         3     338ns     174ns     567ns  cuDeviceGet
                    0.00%     427ns         2     213ns     208ns     219ns  cuDeviceGetUuid
                    0.00%     391ns         1     391ns     391ns     391ns  cuDriverGetVersion
                    0.00%     351ns         1     351ns     351ns     351ns  cudaGetErrorString

Note the kernel void vector_fft which appears once in the GPU activities section.

Now let's see the second example:

/* Example 2: obj1's destructor runs (cufftDestroy on the shared handle)
 * BEFORE obj2 uses the plan, so the cufftExecC2C call fails and no FFT
 * kernel appears in the profile. The device data itself remains valid
 * because obj2 owns its own allocation. */
int main() {
  const int Nbatch = 10;
  const int Nx = 5;

  A obj2;

  {
    A obj1(Nbatch, Nx);
    obj2 = obj1;
  } /* here obj1 goes out of scope */

  fft_execute_forward(obj2._data, &obj2._handle);

  return 0;
}

here, obj1 goes out of scope before fft is performed on the data of object 2.
The output of nvprof for this program is:

(base) ahadji05@ahadji05:~/Downloads$ sudo nvprof ./a.out 
==18394== NVPROF is profiling process 18394, command: ./a.out
==18394== Profiling application: ./a.out
==18394== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.1040us         1  3.1040us  3.1040us  3.1040us  [CUDA memcpy DtoD]
      API calls:   56.06%  140.70ms         4  35.176ms  4.1090us  140.58ms  cudaFree
                   43.55%  109.30ms         3  36.433ms  4.0100us  109.28ms  cudaMalloc
                    0.19%  476.65us         2  238.32us  234.50us  242.15us  cuDeviceTotalMem
                    0.10%  244.44us       191  1.2790us     103ns  57.878us  cuDeviceGetAttribute
                    0.06%  160.48us         1  160.48us  160.48us  160.48us  cudaGetDeviceProperties
                    0.01%  35.673us         2  17.836us  17.332us  18.341us  cuDeviceGetName
                    0.01%  23.878us         1  23.878us  23.878us  23.878us  cudaMemcpy
                    0.00%  10.829us         1  10.829us  10.829us  10.829us  cuDeviceGetPCIBusId
                    0.00%  4.4690us         7     638ns     260ns  1.5210us  cudaGetDevice
                    0.00%  2.2000us         4     550ns     115ns  1.0350us  cuDeviceGetCount
                    0.00%  1.4250us         1  1.4250us  1.4250us  1.4250us  cuInit
                    0.00%  1.1820us         3     394ns     198ns     748ns  cuDeviceGet
                    0.00%     399ns         2     199ns     189ns     210ns  cuDeviceGetUuid
                    0.00%     361ns         1     361ns     361ns     361ns  cuDriverGetVersion

We have a silent failure of the function call fft_execute_forward(obj2._data, &obj2._handle);

The _data of obj2 are still accessible on the host (I checked this with a cudaMemcpy that is not shown here)! Thus, I strongly believe that the silent failure of the FFT is because the _handle of obj2 was invalidated when obj1 went out of scope!

@njuffa This is what I expect from the copy operation to do, to make a separate copy of the cufftHandle with its own copy of the data that describe the plan, in this case, batched 1D ffts.

@Robert_Crovella the cufftHandle is copied to a different memory location, however the plan descriptor for the FFTs I guess it is not, right?

Again thank you for your help and I hope my example is clear!

please let me know if I misunderstood or misinterpreted anything.

one note for my last post:

If I comment-out the line cufftDestroy(_handle); from the destructor of class A the example 2 launches the FFT kernel as expected!

This is additional proof that the failure is due to the invalidated _handle of obj2 after obj1 went out of scope!

A cufftHandle is just an int. That is typical for handles, as they are usually just a straight (sometimes decorated) index into an internal table. In this case, a table of descriptors / plans. Copying a cufftHandle just copies the handle itself, i.e. an int. It does not copy the data object (descriptor / plan) identified by the handle.

The Intel library you mentioned seems to deal with its handles in a way that reminds me of the way Posix uses handles with file descriptors in open(), dup(), and fcntl() calls, where dup() represents the deep copy operation that you mentioned. The DftiCopyDescriptor() function of the Intel library offers the dup()-equivalent functionality.

As far as I can see, cuFFT does not offer a deep-copy function of the sort you envision. But I am just going by the documentation here and will leave specific advice to people more intimately familiar with CUFFT’s features. As for the call to cufftDestroy() in the class destructor, that is obviously you as the programmer telling CUFFT to destroy the plan identified by the handle passed to it. After which trying to use that handle further will result in undefined behavior.

FWIW, the interfaces of cuFFT were pretty closely modelled after the popular FFTW library for ease of porting work to the GPU.

1 Like

You should always perform error checking of cuda api calls / cufft api calls. In case 2, fft_execute_forward does not fail silently. With correct error checking you would see that the return value is CUFFT_INVALID_PLAN .

There are a few options to work around your issue, but it’s not an issue specific to cufft.

Option1: Delete copy constructor and copy assignment, use move constructor and move assignment instead.
Option2: Create a new plan when making a copy.

Option 2 could look like this:

#include <iostream>
#include <cassert>

#include "cufft.h"
#include "thrust/complex.h"

/* Build a cuFFT plan for Nbatch independent 1D C2C FFTs of length Nx, laid
 * out contiguously (unit stride, batches packed back to back, no padding).
 * The cufftPlanMany status is echoed to stderr and asserted; the caller owns
 * the returned handle and must release it with cufftDestroy. */
cufftHandle plan_batched_1d_ffts(int Nbatch, int Nx) {
  int fftSize[1] = {Nx};  // logical FFT length per batch element
  int embed[1] = {Nx};    // physical storage equals logical size (no padding)

  const int stride = 1;   // consecutive samples are adjacent in memory
  const int dist = Nx;    // first elements of successive signals are Nx apart

  cufftHandle plan;
  cufftResult status =
      cufftPlanMany(&plan, 1, fftSize, embed, stride, dist, embed, stride,
                    dist, CUFFT_C2C, Nbatch);
  std::cerr << "cufftPlanMany status = " << status << "\n";
  assert(status == CUFFT_SUCCESS);

  return plan;
}

/* Run the planned forward FFT in place on `data` (device memory).
 * The cufftExecC2C status is echoed to stderr and asserted, so an invalid
 * plan is reported instead of failing silently. */
void fft_execute_forward(thrust::complex<float> *data, cufftHandle *handle) {
  cufftComplex *buf = reinterpret_cast<cufftComplex *>(data);
  const cufftResult status = cufftExecC2C(*handle, buf, buf, CUFFT_FORWARD);
  std::cerr << "cufftExecC2C status = " << status << "\n";
  assert(status == CUFFT_SUCCESS);
}

/* Deep-copying version: every instance owns its device buffer AND its own
 * cuFFT plan; copying re-plans instead of sharing the integer handle, so
 * destroying one object never invalidates another. */
class A {
 public:
  cufftHandle _handle{};  // value-initialized to 0 until a plan is created
  int _Nbatch{};  // number of batched 1D signals
  int _Nx{};  // samples per signal
  thrust::complex<float> *_data{};  // device pointer, null until allocated

  A() = default;
  A(int Nbatch, int Nx);  // allocate buffer + create plan
  A(const A &other);  // deep copy: new buffer, new plan
  A &operator=(const A &other);  // deep copy via copy-and-swap
  ~A();  // frees buffer, destroys this object's own plan
};

/* Constructor: allocate Nbatch*Nx complex values on the device and build
 * this object's own cuFFT plan. The cudaMalloc result is asserted. */
A::A(int Nbatch, int Nx) : _Nbatch(Nbatch), _Nx(Nx) {
  const size_t bytes = sizeof(thrust::complex<float>) * _Nbatch * _Nx;
  cudaError_t rc = cudaMalloc(reinterpret_cast<void **>(&_data), bytes);
  assert(rc == cudaSuccess);
  _handle = plan_batched_1d_ffts(_Nbatch, _Nx);
}

/* Copy constructor: delegate to the sizing constructor so the copy receives
 * its OWN device buffer and its OWN cuFFT plan, then clone the device data. */
A::A(const A &other) : A(other._Nbatch, other._Nx) {
  const size_t bytes = sizeof(thrust::complex<float>) * _Nbatch * _Nx;
  cudaError_t rc =
      cudaMemcpy(_data, other._data, bytes, cudaMemcpyDeviceToDevice);
  assert(rc == cudaSuccess);
}

/* Copy assignment via copy-and-swap: construct a fully independent copy of
 * `other` (fresh buffer, fresh plan), then exchange resources with it. The
 * temporary's destructor releases this object's previous buffer and plan. */
A &A::operator=(const A &other) {
  if (this == &other) return *this;  // self-assignment: nothing to do

  A tmp(other);  // deep copy (copy constructor above)
  std::swap(_data, tmp._data);
  std::swap(_handle, tmp._handle);
  std::swap(_Nx, tmp._Nx);
  std::swap(_Nbatch, tmp._Nbatch);
  // tmp now holds *this's old resources and cleans them up on scope exit.
  return *this;
}

/* Destructor */
/* Releases this instance's own device buffer and its own plan — safe now
 * that copies never share a handle. NOTE(review): a default-constructed
 * object destroys handle 0; cufftDestroy may report an error status for it,
 * which is ignored here. */
A::~A() {
  cudaFree(_data);
  cufftDestroy(_handle);
}



#if 0

/* Example 1 (compiled out by the #if 0 above): copy while the source is
 * still alive. Works because obj2 owns its own plan after the deep-copy
 * assignment. */
int main() {
  const int Nbatch = 10;
  const int Nx = 5;

  A obj1(Nbatch, Nx); /* create obj1 using constructor */
  A obj2;
  obj2 = obj1; /*  obj2 is a copy of obj1 */

  /* perform a forward FFT on the data of obj2 */
  fft_execute_forward(obj2._data, &obj2._handle);

  return 0;
}

#else

/* Example 2: the source obj1 is destroyed before the copy obj2 runs its
 * FFT. With deep-copy semantics obj2's plan is independent of obj1's, so
 * the FFT now succeeds where the original code reported an invalid plan. */
int main() {
  const int Nbatch = 10;
  const int Nx = 5;

  A obj2;

  {
    A obj1(Nbatch, Nx);
    obj2 = obj1;
  } /* here obj1 goes out of scope */

  fft_execute_forward(obj2._data, &obj2._handle);

  return 0;
}

#endif

@striker159 Thank you for the suggestions!

Option 1 would probably cause problems in other parts of my code, so it might not be the right choice for my application; this example was a simple test case to reveal the problem!

However, I think your second suggestion is the right thing to do. Simple and safe!