I wasn’t able to locate one on the web. But this is a fairly simple modification of the simpleCUFFT_callback sample code. It does size 50x50 2D forward transform, followed by a 2D inverse transform with a load callback that does the necessary scaling to make the output match the input.
It seems to work correctly for me:
$ cat t354.cu
/*
* Example showing the use of CUFFT for 2D-FFT with callback.
* This sample is derived from simpleCUFFT_callback, and uses a callback
* function to perform the pointwise scale, on input to the
* inverse transform.
*
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexScale(Complex, float);
// This is the callback routine prototype
static __device__ cufftComplex ComplexPointwiseScale(void * a, size_t index, void * cb_info, void *sharedmem);
typedef struct _cb_params{
float scale;
} cb_params;
// This is the callback routine. It does complex pointwise scaling.
static __device__ cufftComplex ComplexPointwiseScale(void *a, size_t index, void *cb_info, void *sharedmem)
{
cb_params * my_params = (cb_params *)cb_info;
return (cufftComplex)ComplexScale(((Complex *)a)[index], my_params->scale);
}
// Define the device pointer to the callback routine. The host code will fetch this and pass it to CUFFT
__device__ cufftCallbackLoadC myOwnCallbackPtr = ComplexPointwiseScale;
#define SIGNAL_SIZE 50
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUFFT callbacks
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[CUFFT_2D_callback] is starting...\n");
// Allocate host memory for the signal
int nD_size = SIGNAL_SIZE * SIGNAL_SIZE;
int mem_size = sizeof(Complex) * nD_size;
Complex *h_signal = (Complex *)malloc(mem_size);
// Initialize the memory for the signal
for (unsigned int i = 0; i < nD_size; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
// Allocate device memory for signal
Complex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, mem_size,
cudaMemcpyHostToDevice));
// Create one CUFFT plan for the forward transforms, and one for the reverse transform
// with load callback.
cufftHandle plan, cb_plan;
size_t work_size;
size_t cb_work_size;
checkCudaErrors(cufftCreate(&plan));
checkCudaErrors(cufftCreate(&cb_plan));
checkCudaErrors(cufftMakePlan2d(plan, SIGNAL_SIZE, SIGNAL_SIZE, CUFFT_C2C, &work_size));
checkCudaErrors(cufftMakePlan2d(cb_plan, SIGNAL_SIZE, SIGNAL_SIZE, CUFFT_C2C, &cb_work_size));
// Define a structure used to pass in the scale factor
cb_params h_params;
h_params.scale = 1.0f / nD_size;
// Allocate device memory for parameters
cb_params *d_params;
checkCudaErrors(cudaMalloc((void **)&d_params, sizeof(cb_params)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_params, &h_params, sizeof(cb_params),
cudaMemcpyHostToDevice));
// The host needs to get a copy of the device pointer to the callback
cufftCallbackLoadC hostCopyOfCallbackPtr;
checkCudaErrors(cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr,
myOwnCallbackPtr,
sizeof(hostCopyOfCallbackPtr)));
// Now associate the load callback with the plan.
cufftResult status = cufftXtSetCallback(cb_plan,
(void **)&hostCopyOfCallbackPtr,
CUFFT_CB_LD_COMPLEX,
(void **)&d_params);
if (status == CUFFT_LICENSE_ERROR)
{
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
return EXIT_WAIVED;
}
checkCudaErrors(cufftXtSetCallback(cb_plan,
(void **)&hostCopyOfCallbackPtr,
CUFFT_CB_LD_COMPLEX,
(void **)&d_params));
// Transform signal
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(cb_plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
Complex *h_signal_res = (Complex *)malloc(mem_size);
checkCudaErrors(cudaMemcpy(h_signal_res, d_signal, mem_size,
cudaMemcpyDeviceToHost));
bool bTestResult = sdkCompareL2fe((float *)h_signal_res, (float *)h_signal, nD_size, 1e-5f);
if (bTestResult) printf("results match!\n");
else printf("mismatch!\n");
checkCudaErrors(cufftDestroy(plan));
checkCudaErrors(cufftDestroy(cb_plan));
// cleanup memory
free(h_signal);
free(h_signal_res);
checkCudaErrors(cudaFree(d_signal));
checkCudaErrors(cudaFree(d_params));
return bTestResult ? EXIT_SUCCESS : EXIT_FAILURE;
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex scale
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
$ nvcc -I/usr/local/cuda/samples/common/inc -arch=sm_61 -o t354 t354.cu -lcufft_static -lculibos -rdc=true
$ ./t354
[CUFFT_2D_callback] is starting...
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
results match!
$
CUDA 8.0.61, Ubuntu 14.04, Pascal Titan X