cuFFT callback functions not working

Hi,

I am trying to use cuFFT callback functions, but they do not seem to be invoked. I am running this code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_cuda.h>

// Minimal cuFFT store callback used to verify that callbacks are invoked.
//
// Instead of storing the value cuFFT computed (v), it overwrites the output
// element at `index` with the constant (0.3, 0.3). The v, cInfo and shared
// arguments are intentionally unused.
static __device__ void our_store(void *data, size_t index, cufftDoubleComplex v, void *cInfo, void *shared) {
    cufftDoubleComplex *slot = (cufftDoubleComplex *)data + index;
    slot->x = 0.3;
    slot->y = 0.3;
}

// Device-side variable holding the address of our_store. main() copies its
// value to the host with cudaMemcpyFromSymbol and passes that copy to
// cufftXtSetCallback.
__device__ cufftCallbackStoreZ callbackPtr = our_store;

// Test driver: runs one forward, out-of-place, 1-D double-complex FFT of
// length N with our_store attached as the store callback, then prints the
// output. N is not defined in the visible code; it is supplied as a macro on
// the compiler command line (-DN=4 in the build rule shown later).
// NOTE(review): the end of main() (cleanup, return value, closing brace) is
// not part of this excerpt; nothing visible here frees the buffers or
// destroys the plan.
int main() {
        
    // Report the transform length and the cuFFT library version.
    // NOTE(review): the return status of cufftGetVersion is not checked.
    int version;
    cufftGetVersion(&version);
    printf("DFT %d using cuFFT v%d\n", N, version);

    // Copy the function pointer stored in the device variable callbackPtr
    // into a host variable so it can be handed to cufftXtSetCallback below.
    cufftCallbackStoreZ hostCopyOfStore;
    checkCudaErrors( cudaMemcpyFromSymbol(&hostCopyOfStore, callbackPtr, sizeof(hostCopyOfStore)) );

    // Device-side input and output buffers, N elements each.
    cufftDoubleComplex *data_in, *data_out;

    checkCudaErrors( cudaMalloc((void**)&data_in, sizeof(cufftDoubleComplex)*N) );
    checkCudaErrors( cudaMalloc((void**)&data_out, sizeof(cufftDoubleComplex)*N) );

    // Host-side staging buffers, allocated with cudaMallocHost (pinned memory).
    cufftDoubleComplex *in, *out;

    checkCudaErrors( cudaMallocHost((void**)&in, sizeof(cufftDoubleComplex)*N) );
    checkCudaErrors( cudaMallocHost((void**)&out, sizeof(cufftDoubleComplex)*N) );

    // Single transform dimension of length N.
    int dims[1] = {N};
        
    // Input signal: real part is the ramp 0, 1, ..., N-1; imaginary part is 0.
    for (int i = 0; i < N; i++) {
        //printf("%d\n", i);
        in[i].x = i;
        in[i].y = 0;
    }   

    checkCudaErrors( cudaMemcpy(data_in, in, N*sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice) );

    cufftHandle plan;

    // NOTE(review): the cufftResult returned by cufftCreate is not checked.
    cufftCreate(&plan);

    // Filled in by cufftMakePlanMany; not read again in the visible code.
    size_t workSize;

    // Plan: rank 1, sizes from dims, NULL inembed/onembed, stride 1 and
    // distance N for both input and output, Z2Z type, batch of 1.
    //if (cufftPlanMany(&plan, 1, dims, NULL, 1, N, NULL, 1, N, CUFFT_Z2Z, 1) != CUFFT_SUCCESS) return -1;
    if (cufftMakePlanMany(plan, 1, dims, NULL, 1, N, NULL, 1, N, CUFFT_Z2Z, 1, &workSize) != CUFFT_SUCCESS) return -1; 

    // Attach our_store as the double-complex store callback; the final NULL
    // is the caller-info pointer (our_store does not use it).
    // NOTE(review): cufftXtSetCallback returns a cufftResult, yet it is routed
    // through checkCudaErrors here while the other cuFFT calls compare
    // against CUFFT_SUCCESS directly - confirm this reports failures as
    // intended.
    // NOTE(review): cuFFT callbacks require the program to be built with
    // relocatable device code (-rdc=true).
    checkCudaErrors( cufftXtSetCallback(plan, (void**)&hostCopyOfStore, CUFFT_CB_ST_COMPLEX_DOUBLE, NULL) );
        
    // Forward transform from data_in into the separate buffer data_out.
    if (cufftExecZ2Z(plan, data_in, data_out, CUFFT_FORWARD) != CUFFT_SUCCESS) return -1; 
        
    // Wait for the GPU work to finish, then copy the result back to the host.
    checkCudaErrors( cudaDeviceSynchronize() );
    checkCudaErrors( cudaMemcpy(out, data_out, N*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost) );

    // Print each output element as "(real imag)".
    for (int i = 0; i < N; i++) printf("(%f %f)\n", out[i].x, out[i].y);

and my output is

DFT 4 using cuFFT v10000
(6.000000 0.000000)
(-2.000000 2.000000)
(-2.000000 0.000000)
(-2.000000 -2.000000)

but I expected every output element to be (0.3, 0.3), because the store callback overwrites each element with that value.

The Makefile I am using is

# Preprocessor definition for the transform length.
# NOTE(review): DEFINES is never referenced below; the recipe hard-codes
# -DN=4 instead.
DEFINES       = -DN=4
# 64-bit build plus the CUDA 10.0 samples' common include directory
# (presumably where helper_cuda.h is found).
CFLAGS        = -m64 -I/usr/local/cuda-10.0/samples/common/inc
# Link against the static cuFFT library and culibos.
LIBS          = -lcufft_static -lculibos 


all: test

# Build the test binary from test.cu.
# NOTE(review): no relocatable-device-code flag (-rdc=true) is passed to nvcc.
test: test.cu
    nvcc -o $@ $(CFLAGS) -DN=4 $^ $(LIBS)

I am using nvcc V10.0.130 on Ubuntu 16.04, and the GPU is a Titan V. Any help as to why I am running into this issue would be greatly appreciated.

Callbacks require relocatable device code. I don’t see that anywhere in your Makefile.

add:

-rdc=true

somewhere, e.g.:

test: test.cu
    nvcc -rdc=true -o $@ $(CFLAGS) -DN=4 $^ $(LIBS)

By the way, the CUDA samples include cuFFT callback examples. If you study their Makefiles, you should get an idea of what is needed.