Multi-GPU 3D R2C FFT: seg fault in cufftXtMemcpy

I am trying to do multi-GPU 3D R2C FFT but now seeing a segment fault when I added the last line (cuffXtMemcpy). The segment fault will go away if I commented out that line. Can anyone take a quick look? The code is very short (please jump to main() and ignore those headers and error-checking functions upfront).

    #include <cuda_runtime.h>
    #include <cufft.h>
    #include <cufftXt.h>
    #include <typeinfo>
    #include <string>
    #include <cstdlib>
    #include <cstdio>
    #include <cmath>
    #include <vector>
    #include <numeric>
    #include <cfloat>
    #include <algorithm>
    #include <fstream>
    #include <iostream>
    using std::vector;
    using std::string;

    //=== my abort and CUDA error-checking function
    inline void abort(string msg, const char *file, int line){
        fprintf(stderr, "Error in file '%s' at line '%i': %s\n",
                file, line, msg.c_str());
        exit(EXIT_FAILURE);
    }
    __host__ __device__ const char* cudaGetErrorString(int errCode){
        const char* errStrings[17] = {
            "The cuFFT operation was successful (CUFFT_SUCCESS)",
            "cuFFT was passed an invalid plan handle (CUFFT_INVALID_PLAN)",
            "cuFFT failed to allocate GPU or CPU memory (CUFFT_ALLOC_FAILED)",
            "No longer used (CUFFT_INVALID_TYPE)",
            "User specified an invalid pointer or parameter (CUFFT_INVALID_VALUE)",
            "Driver or internal cuFFT library error (CUFFT_INTERNAL_ERROR)",
            "Failed to execute an FFT on the GPU (CUFFT_EXEC_FAILED)",
            "The cuFFT library failed to initialize (CUFFT_SETUP_FAILED)",
            "User specified an invalid transform size (CUFFT_INVALID_SIZE)",
            "No longer used (CUFFT_UNALIGNED_DATA)",
            "Missing parameters in call (CUFFT_INCOMPLETE_PARAMETER_LIST)",
            "Execution of a plan was on different GPU than plan creation (CUFFT_INVALID_DEVICE)",
            "Internal plan database error (CUFFT_PARSE_ERROR)",
            "No workspace has been provided prior to plan execution (CUFFT_NO_WORKSPACE)",
            "Function does not implement functionality for parameters given (CUFFT_NOT_IMPLEMENTED)",
            "Used in previous versions (CUFFT_LICENSE_ERROR)",
            "Operation is not supported for parameters given (CUFFT_NOT_SUPPORTED)"
        };
        return errStrings[errCode];
    }
    template<typename T>
    inline void checkCudaErrors(T errCode, const char *func, const char *file, int line){
        if(errCode){
            cudaError_t type_cuda;
            cufftResult type_cufft;
            string errType;
            if(typeid(errCode) == typeid(type_cuda)) errType = "General CUDA error";
            else if(typeid(errCode) == typeid(type_cufft)) errType = "CUFFT error";
            else {
                fprintf(stderr, "Unknown error type\n");
                fflush(stderr);
                exit(EXIT_FAILURE);
            }
            fprintf(stderr, "%s in file '%s' at line '%i' when calling '%s': %d (%s)\n",
                    errType.c_str(), file, line, func, errCode, cudaGetErrorString(errCode));
            fflush(stderr);
            exit(EXIT_FAILURE);
        }
    }
    #define ABORT(msg) abort( msg, __FILE__, __LINE__ )
    #define CUCHK(val) checkCudaErrors( (val), #val, __FILE__, __LINE__ )



    int main(){
        int n[3]{1024, 512, 768};
        size_t size = (size_t)n[0]*(size_t)n[1]*(size_t)n[2];
        size_t size2 = (size_t)(n[0]+2)*(size_t)n[1]*(size_t)n[2];
        float *ha = new float [size]();
    #pragma omp parallel for
        for(size_t i = 0; i < size; i++){
            int iy = i/((size_t)n[1]*(size_t)n[0]);
            int ix = i%((size_t)n[1]*(size_t)n[0]);
            int iz = ix%n[0];
            ix = ix/n[0];
            ha[i] = cos(iz) + sin(ix + iy);
        }
        
        //2. multi-GPU C2R and R2C FFT
        cufftHandle planR2CmGPU, planC2RmGPU;
        CUCHK(cufftCreate(&planR2CmGPU));
        CUCHK(cufftCreate(&planC2RmGPU));
        int nGPUs = 2;
        int whichGPUs[2] = {0, 1};
        size_t workSize2[2];
        CUCHK(cufftXtSetGPUs(planR2CmGPU, nGPUs, whichGPUs));
        CUCHK(cufftMakePlan3d(planR2CmGPU, n[2], n[1], n[0], CUFFT_R2C, workSize2));
        CUCHK(cufftXtSetGPUs(planC2RmGPU, nGPUs, whichGPUs));
        CUCHK(cufftMakePlan3d(planC2RmGPU, n[2], n[1], n[0], CUFFT_C2R, workSize2));
        cudaLibXtDesc *da2;
        CUCHK(cufftXtMalloc(planR2CmGPU, &da2, CUFFT_XT_FORMAT_INPLACE));
        //CUCHK(cufftXtMemcpy(planR2CmGPU, da2, ha, CUFFT_COPY_HOST_TO_DEVICE)); //problematic line
        
        
        //4. clean up
        CUCHK(cufftXtFree(da2));
        CUCHK(cufftDestroy(planR2CmGPU));
        CUCHK(cufftDestroy(planC2RmGPU));
        delete [] ha;
        return 0;
    }

I figured out this myself. Multi-GPU FFT supports in-place transform only, so I have to pad the host array (ha) to enable in-place transform.

float *ha = new float [size2](); //size2 rather than size

In case anyone needs a working example, I put my code here:
HPC_demos/cufft_rc_mgpu at master · llodds/HPC_demos · GitHub