cufftXtMemcpy fails to copy from host to device

I have written code for a 3D R2C transform on two GPUs using cufftXt, but cufftXtMemcpy is failing to copy the data from host to device. Can someone tell me why?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cufft.h"
#include "cufftXt.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

using namespace std;
using namespace std::chrono;

int main()
{
cufftHandle planr2c;
cufftResult result;
int rank_1 = 3;
int Nx = 512;
int Ny = 512;
int Nz = 512;
int n = { Nx,Ny,Nz };
int BATCH = 1;
int* inembed;
int* onembed;
int istride = 1;
int ostride = 1;
int idist = Nx * Ny * Nz;
int odist = Nx * Ny * (Nz / 2 + 1);
int ngpus = 2;
int* whichgpu;

if ((inembed = (int*)malloc(sizeof(int) * 3)) == NULL)
{
	cout << "\n Inembed failed to initiate";
}
inembed[0] = Nx; inembed[1] = Ny; inembed[2] = Nz;
if ((onembed = (int *)malloc(sizeof(int) * 3)) == NULL)
{
	cout << "\n failed to initiate onembed";
}
onembed[0] = Nx; onembed[1] = Ny; onembed[2] = Nz / 2 + 1;
result = cufftCreate(&planr2c);										// CREATION OF EMPTY PLAN
if (result != CUFFT_SUCCESS)
{
	cout << "\n Plan creation failed";
	return 1;
}

//cudaGetDeviceCount(&ngpus);									// GET TOTAL NUMBER OF GPUS
whichgpu = (int*)malloc(sizeof(int)*ngpus);
//cout << "\n " << ngpus;
for (int i = 0; i < ngpus; i++)								// ALLOCATING WHICH GPUS ARRAY
{
	whichgpu[i] = i;
	cout << "\n " << whichgpu[i];
}

result = cufftXtSetGPUs(planr2c, ngpus, whichgpu);            // SETTING NO OF GPUS
/*if (result != CUFFT_SUCCESS)
{
	cout << "\n cufftxt set gpu failed";
	return 1;
}*/

size_t *worksize;
worksize = new size_t[2];
//cufftComplex* dresult, * dresultN;
cufftReal* hdata0, * hdata1;								// CUFFT DATA DECLERATION
cufftComplex* hdatac0, * hdatac1;


hdata0 = (cufftReal*)malloc(sizeof(cufftReal) * Nx * Ny * Nz);			       // ALLOCATION OF DATA ON CPU
hdata1 = (cufftReal*)malloc(sizeof(cufftReal) * Nx * Ny * Nz);				// ALLOCATION OF DATA ON CPU
hdatac0 = (cufftComplex*)malloc(sizeof(cufftComplex) * Nx * Ny * (Nz / 2 + 1));	// ALLOCATION OF DATA ON CPU
hdatac1 = (cufftComplex*)malloc(sizeof(cufftComplex) * Nx * Ny * (Nz / 2 + 1));	// ALLOCATION OF DATA ON CPU

for (long i = 0; i < Nx*Ny*Nz ; i++)
{
	hdata0[i] = rand() % 10;											// INITIALIZING THE DATA ON CPU
}


result = cufftMakePlanMany(planr2c,rank_1,n,inembed,istride,idist,onembed,ostride,odist,CUFFT_R2C,BATCH,worksize);		// Making the plan
if (result != CUFFT_SUCCESS)
{
	cout << "\n Plan making failed";															
	return 1;
}

cudaLibXtDesc* ddata0, * ddata1;							// cudalib descriptor On GPU
result = cufftXtMalloc(planr2c, &ddata0, CUFFT_XT_FORMAT_INPUT);			// ALLOCATING MEMORY ON GPU
if (result != CUFFT_SUCCESS)
{
	cout << "\n Memory allocation failed on GPU";
	return 1;
}
result = cufftXtMalloc(planr2c, &ddata1, CUFFT_XT_FORMAT_INPUT);			// ALLOCATING MEMORY ON GPU
if (result != CUFFT_SUCCESS)
{
	cout << "\n Memory allocation failed on GPU";
	return 1;
}

result = cufftXtMemcpy(planr2c, ddata0, hdata0, CUFFT_COPY_HOST_TO_DEVICE);			// MEMORY COPY HOST TO DEVICE
if (result != CUFFT_SUCCESS)
{
	cout << "\n Memory copy from host to device is failed";
	return 1;
}

result = cufftXtExecDescriptorR2C(planr2c, ddata0, ddata0);					// PLAN EXECUITION OF LIB DESCRIPTOR
if (result != CUFFT_SUCCESS)
{
	cout << "\n PLAN EXECUION FAILED ";
}

result = cufftXtMemcpy(planr2c, hdatac0, ddata0, CUFFT_COPY_DEVICE_TO_HOST);		// COPY OF RESULT FROM DEVICE TO HOST
if (result!=CUFFT_SUCCESS)
{
	cout << "\n Memory copy failed device to host for mid results";
}

for (long i = 0; i < Nx * Ny * (Nz / 2 + 1); i++)
{
	hdatac0->x /= Nx * Ny * Nz;													// NORMALIZING THE RESULT 
	hdatac0->y /= Nx * Ny * Nz;
	cout << "\n Result is data[" << i << "] real  = " << hdatac0->x << "  ,  imag = " << hdatac0->y;
}

cufftXtFree(ddata0);
//cufftXtFree(ddata1);
cufftDestroy(planr2c);
return 0;

}