I have written code for a 3D R2C transform on 2 GPUs using cufftXt, but cufftXtMemcpy is failing to copy data from host to device. Can someone tell me why?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cufft.h"
#include "cufftXt.h"
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <stdio.h>
#include <vector>
using namespace std;
using namespace std::chrono;
int main()
{
cufftHandle planr2c;
cufftResult result;
int rank_1 = 3;
int Nx = 512;
int Ny = 512;
int Nz = 512;
int n = { Nx,Ny,Nz };
int BATCH = 1;
int* inembed;
int* onembed;
int istride = 1;
int ostride = 1;
int idist = Nx * Ny * Nz;
int odist = Nx * Ny * (Nz / 2 + 1);
int ngpus = 2;
int* whichgpu;
if ((inembed = (int*)malloc(sizeof(int) * 3)) == NULL)
{
cout << "\n Inembed failed to initiate";
}
inembed[0] = Nx; inembed[1] = Ny; inembed[2] = Nz;
if ((onembed = (int *)malloc(sizeof(int) * 3)) == NULL)
{
cout << "\n failed to initiate onembed";
}
onembed[0] = Nx; onembed[1] = Ny; onembed[2] = Nz / 2 + 1;
result = cufftCreate(&planr2c); // CREATION OF EMPTY PLAN
if (result != CUFFT_SUCCESS)
{
cout << "\n Plan creation failed";
return 1;
}
//cudaGetDeviceCount(&ngpus); // GET TOTAL NUMBER OF GPUS
whichgpu = (int*)malloc(sizeof(int)*ngpus);
//cout << "\n " << ngpus;
for (int i = 0; i < ngpus; i++) // ALLOCATING WHICH GPUS ARRAY
{
whichgpu[i] = i;
cout << "\n " << whichgpu[i];
}
result = cufftXtSetGPUs(planr2c, ngpus, whichgpu); // SETTING NO OF GPUS
/*if (result != CUFFT_SUCCESS)
{
cout << "\n cufftxt set gpu failed";
return 1;
}*/
size_t *worksize;
worksize = new size_t[2];
//cufftComplex* dresult, * dresultN;
cufftReal* hdata0, * hdata1; // CUFFT DATA DECLERATION
cufftComplex* hdatac0, * hdatac1;
hdata0 = (cufftReal*)malloc(sizeof(cufftReal) * Nx * Ny * Nz); // ALLOCATION OF DATA ON CPU
hdata1 = (cufftReal*)malloc(sizeof(cufftReal) * Nx * Ny * Nz); // ALLOCATION OF DATA ON CPU
hdatac0 = (cufftComplex*)malloc(sizeof(cufftComplex) * Nx * Ny * (Nz / 2 + 1)); // ALLOCATION OF DATA ON CPU
hdatac1 = (cufftComplex*)malloc(sizeof(cufftComplex) * Nx * Ny * (Nz / 2 + 1)); // ALLOCATION OF DATA ON CPU
for (long i = 0; i < Nx*Ny*Nz ; i++)
{
hdata0[i] = rand() % 10; // INITIALIZING THE DATA ON CPU
}
result = cufftMakePlanMany(planr2c,rank_1,n,inembed,istride,idist,onembed,ostride,odist,CUFFT_R2C,BATCH,worksize); // Making the plan
if (result != CUFFT_SUCCESS)
{
cout << "\n Plan making failed";
return 1;
}
cudaLibXtDesc* ddata0, * ddata1; // cudalib descriptor On GPU
result = cufftXtMalloc(planr2c, &ddata0, CUFFT_XT_FORMAT_INPUT); // ALLOCATING MEMORY ON GPU
if (result != CUFFT_SUCCESS)
{
cout << "\n Memory allocation failed on GPU";
return 1;
}
result = cufftXtMalloc(planr2c, &ddata1, CUFFT_XT_FORMAT_INPUT); // ALLOCATING MEMORY ON GPU
if (result != CUFFT_SUCCESS)
{
cout << "\n Memory allocation failed on GPU";
return 1;
}
result = cufftXtMemcpy(planr2c, ddata0, hdata0, CUFFT_COPY_HOST_TO_DEVICE); // MEMORY COPY HOST TO DEVICE
if (result != CUFFT_SUCCESS)
{
cout << "\n Memory copy from host to device is failed";
return 1;
}
result = cufftXtExecDescriptorR2C(planr2c, ddata0, ddata0); // PLAN EXECUITION OF LIB DESCRIPTOR
if (result != CUFFT_SUCCESS)
{
cout << "\n PLAN EXECUION FAILED ";
}
result = cufftXtMemcpy(planr2c, hdatac0, ddata0, CUFFT_COPY_DEVICE_TO_HOST); // COPY OF RESULT FROM DEVICE TO HOST
if (result!=CUFFT_SUCCESS)
{
cout << "\n Memory copy failed device to host for mid results";
}
for (long i = 0; i < Nx * Ny * (Nz / 2 + 1); i++)
{
hdatac0->x /= Nx * Ny * Nz; // NORMALIZING THE RESULT
hdatac0->y /= Nx * Ny * Nz;
cout << "\n Result is data[" << i << "] real = " << hdatac0->x << " , imag = " << hdatac0->y;
}
cufftXtFree(ddata0);
//cufftXtFree(ddata1);
cufftDestroy(planr2c);
return 0;
}