CuFFT cufftPlan2d

Hi,

I am getting the wrong result and memory allocation fails when I do a 2d Z2Z cuFFT on a tesla K40 card for any nx=ny > 2500 points making it a 6250000 total number of points.

is nx=ny > 2500 the maximum number of points I can use for cuFFT? Surely I should be able to reach at least a nx=ny=10000 points for a Z2Z with cuFFT

----See below----

GPU 2D cuFFT*********************

nptx=2390, npty=2390, npts=5712100, nptx * npty=5712100
nptx * npty*sizeof(cufftDoubleComplex)=91393600
CUDA error at test_code/gpu/cuda/get_fft123D_gpu.cpp:134 code=2(CUFFT_ALLOC_FAILED) “gather_fft_2D_gpu_cpp”
CUDA error at test_code/gpu/cuda/get_fft123D_gpu.cpp:137 code=1(CUFFT_INVALID_PLAN) “gather_fft_2D_gpu_cpp”
cudaDriverGetVersion returned 11
-> invalid argument
Result = FAIL
Exit at Line 272 in file test_code/gpu/cuda/get_fft123D_gpu.cpp get_error_id_fft123D_gpu

I am doing a CUFFT_FORWARD Followed by a CUFFT_INVERSE using the same routine shown below from subsequent calls. The memory fails to allocate and on the inverse the result is completely wrong for any nx=ny>2500. I do normalise the inversted transform by nx*ny, it is not a normalisation error.

The code is the following:

int gather_fft_2D_gpu_cpp(int *nx, int *ny,
double complex *in, double complex *out,
int sign) {
int rc = 0; /
the return code from the function */
int nptx = *nx;
int npty = *ny;

int npts = nptx * npty;

int direction = *sign;
/* the error handlers from the cuda library */
cudaError_t err;
cufftResult cufft_err;
/* the plan for the cuFFT */
cufftHandle plan_fwd;
cufftHandle plan_bwd;

cufftDoubleComplex *d_in;
cufftDoubleComplex *d_out;

printf("nptx=%i, npty=%i, npts=%i, nptx * npty=%i\n",nptx, npty, npts,nptx * npty);
printf("nptx * npty*sizeof(cufftDoubleComplex)=%i\n",nptx * npty*sizeof(cufftDoubleComplex));

err = cudaMalloc((void**)&d_in, nptx * npty*sizeof(cufftDoubleComplex));
err = cudaMalloc((void**)&d_out, nptx * npty*sizeof(cufftDoubleComplex));
err = cudaMemcpy(d_in, in, nptx * npty * sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);

if ( direction == CUFFT_FORWARD ) {

  cufft_err = cufftPlan2d(&plan_fwd, nptx, npty, CUFFT_Z2Z);
  cufft_err = cufftExecZ2Z(plan_fwd, d_in, d_out, direction);
  err = cudaMemcpy(out, d_out, nptx * npty*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
  cufft_err = cufftDestroy(plan_fwd);
  err = cudaFree(d_in);
  err = cudaFree(d_out);

  for (int i=0; i<nptx-(nptx-3) ; i++) {
for (int j=0; j<npty-(npty-3) ; j++) {
  printf("in[%i][%i]=(%15.8f,%15.8f), out[%i][%i]=(%15.8f,%15.8f)\n",
	 i,j,creal(in[j*nptx+i]),cimag(in[j*nptx+i]),
	 i,j,creal(out[j*nptx+i]),cimag(out[j*nptx+i]) );
}
  }

} else if ( direction == CUFFT_INVERSE) {
  
  cufft_err = cufftPlan2d(&plan_bwd, nptx, npty, CUFFT_Z2Z);
  cufft_err = cufftExecZ2Z(plan_bwd, d_in, d_out, direction);
  err = cudaMemcpy(out, d_out, nptx * npty*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
  cufft_err = cufftDestroy(plan_bwd);
  
  err = cudaFree(d_in);
  err = cudaFree(d_out);

} else { rc = get_warning_message_fft123D_gpu(); }

cuCtxSynchronize();
cudaDeviceReset();

return rc;

}

I think your problem is probably in something you haven’t shown.

The following test case based mostly on your code runs without error (no asserts are hit) for me on a K40c, RHEL 6.2, CUDA 7:

$ cat t771.cu
#include <cufft.h>
#include <stdio.h>
#include <assert.h>
#include <cuComplex.h>

#define NX 3000
#define NY 3000

int gather_fft_2D_gpu_cpp(int nx, int ny, cufftDoubleComplex *in, cufftDoubleComplex *out, int sign) {
  int rc = 0; /* the return code from the function */
  int nptx = nx;
  int npty = ny;

  int npts = nptx * npty;

  int direction = sign;
/* the error handlers from the cuda library */
  cudaError_t err;
  cufftResult cufft_err;
/* the plan for the cuFFT */
  cufftHandle plan_fwd;
  cufftHandle plan_bwd;

  cufftDoubleComplex *d_in;
  cufftDoubleComplex *d_out;

  printf("nptx=%i, npty=%i, npts=%i, nptx * npty=%i\n",nptx, npty, npts,nptx * npty);
  printf("nptx * npty*sizeof(cufftDoubleComplex)=%i\n",nptx * npty*sizeof(cufftDoubleComplex));

  err = cudaMalloc((void**)&d_in, nptx * npty*sizeof(cufftDoubleComplex));
  if (err != cudaSuccess) assert(0);
  err = cudaMalloc((void**)&d_out, nptx * npty*sizeof(cufftDoubleComplex));
  if (err != cudaSuccess) assert(0);
  err = cudaMemcpy(d_in, in, nptx * npty * sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) assert(0);

  if ( direction == CUFFT_FORWARD ) {

    cufft_err = cufftPlan2d(&plan_fwd, nptx, npty, CUFFT_Z2Z);
    if (cufft_err != CUFFT_SUCCESS) assert(0);
    cufft_err = cufftExecZ2Z(plan_fwd, d_in, d_out, direction);
    if (cufft_err != CUFFT_SUCCESS) assert(0);
    err = cudaMemcpy(out, d_out, nptx * npty*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) assert(0);
    cufft_err = cufftDestroy(plan_fwd);
    if (cufft_err != CUFFT_SUCCESS) assert(0);
    err = cudaFree(d_in);
    if (err != cudaSuccess) assert(0);
    err = cudaFree(d_out);
    if (err != cudaSuccess) assert(0);

    for (int i=0; i<nptx-(nptx-3) ; i++) {
      for (int j=0; j<npty-(npty-3) ; j++) {
        printf("in[%i][%i]=(%15.8f,%15.8f), out[%i][%i]=(%15.8f,%15.8f)\n",
          i,j,cuCreal(in[j*nptx+i]),cuCimag(in[j*nptx+i]),
          i,j,cuCreal(out[j*nptx+i]),cuCimag(out[j*nptx+i]) );
      }
    }

  } else if ( direction == CUFFT_INVERSE) {

    cufft_err = cufftPlan2d(&plan_bwd, nptx, npty, CUFFT_Z2Z);
    if (cufft_err != CUFFT_SUCCESS) assert(0);
    cufft_err = cufftExecZ2Z(plan_bwd, d_in, d_out, direction);
    if (cufft_err != CUFFT_SUCCESS) assert(0);
    err = cudaMemcpy(out, d_out, nptx * npty*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) assert(0);
    cufft_err = cufftDestroy(plan_bwd);
    if (cufft_err != CUFFT_SUCCESS) assert(0);

    err = cudaFree(d_in);
    if (err != cudaSuccess) assert(0);
    err = cudaFree(d_out);
    if (err != cudaSuccess) assert(0);

  } else { assert(0); }

//  cuCtxSynchronize();
  cudaDeviceReset();

  return rc;
}

int main(){
  int size = NX*NY*sizeof(cufftDoubleComplex);
  cufftDoubleComplex *hin, *hout;
  hin  = (cufftDoubleComplex *)malloc(size);
  if (hin  == NULL) assert(0);
  hout = (cufftDoubleComplex *)malloc(size);
  if (hout == NULL) assert(0);
  memset(hin, 0, size);
  gather_fft_2D_gpu_cpp(NX, NY, hin, hout, CUFFT_FORWARD);
  gather_fft_2D_gpu_cpp(NX, NY, hin, hout, CUFFT_INVERSE);

  return 0;
}
$ nvcc -o t771 t771.cu -lcufft
$ ./t771
nptx=3000, npty=3000, npts=9000000, nptx * npty=9000000
nptx * npty*sizeof(cufftDoubleComplex)=144000000
in[0][0]=(     0.00000000,     0.00000000), out[0][0]=(     0.00000000,     0.00000000)
in[0][1]=(     0.00000000,     0.00000000), out[0][1]=(     0.00000000,     0.00000000)
in[0][2]=(     0.00000000,     0.00000000), out[0][2]=(     0.00000000,     0.00000000)
in[1][0]=(     0.00000000,     0.00000000), out[1][0]=(     0.00000000,     0.00000000)
in[1][1]=(     0.00000000,     0.00000000), out[1][1]=(     0.00000000,     0.00000000)
in[1][2]=(     0.00000000,     0.00000000), out[1][2]=(     0.00000000,     0.00000000)
in[2][0]=(     0.00000000,     0.00000000), out[2][0]=(     0.00000000,     0.00000000)
in[2][1]=(     0.00000000,     0.00000000), out[2][1]=(     0.00000000,     0.00000000)
in[2][2]=(     0.00000000,     0.00000000), out[2][2]=(     0.00000000,     0.00000000)
nptx=3000, npty=3000, npts=9000000, nptx * npty=9000000
nptx * npty*sizeof(cufftDoubleComplex)=144000000
$

If you want help, I suggest you provide a complete test case, just as I have done. It should be something that I can copy, paste, compile and run, and see the issue, without having to add anything or change anything.

ok,

I am seeing the same problem wiht the 3d code which is essentially the same as the 2d case ----See below 1---- but this time it is for the case nx=ny=nz > 171 which workouts to nx=ny=(171^(3/2)=2236.11 which is consistent with the previous case at nx=ny>~2300.

At nx=ny=nz=171 the result is correct with the CUFFT_FORWARD followed by CUFFT_INVERSE —See below 2---- (first output from the cpp environment in code below the other is in fortran code that gets passed in/out from fortran-> c++ -> fortran.

At 172 the result is wrong see output ----See below 3---- it is not a problem of passing between fortran -> c++ -> fortran because the numbers are the same as we can from both ----See belows 2 and 3 ----

At 181 it looks like the calculation is completely misalligned or something ----See below 4---- and at 184 out of memory ----See below 5----. nx=ny=nz=184 is equivalent to nx=ny=~2495.

I try to have a stand alone test case in .cu alone to see if I see the same problem.


----See below 1----

int gather_fft_3D_gpu_cpp(int *nx, int *ny, int *nz,
			    double complex *in, double complex *out,
			    int *sign) {
    int rc = 0;
    int nptx = *nx;
    int npty = *ny;
    int nptz = *nz;

    int npts = nptx * npty * nptz;

    int direction = *sign;
    /* the error handlers from the cuda library */
    cudaError_t err;
    cufftResult cufft_err;
    /* the plan for the cuFFT */
    cufftHandle plan_fwd;
    cufftHandle plan_bwd;

    cufftDoubleComplex *d_in;
    cufftDoubleComplex *d_out;
    
    printf("nptx=%i, npty=%i, nptz=%i, npts=%i, nptx * npty=%i\n",nptx,npty,nptz, npts,nptx * npty *nptz);
    printf("nptx*npty*nptz*sizeof(cufftDoubleComplex)=%i\n",nptx * npty*nptzsizeof(cufftDoubleComplex));

    err = cudaMalloc((void**)&d_in, nptx * npty * nptz * sizeof(cufftDoubleComplex));
    if ( (int)err != CUDA_SUCCESS ) {rc = get_error_id_fft123D_gpu(err); }
    err = cudaMalloc((void**)&d_out, nptx * npty * nptz * sizeof(cufftDoubleComplex));
    if ( (int)err != CUDA_SUCCESS ) {rc = get_error_id_fft123D_gpu(err); }
    err = cudaMemcpy(d_in, in, nptx * npty * nptz * sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);
    if ( (int)err != CUDA_SUCCESS ) {rc = get_error_id_fft123D_gpu(err); }

    if ( direction == CUFFT_FORWARD ) {

      cufft_err = cufftPlan3d(&plan_fwd, nptx, npty, nptz, CUFFT_Z2Z);
      cufft_err = cufftExecZ2Z(plan_fwd, d_in, d_out, direction);
      err = cudaMemcpy(out, d_out, nptx * npty * nptz * sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);

      cufft_err = cufftDestroy(plan_fwd);
      err = cudaFree(d_in);
      err = cudaFree(d_out);

      for (int i=0; i<nptx-(nptx-2) ; i++) {
	for (int j=0; j<npty-(npty-2) ; j++) {
	  for (int k=0; k<npty-(nptz-2) ; k++) {
	    printf("in[%i][%i][%i]=(%15.8f,%15.8f), out[%i][%i][%i]=(%15.8f,%15.8f)\n",
		   i,j,k,creal(in[(j+npty*k)*nptx+i]),cimag(in[(j+npty*k)*nptx+i]),
		   i,j,k,creal(out[(j+npty*k)*nptx+i]),cimag(out[(j+npty*k)*nptx+i]) );
	  }
	}
      }

    } else if ( direction == CUFFT_INVERSE) {
      
      cufft_err = cufftPlan3d(&plan_bwd, nptx, npty, nptz, CUFFT_Z2Z);
      cufft_err = cufftExecZ2Z(plan_bwd, d_in, d_out, direction);
      err = cudaMemcpy(out, d_out, nptx * npty * nptz * sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);

      cufft_err = cufftDestroy(plan_bwd);

      err = cudaFree(d_in);
      err = cudaFree(d_out);

    } else { rc = get_warning_message_fft123D_gpu(); }

    cudaDeviceReset();

    return rc;
  }

----See below 2----

Hello deviceQuery GPU world

 current date and time: June 25 2015  14:35:20.780
 Hello timers on CPU
devD->ndev = 2
Device 0: <      Tesla K40c >, Compute SM 3.5 detected, **suitable: yes**
Device 1: <         NVS 315 >, Compute SM 2.1 detected, **suitable: no**
***************************WARNING*****************************
Device: NVS 315 does not have error correcting code 
memory (ECC) enabled. This is probably because the device does 
not have CUDA computing capabilities. Check that the correct   
device has been selected.                                      
***************************************************************
> Peer access from Tesla K40c (GPU0) -> NVS 315 (GPU1) : No
> Peer access from NVS 315 (GPU1) -> Tesla K40c (GPU0) : No
 cudaGetDeviceCount returned:            2
 Hello cuFFT GPU world

                                                                
 ***************GPU 3D cuFFT************************************
                                                                
nptx=171, npty=171, nptz=171, npts=5000211, nptx * npty=5000211
nptx*npty*nptz*sizeof(cufftDoubleComplex)=80003376
in[0][0][0]=(    -0.02364964,     0.06578358), out[0][0][0]=(   502.36593500,  -305.22967932)
in[0][0][1]=(    -0.28146625,    -1.12181712), out[0][0][1]=(   151.03200912,  1643.55830625)
in[0][1][0]=(     0.45331948,     0.05983607), out[0][1][0]=(  3219.56883867, -1834.77686455)
in[0][1][1]=(     0.65900391,    -0.06613176), out[0][1][1]=(  4047.08587404, -2590.87301557)
in[1][0][0]=(    -0.99849318,    -0.37205070), out[1][0][0]=(   893.58672828,  2124.69012298)
in[1][0][1]=(    -1.41975564,    -2.18846831), out[1][0][1]=(   130.10871447,  1055.50221326)
in[1][1][0]=(    -0.29637343,     0.18286050), out[1][1][0]=(  -799.88109439,  -852.87117443)
in[1][1][1]=(     0.35349914,     0.12507332), out[1][1][1]=(   303.65752782, -2787.44991559)
nptx=171, npty=171, nptz=171, npts=5000211, nptx * npty=5000211
nptx*npty*nptz*sizeof(cufftDoubleComplex)=80003376
 the first 3x(i,j,k) = 27 entries of:
  i          j          k
          data:f(u,v,w)          Fourier: f(h,p,q)
          Inverse Fourier
           1           1           1 ( -2.3649641772680158E-002,  6.5783581283472226E-002) (  502.36593499822408     , -305.22967932327492     ) ( -2.3649641772679617E-002,  6.5783581283471462E-002)
           1           1           2 (-0.28146625122109226     , -1.1218171170729125     ) (  151.03200912151624     ,  1643.5583062488381     ) (-0.28146625122109187     , -1.1218171170729130     )
           1           1           3 ( 0.81176153412805252     , 0.55557978425490795     ) ( -4130.2335237579282     , -433.55278576649710     ) ( 0.81176153412805263     , 0.55557978425490706     )
           1           2           1 ( 0.45331948456310478     ,  5.9836072018130862E-002) (  3219.5688386655052     , -1834.7768645466608     ) ( 0.45331948456310478     ,  5.9836072018130654E-002)
           1           2           2 ( 0.65900390724612101     , -6.6131764684458610E-002) (  4047.0858740416434     , -2590.8730155656172     ) ( 0.65900390724612112     , -6.6131764684458291E-002)
           1           2           3 ( -1.7657968686440841E-002, -1.8735998303942427     ) ( -4005.0855962493301     ,  1589.1529525829878     ) ( -1.7657968686440696E-002, -1.8735998303942436     )
           1           3           1 (-0.22685905696355141     ,  1.1834474773279196     ) (  1220.9486442413322     ,  2259.8071500418951     ) (-0.22685905696355110     ,  1.1834474773279191     )
           1           3           2 (  7.0065624322714404E-002, 0.33495531020454938     ) (  278.33766552911129     , -1299.8748837772357     ) (  7.0065624322714362E-002, 0.33495531020454933     )
           1           3           3 ( 0.37453837823930314     , 0.64747579030410884     ) (  1173.1301914745350     ,  800.43152623770629     ) ( 0.37453837823930225     , 0.64747579030410907     )
           2           1           1 (-0.99849318210088189     ,-0.37205070448681204     ) (  893.58672827704822     ,  2124.6901229841324     ) (-0.99849318210088223     ,-0.37205070448681099     )
           2           1           2 ( -1.4197556360681238     , -2.1884683115799999     ) (  130.10871446755914     ,  1055.5022132591280     ) ( -1.4197556360681236     , -2.1884683115800003     )
           2           1           3 ( 0.59407571456269181     , -1.5657549322188500     ) ( -3905.9271894948106     , -110.30078724331725     ) ( 0.59407571456269126     , -1.5657549322188495     )
           2           2           1 (-0.29637343422183532     , 0.18286050148784075     ) ( -799.88109439375523     , -852.87117442550164     ) (-0.29637343422183543     , 0.18286050148784100     )
           2           2           2 ( 0.35349913996678872     , 0.12507332017999054     ) (  303.65752781575236     , -2787.4499155888443     ) ( 0.35349913996678883     , 0.12507332017999073     )
           2           2           3 (  1.2933592049695168     ,  2.0920821853421927     ) (  2512.2747315482011     ,  570.84666309890190     ) (  1.2933592049695166     ,  2.0920821853421918     )
           2           3           1 (-0.56795079559925676     ,-0.80599730654929869     ) ( -2950.7537371092121     ,  566.86838932776777     ) (-0.56795079559925632     ,-0.80599730654929924     )
           2           3           2 ( 0.15175246532609291     ,-0.25351100444372004     ) ( -2349.2979810995562     , -1528.4760080004821     ) ( 0.15175246532609310     ,-0.25351100444371949     )
           2           3           3 (-0.96564148187020660     ,  1.0673368860918750     ) ( -1917.2086104650957     ,  2010.4382316521101     ) (-0.96564148187020771     ,  1.0673368860918748     )
           3           1           1 ( -8.9249439473280651E-002, 0.24777538372453770     ) ( -1034.5449287084114     , -2082.6784998702115     ) ( -8.9249439473281386E-002, 0.24777538372453742     )
           3           1           2 ( -2.6822696433693918E-002, 0.50598904795847266     ) (  3870.0052213161362     , -1009.8888582307344     ) ( -2.6822696433693293E-002, 0.50598904795847144     )
           3           1           3 ( -3.4085204354544043     ,-0.64550866761693393     ) ( -716.18055360259973     ,  2265.5351650859693     ) ( -3.4085204354544048     ,-0.64550866761693437     )
           3           2           1 ( -1.3098630855385134     ,  1.6186453836487893     ) ( -1401.9800908309926     ,  3487.8480152435291     ) ( -1.3098630855385138     ,  1.6186453836487895     )
           3           2           2 ( 0.22232743355872595     , -1.3714280831601762     ) ( -1844.3147310683889     , -4450.3121279103552     ) ( 0.22232743355872578     , -1.3714280831601764     )
           3           2           3 ( 0.90522579250505031     , 0.82659925190101180     ) ( -5628.6919946374737     ,  1267.5608835824037     ) ( 0.90522579250504998     , 0.82659925190101147     )
           3           3           1 ( -7.8620411341883717E-002, 0.13658278304976232     ) (  2271.6843238882288     , -3509.0533780245146     ) ( -7.8620411341884078E-002, 0.13658278304976140     )
           3           3           2 (-0.95921947571418564     ,-0.60061788781806680     ) (  4067.9592179624660     , -118.41164156341455     ) (-0.95921947571418598     ,-0.60061788781806635     )
           3           3           3 (-0.47617309112812745     ,-0.38621844955668388     ) (  3680.4722007377154     ,  2766.2232583041323     ) (-0.47617309112812672     ,-0.38621844955668305     )
 Bye cuFFT GPU world


                                    cpu (s)   %cpu    wall (s)  %wall
Other timers                          5.81   100.00      3.42  100.00
--------------------------------------------------------------------------
                           Total      5.81               3.42
 Good bye timers on CPU
 Bye deviceQuery GPU world

----See below 3----

Hello deviceQuery GPU world

 current date and time: June 25 2015  14:39:08.632
 Hello timers on CPU
devD->ndev = 2
Device 0: <      Tesla K40c >, Compute SM 3.5 detected, **suitable: yes**
Device 1: <         NVS 315 >, Compute SM 2.1 detected, **suitable: no**
***************************WARNING*****************************
Device: NVS 315 does not have error correcting code 
memory (ECC) enabled. This is probably because the device does 
not have CUDA computing capabilities. Check that the correct   
device has been selected.                                      
***************************************************************
> Peer access from Tesla K40c (GPU0) -> NVS 315 (GPU1) : No
> Peer access from NVS 315 (GPU1) -> Tesla K40c (GPU0) : No
 cudaGetDeviceCount returned:            2
 Hello cuFFT GPU world

                                                                
 ***************GPU 3D cuFFT************************************
                                                                
nptx=172, npty=172, nptz=172, npts=5088448, nptx * npty=5088448
nptx*npty*nptz*sizeof(cufftDoubleComplex)=81415168
in[0][0][0]=(     0.04609078,    -0.05255878), out[0][0][0]=(11506608.19883323,553887.94082611)
in[0][0][1]=(    -0.29072698,     0.05267318), out[0][0][1]=(6558975.17494140,3093960.67375839)
in[0][1][0]=(    -0.13396534,     0.32144745), out[0][1][0]=(7023355.10589913,-3591180.16017250)
in[0][1][1]=(     0.12088265,    -0.15316070), out[0][1][1]=(-7241617.99899629,-3794752.46286161)
in[1][0][0]=(    -0.39532753,    -0.98950821), out[1][0][0]=(2669402.02448849,-1752855.51352782)
in[1][0][1]=(    -1.02601014,    -0.47764740), out[1][0][1]=(6606379.77725489,-540158.17284203)
in[1][1][0]=(    -0.88576649,    -1.88445534), out[1][1][0]=(3249992.29826993,1104609.51601791)
in[1][1][1]=(    -0.34516510,    -0.60250803), out[1][1][1]=(-5708991.98479776,255370.19876189)
nptx=172, npty=172, nptz=172, npts=5088448, nptx * npty=5088448
nptx*npty*nptz*sizeof(cufftDoubleComplex)=81415168
 the first 3x(i,j,k) = 27 entries of:
  i          j          k
          data:f(u,v,w)          Fourier: f(h,p,q)
          Inverse Fourier
           1           1           1 (  4.6090775107592838E-002, -5.2558782067667772E-002) (  11506608.198833227     ,  553887.94082611334     ) (  2.2613197970841457     , 0.10885203913376207     )
           1           1           2 (-0.29072697924230784     ,  5.2673177683272412E-002) (  6558975.1749413982     ,  3093960.6737583871     ) (  1.2889932598193787     , 0.60803621728243795     )
           1           1           3 (  1.7991622424521794     , -1.3569051439475852     ) ( -972818.08753394219     , -1353205.8244603300     ) (-0.19118168988539180     ,-0.26593684841828585     )
           1           2           1 (-0.13396534307466240     , 0.32144744884929033     ) (  7023355.1058991319     , -3591180.1601725025     ) (  1.3802548647247908     ,-0.70575156907813585     )
           1           2           2 ( 0.12088265345370300     ,-0.15316069536674745     ) ( -7241617.9989962857     , -3794752.4628616115     ) ( -1.4231486691023050     ,-0.74575832608717074     )
           1           2           3 (  1.0893933904696862     , 0.33618686382684609     ) (  3058282.2818390438     , -3396309.1962401932     ) ( 0.60102457209723747     ,-0.66745483028227726     )
           1           3           1 ( 0.11474202123600487     ,-0.10802913619881280     ) ( -7110378.5763337221     ,  10544303.048410652     ) ( -1.3973570283775567     ,  2.0722041471998245     )
           1           3           2 (  1.0636177075394502     , 0.31481852917098818     ) ( -5106317.1522670593     , -2064974.4759083944     ) ( -1.0035117097132680     ,-0.40581616947021854     )
           1           3           3 (  8.0480229992556607E-002, 0.21713784443463197     ) ( -862828.70271390444     ,  994862.06703067396     ) (-0.16956618259907627     , 0.19551385157727347     )
           2           1           1 (-0.39532753271696275     ,-0.98950821284464185     ) (  2669402.0244884882     , -1752855.5135278206     ) ( 0.52460043307674331     ,-0.34447743467710007     )
           2           1           2 ( -1.0260101407006275     ,-0.47764740231253394     ) (  6606379.7772548907     , -540158.17284203449     ) (  1.2983093818105031     ,-0.10615381602446060     )
           2           1           3 (-0.34475417974658845     ,  1.6718620548062160     ) (  3825227.1545762941     , -1278905.0139798233     ) ( 0.75174732149690715     ,-0.25133498740280402     )
           2           2           1 (-0.88576648760296373     , -1.8844553351840205     ) (  3249992.2982699317     ,  1104609.5160179052     ) ( 0.63870011018486028     , 0.21708181276843258     )
           2           2           2 (-0.34516509898308051     ,-0.60250803264991382     ) ( -5708991.9847977562     ,  255370.19876189373     ) ( -1.1219515232931054     ,  5.0186264802527948E-002)
           2           2           3 (-0.74988835976807078     , -2.8006795510253539     ) (  8864373.7837103587     ,  7223190.6178560108     ) (  1.7420584397659873     ,  1.4195272542543445     )
           2           3           1 ( -1.0708580884638919     , 0.12571164851531169     ) (  17186233.127108891     ,  2185764.7245783224     ) (  3.3775000013970646     , 0.42955430115004073     )
           2           3           2 ( -1.4162379620172185     ,  1.1526435346607165     ) (  120009.27591988980     , -1305797.5972157882     ) (  2.3584652121804096E-002,-0.25662001404274704     )
           2           3           3 ( 0.80172341828036053     , 0.90376924590265462     ) ( -1472280.6022917116     ,  11786348.982326562     ) (-0.28933784963346615     ,  2.3162954563604781     )
           3           1           1 ( -1.9974604365407492E-002, 0.26260068241834811     ) ( -1714309.6359450440     , -9552790.7854222693     ) (-0.33690226095364323     , -1.8773486110936517     )
           3           1           2 ( 0.37826624684423105     ,  9.1611242336722812E-002) ( -652656.20384531945     , -2076704.5573052918     ) (-0.12826233143098240     ,-0.40812140701944716     )
           3           1           3 (-0.11458373896191461     , 0.83221918125210115     ) ( -5159871.0326112369     ,  4414133.7546959715     ) ( -1.0140363098161240     , 0.86748135280069116     )
           3           2           1 (-0.15698144150044571     , 0.51142583386463181     ) (  9329377.1214021593     ,  12097415.261931157     ) (  1.8334425587924175     ,  2.3774273141694984     )
           3           2           2 ( 0.25811557668875768     ,-0.49031142724788163     ) ( -7051385.8142530089     ,  3848107.8402247620     ) ( -1.3857635597834563     , 0.75624391567424132     )
           3           2           3 ( 0.24469887443682720     , -1.4977936735991029     ) (  1552446.3068879563     , -6673903.9252380580     ) ( 0.30509230061660381     , -1.3115794688750004     )
           3           3           1 ( 0.59855297102373783     , -1.2749376003428701     ) (  6150021.1995272702     ,  4935999.4024269311     ) (  1.2086241619305671     , 0.97004025636636770     )
           3           3           2 (  1.7895863868821009     , 0.36001699562992651     ) ( -17576280.463746067     ,  6121440.5504616164     ) ( -3.4541534990130720     ,  1.2030073905563379     )
           3           3           3 (-0.38741108935744578     , 0.13865740769557838     ) (  8400142.6667712107     ,  2897201.9810090270     ) (  1.6508260803237471     , 0.56936849526791411     )
 Bye cuFFT GPU world


                                    cpu (s)   %cpu    wall (s)  %wall
Other timers                          5.01   100.00      2.41  100.00
--------------------------------------------------------------------------
                           Total      5.01               2.41
 Good bye timers on CPU
 Bye deviceQuery GPU world

----See below 4----

Hello deviceQuery GPU world

 current date and time: June 25 2015  14:44:35.866
 Hello timers on CPU
devD->ndev = 2
Device 0: <      Tesla K40c >, Compute SM 3.5 detected, **suitable: yes**
Device 1: <         NVS 315 >, Compute SM 2.1 detected, **suitable: no**
***************************WARNING*****************************
Device: NVS 315 does not have error correcting code 
memory (ECC) enabled. This is probably because the device does 
not have CUDA computing capabilities. Check that the correct   
device has been selected.                                      
***************************************************************
> Peer access from Tesla K40c (GPU0) -> NVS 315 (GPU1) : No
> Peer access from NVS 315 (GPU1) -> Tesla K40c (GPU0) : No
 cudaGetDeviceCount returned:            2
 Hello cuFFT GPU world

                                                                
 ***************GPU 3D cuFFT************************************
                                                                
nptx=181, npty=181, nptz=181, npts=5929741, nptx * npty=5929741
nptx*npty*nptz*sizeof(cufftDoubleComplex)=94875856
in[0][0][0]=(     0.06523652,     0.02511936), out[0][0][0]=(     0.00000000,     0.00000000)
in[0][0][1]=(    -0.37028220,    -0.42995076), out[0][0][1]=(-131072.12556002,    -0.00000000)
in[0][1][0]=(     0.19062431,     0.62126873), out[0][1][0]=(     0.00000000,     0.00000000)
in[0][1][1]=(    -0.65317818,     0.68922230), out[0][1][1]=(78804025349210589100907610131925953627762295956350846246963584709110795059914506620915245486380813299367580750839808.00000000,     0.00000000)
in[1][0][0]=(     0.20851924,     1.04495459), out[1][0][0]=(     0.00000000,    -0.00000000)
in[1][0][1]=(     0.06817967,     0.83141097), out[1][0][1]=(     0.00000000,231584248532963057495423368742144504841811670470210387061032929361957832622080.00000000)
in[1][1][0]=(    -0.69844663,    -0.78956012), out[1][1][0]=(     0.00000000,     0.00000000)
in[1][1][1]=(    -0.56343890,    -0.62775565), out[1][1][1]=(78804016119885471029861088471031318320251685323373759210154307648586106762013487881341317211617403265083052344213504.00000000,     0.00000000)
nptx=181, npty=181, nptz=181, npts=5929741, nptx * npty=5929741
nptx*npty*nptz*sizeof(cufftDoubleComplex)=94875856
 the first 3x(i,j,k) = 27 entries of:
  i          j          k
          data:f(u,v,w)          Fourier: f(h,p,q)
          Inverse Fourier
           1           1           1 (  6.5236515294111327E-002,  2.5119358963486191E-002) (  3.1504051394286209E-151,  5.9666738300203663E-154) (  5.3128882685240739E-158,  1.0062284052575596E-160)
           1           1           2 (-0.37028220323605543     ,-0.42995075799286403     ) ( -131072.12556001931     , -2.0522684054607039E-289) ( -2.2104190648464968E-002, -3.4609747802824845E-296)
           1           1           3 ( 0.50401625773632874     , 0.11644562243671801     ) (  5.1551943572406964E-116,  5.1552106945531776E-116) (  8.6937934679452218E-123,  8.6938210194225649E-123)
           1           2           1 ( 0.19062431393348733     , 0.62126872573834935     ) (  2.9833369163553781E-154,  1.5960851551332064E-152) (  5.0311420285563538E-161,  2.6916608248711141E-159)
           1           2           2 (-0.65317817982903636     , 0.68922229793155321     ) (  7.8804025349210589E+115,  1.4916682174571163E-154) (  1.3289623501129406E+109,  2.5155706083235613E-161)
           1           2           3 ( 0.49550017291181381     ,-0.80612077498102030     ) (  2.3158421407308306E+077,  2.3158421408236371E+077) (  3.9054692957598493E+070,  3.9054692959163594E+070)
           1           3           1 (-0.30832043612476728     , -1.2824614815375626     ) (  8.0035456868688740E+115,  2.6707071722755998E-184) (  1.3497293873153775E+109,  4.5039187584678655E-191)
           1           3           2 (-0.77578856467998125     , 0.77340200968859563     ) (  7.8804030469983788E+115,  1.2954255429580386E-076) (  1.3289624364703921E+109,  2.1846241563637243E-083)
           1           3           3 (-0.90119746133188205     ,-0.88723243239646343     ) (  5.1552112996456643E-116,  1.0565891882419390E+270) (  8.6938220398591855E-123,  1.7818471131233877E+263)
           2           1           1 ( 0.20851924072321851     ,  1.0449545864099983     ) (  5.5212554768680036E-188, -2.0522684092838329E-289) (  9.3111241736662762E-195, -3.4609747867298636E-296)
           2           1           2 (  6.8179669991490055E-002, 0.83141096638758039     ) (  4.9448828700770737E-152,  2.3158424853296306E+077) (  8.3391211691658599E-159,  3.9054698768961924E+070)
           2           1           3 ( 0.16042098057698870     , -2.2444347515558061     ) (  1.0565891882419390E+270,  8.5067151833681516E+038) (  1.7818471131233877E+263,  1.4345846105872334E+032)
           2           2           1 (-0.69844663283802078     ,-0.78956011683808947     ) (  4.7733390905836779E-153,  1.6315145416772828E-154) (  8.0498272868640937E-160,  2.7514094488735388E-161)
           2           2           2 (-0.56343889852393281     ,-0.62775565311367032     ) (  7.8804016119885471E+115,  3.8862760123074502E-077) (  1.3289621944682824E+109,  6.5538714293043327E-084)
           2           2           3 (  7.6269480273828932E-002,  2.5451044209574607E-002) (  2.3158423472187471E+077,  2.3158423473031286E+077) (  3.9054696439840242E+070,  3.9054696441263262E+070)
           2           3           1 ( 0.50723991849360339     , 0.69495726612659536     ) (  6.8056530496907328E+038,  1.3611497508061384E-153) (  1.1477150603526753E+032,  2.2954624001387893E-160)
           2           3           2 (  1.3642136448176638     , -1.2843219247097404     ) (  3.2385635191721060E-077,  4.7498933448692448E-077) (  5.4615598205252237E-084,  8.0102880460870799E-084)
           2           3           3 (-0.32438840310577727     , 0.27016154572319778     ) (  8.5061286160824140E+038,  5.1552076690907441E-116) (  1.4344856910415503E+032,  8.6938159172394618E-123)
           3           1           1 (  7.4068220543997629E-003, 0.26325508962447647     ) (  5.2674563534540201E-154,  5.5471411961487197E-154) (  8.8831137033708898E-161,  9.3547782207498102E-161)
           3           1           2 (  1.5343454732365334     , -1.2596352948486036     ) (  2.3158425926048117E+077, -2.0522684044901097E-289) (  3.9054700578065916E+070, -3.4609747786456607E-296)
           3           1           3 ( -1.1054445347169153     ,-0.36010229154268159     ) (  5.0758839776293491E-116,  5.0758840532843759E-116) (  8.5600433098669051E-123,  8.5600434374526240E-123)
           3           2           1 ( 0.12972504774154603     , 0.60403754630599682     ) (  2.3158421312695749E+077,  2.3158421504279870E+077) (  3.9054692798042524E+070,  3.9054693121132729E+070)
           3           2           2 (-0.22805036180118179     , -1.0412335911819437     ) (  3.4544675330641920E-077, -2.3770896689843168E-212) (  5.8256634363359077E-084, -4.0087580030634001E-219)
           3           2           3 (-0.20457664852042823     ,-0.58244713857342045     ) (  2.3158422086951712E+077,  8.0035428882558566E+115) (  3.9054694103758854E+070,  1.3497289153532771E+109)
           3           3           1 ( 0.72712655129476056     ,  1.8427135789560480     ) (  2.4165054804898363E-152,  1.4916687183071088E-153) (  4.0752293911147825E-159,  2.5155714529641495E-160)
           3           3           2 (-0.33801916327664500     ,  1.2928534999059449     ) (  9.3633573139570788E-097,  5.1817017541374993E-077) (  1.5790499642323464E-103,  8.7384959210486587E-084)
           3           3           3 ( 0.47603331367273527     , 0.33396295938675841     ) (  1.1467199407692130E-153,  2.8072598253138934E+154) (  1.9338449027861640E-160,  4.7342031048470638E+147)
 Bye cuFFT GPU world


                                    cpu (s)   %cpu    wall (s)  %wall
Other timers                          4.90   100.00      2.52  100.00
--------------------------------------------------------------------------
                           Total      4.90               2.52
 Good bye timers on CPU
 Bye deviceQuery GPU world

----See below 5----

Hello deviceQuery GPU world

 current date and time: June 25 2015  15:05:57.677
 Hello timers on CPU
devD->ndev = 2
Device 0: <      Tesla K40c >, Compute SM 3.5 detected, **suitable: yes**
Device 1: <         NVS 315 >, Compute SM 2.1 detected, **suitable: no**
***************************WARNING*****************************
Device: NVS 315 does not have error correcting code 
memory (ECC) enabled. This is probably because the device does 
not have CUDA computing capabilities. Check that the correct   
device has been selected.                                      
***************************************************************
> Peer access from Tesla K40c (GPU0) -> NVS 315 (GPU1) : No
> Peer access from NVS 315 (GPU1) -> Tesla K40c (GPU0) : No
 cudaGetDeviceCount returned:            2
 Hello cuFFT GPU world

                                                                
 ***************GPU 3D cuFFT************************************
                                                                
nptx=184, npty=184, nptz=184, npts=6229504, nptx * npty=6229504
nptx*npty*nptz*sizeof(cufftDoubleComplex)=99672064
cudaDriverGetVersion returned 2
-> out of memory
Result = FAIL
Exit at Line 343 in file test_code/gpu/cuda/get_fft123D_gpu.cpp get_error_id_fft123D_gpu

Ok with the stand alone .cu file ----See below 1---- it seems to work comfortably to nx=ny=12000 with output in ----See below 2----. So I am still confused why it is not working in the other case!

Thanks txbob for your comment.

----See below 1----

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
//include the cufft library
#include <cufft.h>

#define ANSI_COLOR_RED     "\x1b[31m"
#define ANSI_COLOR_GREEN   "\x1b[32m"
#define ANSI_COLOR_YELLOW  "\x1b[33m"
#define ANSI_COLOR_BLUE    "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN    "\x1b[36m"
#define ANSI_COLOR_RESET   "\x1b[0m"

//declarations
void runtest(int argc, char**argv);

int main(int argc, char **argv) {
  runtest(argc,argv);
}
void runtest(int argc, char**argv) {

  int nx = 12000;
  int ny = 12000;

  int npts = nx * ny;

  cufftHandle plan_fwd;
  cufftHandle plan_bwd;

  printf("nx=%i, ny=%i, npts=%i, nx * ny=%i\n",nx, ny, npts, nx * ny);
  printf("nx*ny*sizeof(cufftDoubleComplex)=%lu\n",nx * ny*sizeof(cufftDoubleComplex));

  cufftDoubleComplex *h_in;
  cufftDoubleComplex *h_out = (cufftDoubleComplex*)malloc(sizeof(cufftDoubleComplex) * npts);
  cufftDoubleComplex *h_in_rev = (cufftDoubleComplex*)malloc(sizeof(cufftDoubleComplex) * npts);
  
  h_in = (cufftDoubleComplex*)malloc(sizeof(cufftDoubleComplex)* npts);
  for (unsigned int i=0 ; i < nx ; i++) {
    for (unsigned int j=0 ; j < ny ; j++) {
      h_in[i+nx*j].x = rand() / (float)RAND_MAX;
      h_in[i+nx*j].y = sin(i*4.0*atan(1.0)*2.0/npts);
    }
  }

  printf("pi: %f\n",4.0*atan(1.0));
  
  cufftDoubleComplex *d_in;
  cufftDoubleComplex *d_in_rev;
  cufftDoubleComplex *d_out;

  cudaMalloc((void**)&d_in, npts*sizeof(cufftDoubleComplex));
  cudaMalloc((void**)&d_out, npts*sizeof(cufftDoubleComplex));
  cudaMemcpy(d_in, h_in, npts * sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice);

  //tranform data
  cufftPlan2d(&plan_fwd, nx, ny, CUFFT_Z2Z);
  cufftExecZ2Z(plan_fwd, (cufftDoubleComplex *)d_in, (cufftDoubleComplex *)d_out, CUFFT_FORWARD);
  //copy trans into h_out from device 
  cudaMemcpy(h_out, d_out, npts*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);
  //transform back
  cudaMalloc((void**)&d_in_rev, npts*sizeof(cufftDoubleComplex));
  cufftPlan2d(&plan_bwd, nx, ny, CUFFT_Z2Z);
  cufftExecZ2Z(plan_fwd, (cufftDoubleComplex *)d_out, (cufftDoubleComplex *)d_in_rev, CUFFT_INVERSE);
  cudaMemcpy(h_in_rev, d_in_rev, npts*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost);

  // check result
  for (unsigned int i = 0; i < nx-(nx-3); ++i) {
    for (unsigned int j = 0; j < ny-(ny-3); ++j)
      {
	h_out[i+nx*j].x = h_out[i+nx*j].x / (float)npts;
	h_out[i+nx*j].y /= (float)npts;

	h_in_rev[i+nx*j].x = h_in_rev[i+nx*j].x / (float)npts;
	h_in_rev[i+nx*j].y /= (float)npts;

	printf( ANSI_COLOR_GREEN "data: %15.8f %15.8f"
		ANSI_COLOR_BLUE" Fourier %15.8f %15.8f"
		ANSI_COLOR_RED" Inverse %15.8f %15.8f\n",
		h_in[i+nx*j].x, h_in[i+nx*j].y, 
		h_out[i+nx*j].x, h_out[i+nx*j].y, 
		h_in_rev[i+nx*j].x, h_in_rev[i+nx*j].y);
	//printf("1 Error %g %g \n", fabs(h_signal[i].x - h_reversed_signal[i].x), fabs(h_signal[i].y - h_reversed_signal[i].y));
      }
  }

  cudaDeviceReset();
}
nx=12000, ny=12000, npts=144000000, nx * ny=144000000
nx*ny*sizeof(cufftDoubleComplex)=2304000000
pi: 3.141593
data:      0.84018773      0.00000000 Fourier      0.50000547      0.00026178 Inverse      0.84018773      0.00000000
data:      0.39438292      0.00000000 Fourier      0.00001718      0.00000371 Inverse      0.39438292     -0.00000000
data:      0.78309923      0.00000000 Fourier      0.00001905     -0.00004400 Inverse      0.78309923     -0.00000000
data:      0.28295493      0.00000004 Fourier     -0.00010075     -0.00001036 Inverse      0.28295493      0.00000004
data:      0.66948777      0.00000004 Fourier     -0.00000007      0.00004230 Inverse      0.66948777      0.00000004
data:      0.13180640      0.00000004 Fourier     -0.00001202      0.00000112 Inverse      0.13180640      0.00000004
data:      0.64349329      0.00000009 Fourier     -0.00004937      0.00000824 Inverse      0.64349329      0.00000009
data:      0.27752402      0.00000009 Fourier     -0.00000535      0.00001387 Inverse      0.27752402      0.00000009
data:      0.41568947      0.00000009 Fourier      0.00002695      0.00000408 Inverse      0.41568947      0.00000009