batched 2d fft's return wrong answer in last slice on k80 using cuda 8

on k80 using centos 6.4 NVidia driver level 361.77 and cuda 8 the enclosed example yields wrong answer in slice 31. anybody have any ideas

setenv PATH /pgsdev/com/nvidia/cuda-8.0/bin:${PATH}
setenv LD_LIBRARY_PATH /pgsdev/com/nvidia/cuda-8.0/lib64
nvcc 2dtest.cu -o 2dtest -I/pgsdev/com/nvidia/cuda-8.0/include -L/pgsdev/com/nvidia/cuda-8.0/lib64 -I/pgsdev/com/nvidia/cuda-8.0/samples/common/inc -lcufft -L/pgsdev/com/nvidia/cuda-8.0/lib64/stubs -lcuda

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include
#include
#include
#include <fcntl.h>
#include <math.h>
#include <sys/types.h>
#include <unistd.h>
// CUDA includes
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda.h>
#include <cudaProfiler.h>
#include <builtin_types.h>
#include <cufft.h>
#include “/pgsdev/com/GPU/CUDA/cuda-c-tests/tti-test/complex_gpu.h”

#define FAILURE 1
#define SUCCESS 0
#define FORWARD 1
#define INVERSE -1
#define PACKETSIZE 32
void checkCudaErrors(cudaError_t result)
{
if (result)
{
fprintf(stderr, “CUDA error code = %d \n”,
static_cast(result));
cudaDeviceReset();
// Make sure we call CUDA Device Reset before exiting
exit(1);
}
}
global void fftscale( cuFloatComplex * rwave, cuFloatComplex * swave){
int ny = gridDim.x;
int nwc = gridDim.y;
int nx = blockDim.x;

int iky = blockIdx.x;
int iw = blockIdx.y;
int ele = threadIdx.x+(iky-1)*nx;

if(ele >= nxny )return;
if(iw >= nwc )return ;
int pos;
int sliceSze = nx * ny;
pos = sliceSze * iw + ele;
float scale=1.0/(float)(nx
ny);
// split step part
rwave[pos]=scalerwave[pos];
swave[pos]=scale
swave[pos];

}

float *** calloc3D(int nz,int nx,int ny){
float *** tmp;
size_t total = nz;
total = nx;
total = ny;
float * large = (float
)calloc(total,sizeof(float));
tmp = (float
**)malloc(sizeof(float**) * ny );
for(int i=0;i<ny;i++){
tmp[i] = (float**)malloc(sizeof(float*) * nx);
for(int j=0;j<nx;j++){
tmp[i][j] = large;
large += nz;
}
}
return tmp;
}
void free3D(float *** ptr, int ny){
free(&ptr[0][0][0]);
for(int i=0;i<ny;i++){
free(ptr[i]);
}
free(ptr);
}

int main(int argc, char **argv) {
cudaError_t status;
// signal array and FFT output array
float *** traces ;

// number of sampling points
int NZ = 502;
int NX = 640;
int NY = 640;
float scale = 1.0;
scale /= (float)(NZ-2);
// get CUDA devices
int numDevices;
status = cudaGetDeviceCount(&numDevices);
checkCudaErrors(status);
std::cout << "number of cuda devices " << numDevices << std::endl;
status = cudaSetDevice(numDevices-1);
std::cout << " cuda set device ok " << std::endl;
checkCudaErrors(status);
cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop,numDevices-1));
std::cout << "device number " << numDevices-1 << std::endl;
std::cout << "device name " << prop.name << std::endl;
std::cout << "global memory " << (float)prop.totalGlobalMem/1e9 << " Gb " << std::endl;
if( cuInit(0) != CUDA_SUCCESS)
std::cout << "failed to init cuda device " << std::endl;
else
std::cout << "cu init ok " << std::endl;

size_t totalMem =0;

// Allocation of traces
traces = calloc3D(NZ,NX,NY);

// Initialization of 1D tracesInput
traces[NY/2][NX/2][1] = 1.0;
traces[NY/2][NX/2][2] = 2.0;
traces[NY/2][NX/2][3] = 3.0;
traces[NY/2][NX/2][4] = 4.0;
traces[NY/2][NX/2][5] = 5.0;
traces[NY/2][NX/2][6] = 4.0;
traces[NY/2][NX/2][7] = 3.0;
traces[NY/2][NX/2][8] = 2.0;
traces[NY/2][NX/2][9] = 1.0;
std::cout << " init stuff done " << std::endl;

// Setup cuFFT.
cudaEvent_t start, stop;
float elapsedTimeInMs = 0.0f;
float gputime = 0.0f;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cufftHandle plan1d;
if(cufftCreate(&plan1d)!=CUFFT_SUCCESS)
std::cout << "plan1d handle creation failed " << std::cout;
// cufftHandle plan1dInv;
int dims[2]={NZ-2,0};
int batch=NXNY;
if(cufftPlanMany(&plan1d,1,dims,NULL,0,0,NULL,0,0,CUFFT_R2C,batch) != CUFFT_SUCCESS)
std::cout << " plan many 1d fft failed " << std::endl;
else
std::cout << " plan many 1d fft success " << std::endl;
/
if(cufftPlanMany(&plan1dInv,1,dims,NULL,0,0,NULL,0,0,CUFFT_C2R,batch) != CUFFT_SUCCESS)
std::cout << " plan many 1d fft failed " << std::endl;
else
std::cout << " plan many 1d fft success " << std::endl; */

checkCudaErrors(cudaEventRecord(start, 0));
float * traces_d=0;
size_t memSize = NXNYNZsizeof(float);
checkCudaErrors(cudaMalloc((void ) &traces_d, memSize));
std::cout << " traces alloc on device ok bytes " << memSize/1e6 << " megabytes " << std::endl;
float * tracesAlt = &traces[0][0][0];
checkCudaErrors(cudaMemcpy(traces_d, tracesAlt, memSize,
cudaMemcpyHostToDevice));
if(cufftExecR2C(plan1d,(cufftReal
)traces_d,(cufftComplex
)traces_d) != CUFFT_SUCCESS)
std::cout << " exec fft r2c failed " << std::endl;
else
std::cout << " exec fft r2c good " << std::endl;
/
if(cufftExecC2R(plan1dInv,(cufftComplex*)traces_d,(cufftReal*)traces_d) != CUFFT_SUCCESS)
std::cout << " exec fft c2r failed " << std::endl;
else
std::cout << " exec fft c2r good " << std::endl; */

checkCudaErrors(status);
checkCudaErrors(cudaMemcpy(tracesAlt, traces_d, memSize,
cudaMemcpyDeviceToHost));
for(int i=0;i<NZ-2;i++)traces[NY/2][NX/2][i] *= 1./(float)(NZ-2);
checkCudaErrors(cudaEventRecord(stop, 0));

// make sure GPU has finished copying
checkCudaErrors(cudaEventSynchronize(stop));
//get the total elapsed time in ms
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
std::cout << "forward 1d FFT passed in time " << elapsedTimeInMs/1000.0 << " secs " << std::endl;
gputime += elapsedTimeInMs/1000.0;
cudaFree(traces_d);

size_t totalbytes = PACKETSIZE;
totalbytes = 2NX;
totalbytes = NY;
totalbytes =sizeof(float);
float * vslice = (float
)malloc(NX
NYsizeof(float));
cuFloatComplex * SRCslices_d;
checkCudaErrors(cudaMalloc((void **) &SRCslices_d,(size_t)(2
PACKETSIZENYNXsizeof(float))));
cuFloatComplex * RCVslices_d;
checkCudaErrors(cudaMalloc((void **) &RCVslices_d,(size_t)(2
PACKETSIZENYNX*sizeof(float))));

std::cout << "past initial device copies " << std::endl;

int numPacks = (NZ/2)/PACKETSIZE;
if(numPacksPACKETSIZE<NZ/2)numPacks++;
int pos=1;
int num;
size_t pacsize = NX
2*NY;
pacsize *= PACKETSIZE;
pacsize *= sizeof(float);
cuFloatComplex *** SRCslices = (cuFloatComplex **)calloc3D(2NX,NY,PACKETSIZE);
cuFloatComplex *** RCVslices = (cuFloatComplex )calloc3D(2NX,NY,PACKETSIZE);
cuFloatComplex * SRCalt = (cuFloatComplex
)&SRCslices[0][0][0];
cuFloatComplex * RCValt = (cuFloatComplex
)&RCVslices[0][0][0];

totalMem += pacsize*2;
std::cout << " total memory " << (float)totalMem/1e9 << " Gb " << std::endl;
float FMAX=62.5;
float deltaf=FMAX/(float)(NZ/2-1);
float dw=acos(-1.0)2.0deltaf;
float w0=dw;

scale=1.0/(float)(NX*NY);

cufftHandle plan2d;
if(cufftCreate(&plan2d)!=CUFFT_SUCCESS)
std::cout << " plan2d handle creation failed" << std::endl;
dims[0]=NX;
dims[1]=NY;
batch=PACKETSIZE;
if(cufftPlanMany(&plan2d,2,dims,NULL,0,0,NULL,0,0,CUFFT_C2C,batch) != CUFFT_SUCCESS)
std::cout << "2d plan bake failed " << std::endl;

std::cout << "2d plan baked " << std::endl;
// double elapsed;
// Perform Forward FFT
for(int packet=0; packet< 1 ; packet++ ){
num = NZ/2-2-pos+1;
if(num>PACKETSIZE)num=PACKETSIZE;
std::cout << "packet " << packet << " packet start " << pos << " packet size " << num << std::endl;
for(int k=pos;k<pos+num;k++){
// std::cout << k << std::endl;
for(int i=0;i<NY;i++)
for(int j=0;j<NX;j++){
// std::cout << k << " " << j << " " << i << std::endl;
SRCslices[k-pos][i][j] = make_cuFloatComplex(traces[i][j][2k],traces[i][j][2k+1]);
RCVslices[k-pos][i][j] = make_cuFloatComplex(traces[i][j][2k],traces[i][j][2k+1]);
if(RCVslices[k-pos][i][j].x!=0.0)
std::cout << k-pos << " " << i << " " << j << " " << RCVslices[k-pos][i][j].x << " " << RCVslices[k-pos][i][j].y << std::endl;
}
}
for(int k=pos+num;k<pos+PACKETSIZE;k++)
for(int i=0;i<NY;i++)
for(int j=0;j<NX;j++){
// std::cout << k << " " << j << " " << i << std::endl;
SRCslices[k-pos][i][j] = make_cuFloatComplex(0.0,0.0);
RCVslices[k-pos][i][j] = make_cuFloatComplex(0.0,0.0);
}
std::cout << " past packet transposes " << std::endl;
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaMemcpy(SRCslices_d, SRCalt,pacsize ,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(RCVslices_d, RCValt,pacsize ,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(stop, 0));
// make sure GPU has finished copying
checkCudaErrors(cudaEventSynchronize(stop));
//get the total elapsed time in ms
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
std::cout << " gpu time of initial packet copies " << elapsedTimeInMs/1000.0 << std::endl;
gputime += elapsedTimeInMs/1000.0;
// got about 6 KM
for(int iz=0;iz<1;iz++){
checkCudaErrors(cudaEventRecord(start, 0));
// wave number domain
if( cufftExecC2C(plan2d,SRCslices_d,SRCslices_d,CUFFT_FORWARD) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
if( cufftExecC2C(plan2d,RCVslices_d,RCVslices_d,CUFFT_FORWARD) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;

  dim3 grid (NY, num,1);
  dim3 tBlock (NX, 1,1);

  fftscale<<<grid,tBlock>>>( RCVslices_d,SRCslices_d);
  // back to f-x domain
  if( cufftExecC2C(plan2d,SRCslices_d,SRCslices_d,CUFFT_INVERSE) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
  if( cufftExecC2C(plan2d,RCVslices_d,RCVslices_d,CUFFT_INVERSE)  != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
  checkCudaErrors(cudaMemcpy(SRCalt, SRCslices_d,pacsize ,
  				 cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(RCValt,RCVslices_d,pacsize ,
  				 cudaMemcpyDeviceToHost));
  for(int k=pos;k<pos+num;k++)
for(int i=0;i<NY;i++)
  for(int j=0;j<NX;j++){
    if(j==NX/2 && i==NY/2 )
    std::cout << k-pos << " " << i << " " << j << " " << RCVslices[k-pos][i][j].x << " " << RCVslices[k-pos][i][j].y  << std::endl;
  }
  checkCudaErrors(cudaEventRecord(stop, 0));
  // make sure GPU has finished copying
  checkCudaErrors(cudaEventSynchronize(stop));
  //get the total elapsed time in ms
  checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
  gputime += elapsedTimeInMs/1000.0;

}
pos += PACKETSIZE;
w0 += num*dw;

}
std::cout << " GPU time " << gputime << std::endl;
// std::cout << "destroy plans " << std::endl;

cufftDestroy(plan2d);
cufftDestroy(plan1d);
cudaFree(RCVslices_d);
cudaFree(SRCslices_d);
return 0;
}

probably what’s needed is a complete example that someone else could compile. and remove anything that is not necessary to show the issue. And define the actual issue, i.e. show the actual data produced and how you are determining it is in error.

Hi ,

Ran it through cuda-memcheck. It turns out that I should not have iky-1 but just iky in the fftscale. Once I did that the problem is fixed.