on k80 using centos 6.4 NVidia driver level 361.77 and cuda 8 the enclosed example yields wrong answer in slice 31. anybody have any ideas
setenv PATH /pgsdev/com/nvidia/cuda-8.0/bin:${PATH}
setenv LD_LIBRARY_PATH /pgsdev/com/nvidia/cuda-8.0/lib64
nvcc 2dtest.cu -o 2dtest -I/pgsdev/com/nvidia/cuda-8.0/include -L/pgsdev/com/nvidia/cuda-8.0/lib64 -I/pgsdev/com/nvidia/cuda-8.0/samples/common/inc -lcufft -L/pgsdev/com/nvidia/cuda-8.0/lib64/stubs -lcuda
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include
#include
#include
#include <fcntl.h>
#include <math.h>
#include <sys/types.h>
#include <unistd.h>
// CUDA includes
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda.h>
#include <cudaProfiler.h>
#include <builtin_types.h>
#include <cufft.h>
#include “/pgsdev/com/GPU/CUDA/cuda-c-tests/tti-test/complex_gpu.h”
#define FAILURE 1
#define SUCCESS 0
#define FORWARD 1
#define INVERSE -1
#define PACKETSIZE 32
void checkCudaErrors(cudaError_t result)
{
if (result)
{
fprintf(stderr, “CUDA error code = %d \n”,
static_cast(result));
cudaDeviceReset();
// Make sure we call CUDA Device Reset before exiting
exit(1);
}
}
global void fftscale( cuFloatComplex * rwave, cuFloatComplex * swave){
int ny = gridDim.x;
int nwc = gridDim.y;
int nx = blockDim.x;
int iky = blockIdx.x;
int iw = blockIdx.y;
int ele = threadIdx.x+(iky-1)*nx;
if(ele >= nxny )return;
if(iw >= nwc )return ;
int pos;
int sliceSze = nx * ny;
pos = sliceSze * iw + ele;
float scale=1.0/(float)(nxny);
// split step part
rwave[pos]=scalerwave[pos];
swave[pos]=scaleswave[pos];
}
float *** calloc3D(int nz,int nx,int ny){
float *** tmp;
size_t total = nz;
total = nx;
total = ny;
float * large = (float)calloc(total,sizeof(float));
tmp = (float**)malloc(sizeof(float**) * ny );
for(int i=0;i<ny;i++){
tmp[i] = (float**)malloc(sizeof(float*) * nx);
for(int j=0;j<nx;j++){
tmp[i][j] = large;
large += nz;
}
}
return tmp;
}
void free3D(float *** ptr, int ny){
free(&ptr[0][0][0]);
for(int i=0;i<ny;i++){
free(ptr[i]);
}
free(ptr);
}
int main(int argc, char **argv) {
cudaError_t status;
// signal array and FFT output array
float *** traces ;
// number of sampling points
int NZ = 502;
int NX = 640;
int NY = 640;
float scale = 1.0;
scale /= (float)(NZ-2);
// get CUDA devices
int numDevices;
status = cudaGetDeviceCount(&numDevices);
checkCudaErrors(status);
std::cout << "number of cuda devices " << numDevices << std::endl;
status = cudaSetDevice(numDevices-1);
std::cout << " cuda set device ok " << std::endl;
checkCudaErrors(status);
cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop,numDevices-1));
std::cout << "device number " << numDevices-1 << std::endl;
std::cout << "device name " << prop.name << std::endl;
std::cout << "global memory " << (float)prop.totalGlobalMem/1e9 << " Gb " << std::endl;
if( cuInit(0) != CUDA_SUCCESS)
std::cout << "failed to init cuda device " << std::endl;
else
std::cout << "cu init ok " << std::endl;
size_t totalMem =0;
// Allocation of traces
traces = calloc3D(NZ,NX,NY);
// Initialization of 1D tracesInput
traces[NY/2][NX/2][1] = 1.0;
traces[NY/2][NX/2][2] = 2.0;
traces[NY/2][NX/2][3] = 3.0;
traces[NY/2][NX/2][4] = 4.0;
traces[NY/2][NX/2][5] = 5.0;
traces[NY/2][NX/2][6] = 4.0;
traces[NY/2][NX/2][7] = 3.0;
traces[NY/2][NX/2][8] = 2.0;
traces[NY/2][NX/2][9] = 1.0;
std::cout << " init stuff done " << std::endl;
// Setup cuFFT.
cudaEvent_t start, stop;
float elapsedTimeInMs = 0.0f;
float gputime = 0.0f;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cufftHandle plan1d;
if(cufftCreate(&plan1d)!=CUFFT_SUCCESS)
std::cout << "plan1d handle creation failed " << std::cout;
// cufftHandle plan1dInv;
int dims[2]={NZ-2,0};
int batch=NXNY;
if(cufftPlanMany(&plan1d,1,dims,NULL,0,0,NULL,0,0,CUFFT_R2C,batch) != CUFFT_SUCCESS)
std::cout << " plan many 1d fft failed " << std::endl;
else
std::cout << " plan many 1d fft success " << std::endl;
/ if(cufftPlanMany(&plan1dInv,1,dims,NULL,0,0,NULL,0,0,CUFFT_C2R,batch) != CUFFT_SUCCESS)
std::cout << " plan many 1d fft failed " << std::endl;
else
std::cout << " plan many 1d fft success " << std::endl; */
checkCudaErrors(cudaEventRecord(start, 0));
float * traces_d=0;
size_t memSize = NXNYNZsizeof(float);
checkCudaErrors(cudaMalloc((void ) &traces_d, memSize));
std::cout << " traces alloc on device ok bytes " << memSize/1e6 << " megabytes " << std::endl;
float * tracesAlt = &traces[0][0][0];
checkCudaErrors(cudaMemcpy(traces_d, tracesAlt, memSize,
cudaMemcpyHostToDevice));
if(cufftExecR2C(plan1d,(cufftReal)traces_d,(cufftComplex)traces_d) != CUFFT_SUCCESS)
std::cout << " exec fft r2c failed " << std::endl;
else
std::cout << " exec fft r2c good " << std::endl;
/ if(cufftExecC2R(plan1dInv,(cufftComplex*)traces_d,(cufftReal*)traces_d) != CUFFT_SUCCESS)
std::cout << " exec fft c2r failed " << std::endl;
else
std::cout << " exec fft c2r good " << std::endl; */
checkCudaErrors(status);
checkCudaErrors(cudaMemcpy(tracesAlt, traces_d, memSize,
cudaMemcpyDeviceToHost));
for(int i=0;i<NZ-2;i++)traces[NY/2][NX/2][i] *= 1./(float)(NZ-2);
checkCudaErrors(cudaEventRecord(stop, 0));
// make sure GPU has finished copying
checkCudaErrors(cudaEventSynchronize(stop));
//get the total elapsed time in ms
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
std::cout << "forward 1d FFT passed in time " << elapsedTimeInMs/1000.0 << " secs " << std::endl;
gputime += elapsedTimeInMs/1000.0;
cudaFree(traces_d);
size_t totalbytes = PACKETSIZE;
totalbytes = 2NX;
totalbytes = NY;
totalbytes =sizeof(float);
float * vslice = (float)malloc(NXNYsizeof(float));
cuFloatComplex * SRCslices_d;
checkCudaErrors(cudaMalloc((void **) &SRCslices_d,(size_t)(2PACKETSIZENYNXsizeof(float))));
cuFloatComplex * RCVslices_d;
checkCudaErrors(cudaMalloc((void **) &RCVslices_d,(size_t)(2PACKETSIZENYNX*sizeof(float))));
std::cout << "past initial device copies " << std::endl;
int numPacks = (NZ/2)/PACKETSIZE;
if(numPacksPACKETSIZE<NZ/2)numPacks++;
int pos=1;
int num;
size_t pacsize = NX2*NY;
pacsize *= PACKETSIZE;
pacsize *= sizeof(float);
cuFloatComplex *** SRCslices = (cuFloatComplex **)calloc3D(2NX,NY,PACKETSIZE);
cuFloatComplex *** RCVslices = (cuFloatComplex )calloc3D(2NX,NY,PACKETSIZE);
cuFloatComplex * SRCalt = (cuFloatComplex)&SRCslices[0][0][0];
cuFloatComplex * RCValt = (cuFloatComplex)&RCVslices[0][0][0];
totalMem += pacsize*2;
std::cout << " total memory " << (float)totalMem/1e9 << " Gb " << std::endl;
float FMAX=62.5;
float deltaf=FMAX/(float)(NZ/2-1);
float dw=acos(-1.0)2.0deltaf;
float w0=dw;
scale=1.0/(float)(NX*NY);
cufftHandle plan2d;
if(cufftCreate(&plan2d)!=CUFFT_SUCCESS)
std::cout << " plan2d handle creation failed" << std::endl;
dims[0]=NX;
dims[1]=NY;
batch=PACKETSIZE;
if(cufftPlanMany(&plan2d,2,dims,NULL,0,0,NULL,0,0,CUFFT_C2C,batch) != CUFFT_SUCCESS)
std::cout << "2d plan bake failed " << std::endl;
std::cout << "2d plan baked " << std::endl;
// double elapsed;
// Perform Forward FFT
for(int packet=0; packet< 1 ; packet++ ){
num = NZ/2-2-pos+1;
if(num>PACKETSIZE)num=PACKETSIZE;
std::cout << "packet " << packet << " packet start " << pos << " packet size " << num << std::endl;
for(int k=pos;k<pos+num;k++){
// std::cout << k << std::endl;
for(int i=0;i<NY;i++)
for(int j=0;j<NX;j++){
// std::cout << k << " " << j << " " << i << std::endl;
SRCslices[k-pos][i][j] = make_cuFloatComplex(traces[i][j][2k],traces[i][j][2k+1]);
RCVslices[k-pos][i][j] = make_cuFloatComplex(traces[i][j][2k],traces[i][j][2k+1]);
if(RCVslices[k-pos][i][j].x!=0.0)
std::cout << k-pos << " " << i << " " << j << " " << RCVslices[k-pos][i][j].x << " " << RCVslices[k-pos][i][j].y << std::endl;
}
}
for(int k=pos+num;k<pos+PACKETSIZE;k++)
for(int i=0;i<NY;i++)
for(int j=0;j<NX;j++){
// std::cout << k << " " << j << " " << i << std::endl;
SRCslices[k-pos][i][j] = make_cuFloatComplex(0.0,0.0);
RCVslices[k-pos][i][j] = make_cuFloatComplex(0.0,0.0);
}
std::cout << " past packet transposes " << std::endl;
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaMemcpy(SRCslices_d, SRCalt,pacsize ,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(RCVslices_d, RCValt,pacsize ,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(stop, 0));
// make sure GPU has finished copying
checkCudaErrors(cudaEventSynchronize(stop));
//get the total elapsed time in ms
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
std::cout << " gpu time of initial packet copies " << elapsedTimeInMs/1000.0 << std::endl;
gputime += elapsedTimeInMs/1000.0;
// got about 6 KM
for(int iz=0;iz<1;iz++){
checkCudaErrors(cudaEventRecord(start, 0));
// wave number domain
if( cufftExecC2C(plan2d,SRCslices_d,SRCslices_d,CUFFT_FORWARD) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
if( cufftExecC2C(plan2d,RCVslices_d,RCVslices_d,CUFFT_FORWARD) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
dim3 grid (NY, num,1);
dim3 tBlock (NX, 1,1);
fftscale<<<grid,tBlock>>>( RCVslices_d,SRCslices_d);
// back to f-x domain
if( cufftExecC2C(plan2d,SRCslices_d,SRCslices_d,CUFFT_INVERSE) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
if( cufftExecC2C(plan2d,RCVslices_d,RCVslices_d,CUFFT_INVERSE) != CUFFT_SUCCESS)
std::cout << " cufft exec 2d c2c failed " << std::endl;
checkCudaErrors(cudaMemcpy(SRCalt, SRCslices_d,pacsize ,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(RCValt,RCVslices_d,pacsize ,
cudaMemcpyDeviceToHost));
for(int k=pos;k<pos+num;k++)
for(int i=0;i<NY;i++)
for(int j=0;j<NX;j++){
if(j==NX/2 && i==NY/2 )
std::cout << k-pos << " " << i << " " << j << " " << RCVslices[k-pos][i][j].x << " " << RCVslices[k-pos][i][j].y << std::endl;
}
checkCudaErrors(cudaEventRecord(stop, 0));
// make sure GPU has finished copying
checkCudaErrors(cudaEventSynchronize(stop));
//get the total elapsed time in ms
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
gputime += elapsedTimeInMs/1000.0;
}
pos += PACKETSIZE;
w0 += num*dw;
}
std::cout << " GPU time " << gputime << std::endl;
// std::cout << "destroy plans " << std::endl;
cufftDestroy(plan2d);
cufftDestroy(plan1d);
cudaFree(RCVslices_d);
cudaFree(SRCslices_d);
return 0;
}