I have been struggling last four days to resolve this problem but I couldn’t solve it. I really appreciate it if anyone can help me. The kernels written inside the code are working perfectly fine and outputs are matched with MATLAB.
However, the problem is coming from the last function fft_check()
where the line
checkcuFFT(cufftExecD2Z(plann, vpad, vz))
throws illegal memory access.
Note that I have another function Aj_projection_FFT()
where I also have computed the three similar FFTs but I didn’t get any error.
More interestingly, if I suppress Aj_projection_FFT()
, which is independent of the function fft_check()
, fft_check()
works fine and the complex output vz
perfectly matched with MATLAB output (first half-length).
Here is the cuda-memcheck
output
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 8
========= at 0x00000270 in __nv_static_41__28_bluestein_compute_75_cpp1_ii_d293d249
__Z17bluestein_pad_r2cId7ComplexIdEiEvT1_S2_S2_S2_S2_S2_iT_PvS4_S4_10callback_t
========= by thread (159,0,0) in block (6603,0,0)
========= Address 0x2306ec5cf8 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x28ee32]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x28f027]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2c33e5]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x19a3a5]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x19bdb2]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0xb2edf]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x3ff6b]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x3fd19]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2aed4]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2b08b]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 (cufftExecD2Z + 0x72) [0x3ad32]
========= Host Frame:./a.out [0x62b9]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0x100) [0x1ed20]
========= Host Frame:./a.out [0x3379]
=========
========= Invalid __global__ read of size 8
========= at 0x00000270 in __nv_static_41__28_bluestein_compute_75_cpp1_ii_d293d249__Z17bluestein_pad_r2cId7ComplexIdEiEvT1_S2_S2_S2_S2_S2_iT_PvS4_S4_10callback_t
========= by thread (158,0,0) in block (6603,0,0)
========= Address 0x2306ec5cf0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x28ee32]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x28f027]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2c33e5]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x19a3a5]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x19bdb2]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0xb2edf]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x3ff6b]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x3fd19]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2aed4]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 [0x2b08b]
========= Host Frame:/usr/local/cuda-10.0/lib64/libcufft.so.10.0 (cufftExecD2Z + 0x72) [0x3ad32]
========= Host Frame:./a.out [0x62b9]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0x100) [0x1ed20]
========= Host Frame:./a.out [0x3379]
--More--
Here is the code
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include <cufft.h>
const int ThreadsPerBlock = 128;
#define checkCuda(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define checkcuFFT(call) { \
cufftResult_t err; \
if ((err = (call)) != CUFFT_SUCCESS) { \
fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", err, _cudaGetErrorEnum(err), \
__FILE__, __LINE__); \
exit(1); \
} \
}
__global__ void create_x(double K, double xf, double *x, int N)
{
int id = blockIdx.x*ThreadsPerBlock + threadIdx.x;
double c = K/2.5;
double del_eps = (1.0/(N-1.0))*(asinh((xf-K)/c)-asinh(-K/c));
if(id < N)
x[id] = K + c*sinh(asinh(-K/c) + id*del_eps);
}
__global__ void create_y(double K, double yf, double *y, int M)
{
int id = blockIdx.x*ThreadsPerBlock + threadIdx.x;
double c = K/2.5;
double del_eps = (1.0/(M-1.0))*(asinh((yf-K)/c)-asinh(-K/c));
if(id < M)
y[id] = K + c*sinh(asinh(-K/c) + id*del_eps);
}
__global__ void find_min_element(double *x, double *min_element, int *mutex, int N)
{
__shared__ double lmin[ThreadsPerBlock];
int lId = threadIdx.x;
int gId = blockIdx.x*ThreadsPerBlock + lId;
double lminimum = 100.0;
if(gId > 0 && gId < N-1)
lminimum = log(x[gId+1]) - log(x[gId]);
lmin[lId] = lminimum;
if(gId == 0)
*min_element = 100.0;
__syncthreads();
int stride = ThreadsPerBlock;
while(stride > 1)
{
stride >>= 1;
if (lId < stride)
lmin[lId] = min(lmin[lId], lmin[lId + stride]);
__syncthreads();
}
if (threadIdx.x == 0)
{
while(atomicCAS(mutex,0,1) != 0); //lock
*min_element = min(*min_element, lmin[0]);
atomicExch(mutex, 0);
}
}
__global__ void Ajprojection(double delp, double delq, double gamma1, double gamma2,
double delta1, double delta2, double rho_N,
double cons2, double cons3, double cons4,
int n, int m,
double *Ajp)
{
int gId = blockIdx.x * blockDim.x + threadIdx.x; //global Id
if(gId < n)
{
int m1 = (n+1)/2; int m2 = (m+1)/2; // recalculating M1 and M2
double ext_p, ext_q;
int lId;
for(int k = 0; k<m; k++)
{
lId = k*n + gId;
ext_p = (m1 - 1 - gId)*delp;
ext_q = (m2 - 1 - k)*delq;
ext_p = (ext_p - gamma1)/delta1;
ext_q = (ext_q - gamma2)/delta2;
Ajp[lId] = cons2*cons3*exp(-(ext_p*ext_p + ext_q*ext_q - 2.0*rho_N*ext_p*ext_q)/cons4);
}
}
}
__global__ void Ajs10projection(double cons1, double delp, double gamma1, double delta1,
int n,
double *Ajps10)
{
int gId = blockIdx.x*blockDim.x + threadIdx.x; //global Id
if(gId < n)
{
int m1 = (n+1)/2; // recalculating M1
double ext_p = (m1 - 1 - gId)*delp;
ext_p = (ext_p - gamma1)/delta1;
Ajps10[gId] = cons1*exp(-0.5*ext_p*ext_p);
}
}
__global__ void Ajs20projection(double cons1, double delq, double gamma2, double delta2,
int m,
double *Ajps20)
{
int gId = blockIdx.x*blockDim.x + threadIdx.x; //global Id
if(gId < m)
{
int m2 = (m+1)/2; // recalculating M2
double ext_q = (m2 - 1 - gId)*delq;
ext_q = (ext_q - gamma2)/delta2;
Ajps20[gId] = cons1*exp(-0.5*ext_q*ext_q);
}
}
__global__ void init_u(double K, double *x, double *y, double *u, int N, int M)
{
int Rid = blockIdx.x*blockDim.x + threadIdx.x; //Row Id
int gId; //global id
double xId;
if(Rid < N)
{
xId = x[Rid];
for(int Cid = 0; Cid<M; Cid++)
{
gId = Cid*N + Rid;
u[gId] = max(K- min(xId, y[Cid]), 0.0);
}
}
}
__global__ void interp2_nonuni_to_uni(int N, int M, int Np, int Mp,
double c, double K, double stride, double del_eps, double delp, double delq,
double *x, double *y, double *u, double *v)
{
int Rid = blockIdx.x*blockDim.x + threadIdx.x;
int Cid = blockIdx.y*blockDim.y + threadIdx.y;
if(Rid == 0)
{
if(exp((-Np/2+1)*delp) < x[0] || exp((Np/2-1)*delp) > x[N-1])
printf("out of the range of x\n");
if(exp((-Mp/2+1)*delq) < y[0] || exp((Mp/2-1)*delq) > y[M-1])
printf("out of the range of y\n");
}
if(Rid < Np && Cid < Mp)
{
double ux = exp((-Np/2 + 1 + Rid)*delp);
double uy = exp((-Mp/2 + 1 + Cid)*delp);
if(Rid == Np - 1) ux = x[N-1];
if(Cid == Mp - 1) uy = y[M-1];
int nu_xid = floor((asinh((ux-K)/c)-stride)/del_eps);
int nu_yid = floor((asinh((uy-K)/c)-stride)/del_eps);
if(nu_xid == N-1)
nu_xid--;
if(nu_yid == M-1)
nu_yid--;
int nu_xxid = nu_xid + 1; int nu_yyid = nu_yid + 1;
double xv = x[nu_xid]; double xxv = x[nu_xxid];
double yv = y[nu_yid]; double yyv = y[nu_yyid];
v[Cid*Np + Rid] = 1.0/((xxv - xv)*(yyv - yv))*( (xxv - ux)*(yyv - uy)*u[nu_yid*N + nu_xid]
+ (ux - xv)*(yyv - uy)*u[nu_yid*N + nu_xxid]
+ (xxv - ux)*(uy - yv)*u[nu_yyid*N + nu_xid]
+ (ux - xv)*(uy - yv)*u[nu_yyid*N + nu_xxid]);
}
}
__global__ void padding(double* x, double *pdx, int m1)
{
int i = threadIdx.x; int j = blockIdx.x;
int blocksize = blockDim.x; int tsize = 2*m1-1;
int step = tsize/blocksize + 1; int Gsize = j*tsize;
int gsize = j*m1; int id;
for(int k = 0; k<step; k++)
{
id = k*blocksize + i;
if(id<tsize)
{
if(id<m1)
pdx[Gsize + id] = x[gsize + id];
else
pdx[Gsize + id] = 0.0;
}
}
}
void Aj_projection_FFT(double delp, double delq, double gamma1, double gamma2,
double delta1, double delta2, double rho_N, double lambda,
int n, int m,
cuDoubleComplex *Ajp_Z, cuDoubleComplex *Ajps10_Z, cuDoubleComplex *Ajps20_Z)
{
double pi = 3.141592653589793238;
double cons1 = 2.0*sqrt(1.0 - rho_N*rho_N);
double cons2 = 1.0/(delta1*delta2*pi*cons1);
double cons3 = lambda*delp*delq;
double cons4 = 2.0*(1.0 - rho_N*rho_N);
int nB = n/ThreadsPerBlock + (n %ThreadsPerBlock == 0 ? 0 : 1);
/************** Creating nonredundant data of Aj ************/
double *Ajp, *Ajps10, *Ajps20;
checkCuda(cudaMalloc((void**) &Ajp, n*m*sizeof(double)));
checkCuda(cudaMalloc((void**) &Ajps10, n*sizeof(double)));
checkCuda(cudaMalloc((void**) &Ajps20, m*sizeof(double)));
Ajprojection<<<nB, ThreadsPerBlock>>>(delp, delq, gamma1, gamma2, delta1, delta2, rho_N,
cons2, cons3, cons4, n, m,
Ajp);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
cons1 = (lambda*delp)/(delta1*sqrt(2.0*pi));
Ajs10projection<<<nB, ThreadsPerBlock>>>(cons1, delp, gamma1, delta1, n, Ajps10);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
cons2 = (lambda*delq)/(delta2*sqrt(2.0*pi));
nB = m/ThreadsPerBlock + (m %ThreadsPerBlock == 0 ? 0 : 1);
Ajs20projection<<<nB, ThreadsPerBlock>>>(cons2, delq, gamma2, delta2, m, Ajps20);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
cufftHandle plan_Ajp, plan_Ajp1, plan_Ajp2;
checkcuFFT(cufftPlan1d(&plan_Ajp, n*m, CUFFT_D2Z, 1));
checkcuFFT(cufftPlan1d(&plan_Ajp1, n, CUFFT_D2Z, 1));
checkcuFFT(cufftPlan1d(&plan_Ajp2, m, CUFFT_D2Z, 1));
checkcuFFT(cufftExecD2Z(plan_Ajp, Ajp, Ajp_Z));
checkcuFFT(cufftExecD2Z(plan_Ajp1, Ajps10, Ajps10_Z));
checkcuFFT(cufftExecD2Z(plan_Ajp2, Ajps20, Ajps20_Z));
cufftDestroy(plan_Ajp); cufftDestroy(plan_Ajp1); cufftDestroy(plan_Ajp2);
cudaFree(Ajp); cudaFree(Ajps10); cudaFree(Ajps20);
}
void fft_check(int N, int M, int Np, int Mp, int n, int m, int nm_FFT,
double c, double K, double stride, double del_eps, double delp, double delq,
double *x, double *y, double *u)
{
dim3 TpB(16, 16);
dim3 nTB(Np/TpB.x + (Np % TpB.x == 0 ? 0 : 1), Mp/TpB.y + (Mp % TpB.y == 0 ? 0 : 1));
double *v; checkCuda(cudaMalloc((void**) &v, Np*Mp*sizeof(double)));
double *vpad; checkCuda(cudaMalloc((void**) &vpad, n*Mp*sizeof(double)));
checkCuda(cudaMemset(v, 0, Np*Mp*sizeof(double)));
interp2_nonuni_to_uni<<<nTB, TpB>>>(N, M, Np, Mp,
c, K, stride, del_eps, delp, delq,
x, y, u, v);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
padding<<<Mp, ThreadsPerBlock>>>(v, vpad, Np);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
cuDoubleComplex* vz; checkCuda(cudaMalloc((void**) &vz, nm_FFT*sizeof(cuDoubleComplex)));
cufftHandle plann;
checkcuFFT(cufftPlan1d(&plann, n*m, CUFFT_D2Z, 1))
checkcuFFT(cufftExecD2Z(plann, vpad, vz))
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
cufftDestroy(plann);
}
int main()
{
checkCuda(cudaDeviceReset());
int N, M;
double K, rho2, lambda, gamma1, gamma2, delta1, delta2;
N = 128; M = 128;
K = 100.0; rho2 = -0.2; lambda = 0.6; gamma1 = -0.10;
gamma2 = 0.10; delta1 = 0.17; delta2 = 0.13;
const int Num_of_Blocks_x = N/ThreadsPerBlock + (N % ThreadsPerBlock == 0 ? 0 : 1);
const int Num_of_Blocks_y = M/ThreadsPerBlock + (M % ThreadsPerBlock == 0 ? 0 : 1);
double xf, yf; xf = 5.0*K; yf = 5.0*K;
double *x; checkCuda(cudaMalloc((void**) &x, N*sizeof(double)));
create_x<<<Num_of_Blocks_x, ThreadsPerBlock>>>(K, xf, x, N);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
double *y; checkCuda(cudaMalloc((void**) &y, M*sizeof(double)));
create_y<<<Num_of_Blocks_y, ThreadsPerBlock>>>(K, yf, y, M);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
double lxf = log(xf); double lyf = log(yf);
double *delx_min, *dely_min; int *d_mutex;
checkCuda(cudaMalloc((void**) &delx_min, sizeof(double)));
checkCuda(cudaMalloc((void**) &dely_min, sizeof(double)));
checkCuda(cudaMalloc((void**) &d_mutex, sizeof(int)));
checkCuda(cudaMemset(d_mutex, 0, sizeof(float)));
find_min_element<<<Num_of_Blocks_x, ThreadsPerBlock>>>(x, delx_min, d_mutex, N);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
find_min_element<<<Num_of_Blocks_y, ThreadsPerBlock>>>(y, dely_min, d_mutex, M);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
double lnx_min, lny_min;
checkCuda(cudaMemcpy(&lnx_min, delx_min, sizeof(double), cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(&lny_min, dely_min, sizeof(double), cudaMemcpyDeviceToHost));
int Np, Mp; Np = ceil((2*lxf)/lnx_min); Mp = ceil((2*lyf)/lny_min);
if(Np%2 != 0)
Np++;
if(Mp%2 != 0)
Mp++;
double delp, delq; delp = (2*lxf)/Np; delq = (2*lyf)/Mp;
int n = (2*Np-1); int m = (2*Mp-1); int nm = n*m;
int nm_FFT = nm/2 + 1; int n_FFT = n/2 + 1; int m_FFT = m/2 + 1;
cuDoubleComplex *Ajp_Z, *Ajps10_Z, *Ajps20_Z;
checkCuda(cudaMalloc((void**) &Ajp_Z, nm_FFT*sizeof(cuDoubleComplex)));
checkCuda(cudaMalloc((void**) &Ajps10_Z, n_FFT*sizeof(cuDoubleComplex)));
checkCuda(cudaMalloc((void**) &Ajps20_Z, m_FFT*sizeof(cuDoubleComplex)));
Aj_projection_FFT(delp, delq, gamma1, gamma2, delta1, delta2, rho2, lambda,
n, m,
Ajp_Z, Ajps10_Z, Ajps20_Z);
double c = K/2.5; double stride = asinh(-K/c);
double del_eps = (1.0/(N-1.0))*(asinh((xf-K)/c)-asinh(-K/c));
double *u; checkCuda(cudaMalloc((void**) &u, N*M*sizeof(double)));
init_u<<<Num_of_Blocks_x, ThreadsPerBlock>>>(K, x, y, u, N, M);
checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());
fft_check(N, M, Np, Mp, n, m, nm_FFT, c, K, stride, del_eps, delp, delq, x, y, u);
return 0;
}