Hello,
I am trying to build a program based on fft. For the start I just want to make a simple test. Such as define 3D matrix with only 1 element non-nonzero–> forward fft–> apply a filter–> get the result. It appears that my kernel which is supposed to apply the filter to the transform does not get executed.
This is the kernel with the problem:
global void laplace(cufftComplex A,const float B, int llx,int lly,int llz)
{
// Add here the computation kernel
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int idz = blockIdx.z * blockDim.z + threadIdx.z;
int ind=idz+llz(idy+llyidx);
// int ind=idx+llx*(idy+llyidz);
if(ind<llxllyllz)
{
A[ind].x=A[ind].xB[ind]0.25f;
A[ind].y=A[ind].yB[ind]*0.25f;
}
}
and howe I call it in the program
laplace<<<grid,threads>>>(dkpsi,dcene, lx,ly,lz);.
The final result indicated nothing happens when I call the kernel.
Id there some problem with working with the cufft data types? Becasue I was able to run simple similar kernels with only real data.
Below is my whole code.
#include
#include
#include
#include “error_checks.h” // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
global void laplace(cufftComplex A,const float B, int llx,int lly,int llz)
{
// Add here the computation kernel
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int idz = blockIdx.z * blockDim.z + threadIdx.z;
int ind=idz+llz(idy+llyidx);
// int ind=idx+llx*(idy+llyidz);
if(ind<llxllyllz)
{
A[ind].x=A[ind].xB[ind]0.25f;
A[ind].y=A[ind].yB[ind]*0.25f;
}
}
global void real2complex(float f,cufftComplex fc,int llx,int lly,int llz)
{
// Add here the computation kernel
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int idz = blockIdx.z * blockDim.z + threadIdx.z;
int ind=idz+llz(idy+llyidx);
// int ind=idx+llx*(idy+llyidz);
if(ind<llxlly*llz)
{
fc[ind].x=f[ind];
fc[ind].y=0.0f;
}
}
global void complex2real(cufftComplex fc,float f, int llx,int lly,int llz)
{
// Add here the computation kernel
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int idz = blockIdx.z * blockDim.z + threadIdx.z;
int ind=idz+llz(idy+llyidx);
// int ind=idx+llx*(idy+llyidz);
if(ind<llxllyllz)
{
f[ind]=fc[ind].x/((float) llx (float) lly *(float) llz);
}
}
int main(void)
{
const int lx=16,ly=8,lz=8;
// const float dx=1.;
const float M_Pi=acos(-1);
float kx;
float hpsi[lx][ly][lz],hcene[lx][ly][lz];
float *dpsi,*dcene;
cufftComplex *dpsic;
cufftComplex *dkpsi;
cufftHandle plan;
CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(float)*lx*ly*lz) );
CUDA_CHECK( cudaMalloc((void**)&dpsic, sizeof(cufftComplex)*lx*ly*lz) );
CUDA_CHECK( cudaMalloc((void**)&dkpsi, sizeof(cufftComplex)*lx*ly*lz) );
CUDA_CHECK( cudaMalloc((void**)&dcene, sizeof(float)*lx*ly*lz) );
cufftPlan3d(&plan,lx,ly,lz,CUFFT_C2C);
dim3 grid,threads;
grid.x=lx;
grid.y=lz;
grid.z=1;
threads.x=2*ly;
threads.y=1;
threads.z=1;
for(int i=0;i<lx;i++)
{
for(int j=0;j<ly;j++)
{
for(int k=0;k<lz;k++)
{
if(i==8 && j==4 && k==2)
// if(j==4)
{hpsi[i][j][k]=1.0;}
else
{hpsi[i][j][k]=0;}
}}}
for(int i=0;i<lx;i++)
{
if(i<=lx/2)
{kx=(float) i*2.0*M_Pi/(lx);}
if(i>lx/2)
{kx=(float) (i-lx)*2.0*M_Pi/(lx); }
for(int j=0;j<ly;j++)
{
for(int k=0;k<lz;k++)
{
hcene[i][j][k]=2.0*(cos(kx)-1.0);
}}}
CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(float)*lx*ly*lz,cudaMemcpyHostToDevice) );
CUDA_CHECK( cudaMemcpy(dcene, hcene, sizeof(float)*lx*ly*lz,cudaMemcpyHostToDevice) );
real2complex<<<grid,threads>>>(dpsi,dpsic,lx,ly,lz);
cufftExecC2C(plan,dpsic,dkpsi,CUFFT_FORWARD);
laplace<<<grid,threads>>>(dkpsi,dcene, lx,ly,lz);
cufftExecC2C(plan,dkpsi,dkpsi,CUFFT_INVERSE);
complex2real<<<grid,threads>>>(dkpsi,dpsi,lx,ly,lz);
for(int i=0;i<lx;i++)
{ for(int j=0;j<ly;j++)
{ for(int k=0;k<lz;k++)
{ hpsi[i][j][k]=-5.0;}}}
CUDA_CHECK( cudaMemcpy(hpsi, dpsi, sizeof(float)*lx*ly*lz,cudaMemcpyDeviceToHost) );
CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dpsic) );
CUDA_CHECK( cudaFree((void*)dkpsi) );
CUDA_CHECK( cudaFree((void*)dcene) );
printf(" \n");
for(int i=0;i<lx;i++)
{ for(int j=0;j<ly;j++)
{ for(int k=0;k<lz;k++)
{
if(i>=7 & i<=9 & j>=3 & j<=5 & k>=1 & k<=3)
// if(i>=0 & i<lx & j==4 & k==2)
printf(“%f %d %d %d \n”, hpsi[i][j][k],i,j,k);
}}}
return 0;
}