My kernel is not executed. cufft data type problem??

Hello,

I am trying to build a program based on the FFT. To start, I just want to run a simple test: define a 3D matrix with only one non-zero element, take the forward FFT, apply a filter, and transform back to get the result. It appears that the kernel which is supposed to apply the filter to the transform never gets executed.
This is the kernel with the problem:

__global__ void laplace(cufftComplex *A, const float *B, int llx, int lly, int llz)
{
    // Add here the computation kernel
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int idz = blockIdx.z * blockDim.z + threadIdx.z;
    int ind = idz + llz * (idy + lly * idx);
    // int ind = idx + llx * (idy + lly * idz);
    if (ind < llx * lly * llz)
    {
        A[ind].x = A[ind].x * B[ind] * 0.25f;
        A[ind].y = A[ind].y * B[ind] * 0.25f;
    }
}

and this is how I call it in the program:

laplace<<<grid,threads>>>(dkpsi, dcene, lx, ly, lz);

The final result indicates that nothing happens when I call the kernel.

Is there some problem with working with the cuFFT data types? Because I was able to run similar simple kernels with only real data.

Below is my whole code.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

__global__ void laplace(cufftComplex *A, const float *B, int llx, int lly, int llz)
{
    // Add here the computation kernel
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int idz = blockIdx.z * blockDim.z + threadIdx.z;
    int ind = idz + llz * (idy + lly * idx);
    // int ind = idx + llx * (idy + lly * idz);
    if (ind < llx * lly * llz)
    {
        A[ind].x = A[ind].x * B[ind] * 0.25f;
        A[ind].y = A[ind].y * B[ind] * 0.25f;
    }
}

__global__ void real2complex(float *f, cufftComplex *fc, int llx, int lly, int llz)
{
    // Add here the computation kernel
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int idz = blockIdx.z * blockDim.z + threadIdx.z;
    int ind = idz + llz * (idy + lly * idx);
    // int ind = idx + llx * (idy + lly * idz);
    if (ind < llx * lly * llz)
    {
        fc[ind].x = f[ind];
        fc[ind].y = 0.0f;
    }
}

__global__ void complex2real(cufftComplex *fc, float *f, int llx, int lly, int llz)
{
    // Add here the computation kernel
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int idz = blockIdx.z * blockDim.z + threadIdx.z;
    int ind = idz + llz * (idy + lly * idx);
    // int ind = idx + llx * (idy + lly * idz);
    if (ind < llx * lly * llz)
    {
        f[ind] = fc[ind].x / ((float) llx * (float) lly * (float) llz);
    }
}

int main(void)
{
const int lx=16,ly=8,lz=8;
// const float dx=1.;
const float M_Pi=acos(-1);
float kx;

float hpsi[lx][ly][lz],hcene[lx][ly][lz];
float *dpsi,*dcene;
cufftComplex *dpsic;
cufftComplex *dkpsi;
cufftHandle plan; 


CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(float)*lx*ly*lz) );
CUDA_CHECK( cudaMalloc((void**)&dpsic, sizeof(cufftComplex)*lx*ly*lz) );  
CUDA_CHECK( cudaMalloc((void**)&dkpsi, sizeof(cufftComplex)*lx*ly*lz) );  
CUDA_CHECK( cudaMalloc((void**)&dcene, sizeof(float)*lx*ly*lz) );          

cufftPlan3d(&plan,lx,ly,lz,CUFFT_C2C);


dim3 grid,threads;
grid.x=lx;
grid.y=lz;
grid.z=1;
threads.x=2*ly;
threads.y=1;
threads.z=1;


for(int i=0;i<lx;i++)
{
for(int j=0;j<ly;j++)
{     
for(int k=0;k<lz;k++)
{
if(i==8 && j==4 && k==2)

// if(j==4)
{hpsi[i][j][k]=1.0;}
else
{hpsi[i][j][k]=0;}
}}}

for(int i=0;i<lx;i++)
{  
if(i<=lx/2)
{kx=(float) i*2.0*M_Pi/(lx);}
if(i>lx/2)
{kx=(float) (i-lx)*2.0*M_Pi/(lx);     } 
for(int j=0;j<ly;j++)
{                      
for(int k=0;k<lz;k++)
{    
hcene[i][j][k]=2.0*(cos(kx)-1.0);  
}}}

CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(float)*lx*ly*lz,cudaMemcpyHostToDevice) );
CUDA_CHECK( cudaMemcpy(dcene, hcene, sizeof(float)*lx*ly*lz,cudaMemcpyHostToDevice) );


real2complex<<<grid,threads>>>(dpsi,dpsic,lx,ly,lz);
cufftExecC2C(plan,dpsic,dkpsi,CUFFT_FORWARD);


laplace<<<grid,threads>>>(dkpsi,dcene, lx,ly,lz);

cufftExecC2C(plan,dkpsi,dkpsi,CUFFT_INVERSE);


complex2real<<<grid,threads>>>(dkpsi,dpsi,lx,ly,lz);

for(int i=0;i<lx;i++)
{  for(int j=0;j<ly;j++)
{     for(int k=0;k<lz;k++)
{        hpsi[i][j][k]=-5.0;}}}

CUDA_CHECK( cudaMemcpy(hpsi, dpsi, sizeof(float)*lx*ly*lz,cudaMemcpyDeviceToHost) );


CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dpsic) );
CUDA_CHECK( cudaFree((void*)dkpsi) );
CUDA_CHECK( cudaFree((void*)dcene) );

printf(" \n");
for(int i=0;i<lx;i++)
{  for(int j=0;j<ly;j++)
{     for(int k=0;k<lz;k++)
{          
       if(i>=7 & i<=9 & j>=3 & j<=5 & k>=1 & k<=3)  

// if(i>=0 & i<lx & j==4 & k==2)
printf("%f %d %d %d \n", hpsi[i][j][k],i,j,k);
}}}

return 0;

}

You probably want something like

grid.x=lz;
grid.y=1;
grid.z=1;
threads.x=lx;
threads.y=ly;
threads.z=1;

and then

int idx = threadIdx.x;
int idy = threadIdx.y;
int idz = blockIdx.x;

inside the kernel.
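
For reference, putting that suggestion together with the original kernel, a minimal sketch (an illustration, not a tested drop-in fix) could look like the following; it keeps the original index formula, which matches the k-fastest layout of the host array hpsi[lx][ly][lz]:

// Sketch only: one block per z-slice, one thread per (x,y) point.
dim3 grid(lz, 1, 1);        // lz blocks
dim3 threads(lx, ly, 1);    // lx*ly = 128 threads per block for lx=16, ly=8

__global__ void laplace(cufftComplex *A, const float *B, int llx, int lly, int llz)
{
    int idx = threadIdx.x;  // 0 .. llx-1
    int idy = threadIdx.y;  // 0 .. lly-1
    int idz = blockIdx.x;   // 0 .. llz-1
    int ind = idz + llz * (idy + lly * idx);
    if (ind < llx * lly * llz)
    {
        A[ind].x = A[ind].x * B[ind] * 0.25f;
        A[ind].y = A[ind].y * B[ind] * 0.25f;
    }
}

With this configuration every (idx, idy, idz) triple maps to a distinct ind in [0, lx*ly*lz), so the bounds check only acts as a safety net.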

Hello,

Thanks for your reply. The error was in the way I was getting the indices, and something related to the cuFFT data types…

In order to finish my code I have to calculate the array C defined as

C[i] = 0.5*A[i]*B[i] - a*A[i]^3 + b*A[i]^4, for i = 1, …, N

How should this be implemented with minimal memory usage?
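
Since that expression is purely elementwise (each C[i] depends only on A[i] and B[i]), one low-memory option is a simple one-thread-per-element kernel; if A is not needed after the update, C can even reuse A's storage so that no extra array has to be allocated. A minimal sketch, assuming A and B are device float arrays of length n and a, b are scalar coefficients (all names here are hypothetical):

// Sketch only: computes C[i] = 0.5*A[i]*B[i] - a*A[i]^3 + b*A[i]^4 in place,
// overwriting A[i] with the result to avoid allocating a separate C array.
__global__ void compute_c(float *A, const float *B, float a, float b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float x  = A[i];
        float x2 = x * x;
        A[i] = 0.5f * x * B[i] - a * x2 * x + b * x2 * x2;
    }
}

called with something like compute_c<<<(n + 255) / 256, 256>>>(dA, dB, a, b, n); where n = lx*ly*lz. If A must be preserved, write to a separate C array instead; either way no intermediate buffers are required.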