Hi everyone,
I’m doing a kernel for making the fftshift with CUDA. First I do a CUFFT 2D and then I call a kernel, this is my code:
extern “C” void
FFT_BMP(const int argc, const char** argv, uchar1 *dato_pixeles, int alto_BMP, int ancho_BMP)
{
int width=ancho_BMP;
int height=alto_BMP;
int N=width*height;
size_t size = sizeof(float)*N;
int* dato_pixeles_pre = (int*)malloc(sizeof(int) *height*width);
Complex* datoo_host = (Complex*)malloc(sizeof(Complex) * width*height);
Complex* datoi_host = (Complex*)malloc(sizeof(Complex) * width*height);
Complex* datoFFT_host = (Complex*)malloc(sizeof(Complex) * width*height);
cufftHandle plan;
cufftComplex *odata;
cufftComplex *idata;
cufftComplex *tempdata;
float *d_temp13x;
float *d_temp13y;
float *d_temp24x;
float *d_temp24y;
cudaMalloc((void**)&odata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&idata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&tempdata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&d_temp13x, size);
cudaMalloc((void**)&d_temp13y, size);
cudaMalloc((void**)&d_temp24x, size);
cudaMalloc((void**)&d_temp24y, size);
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
datoi_host[j+(i*width)].x = (float)dato_pixeles[j+(i*width)].x;
datoi_host[j+(i*height)].y = (float)0; }
}
cudaMemcpy(idata, datoi_host, sizeof(float)*width*height, cudaMemcpyHostToDevice);
cufftPlan2d(&plan, width, height, CUFFT_C2C);
cufftExecC2C(plan, idata, odata, CUFFT_FORWARD);
dim3 tamaño_bloque(512,512);
dim3 tamaño_grid(width/tamaño_bloque.x,height/tamaño_bloque.y);
fft_shift<<< tamaño_grid, tamaño_bloque >>>( odata, d_temp13x, d_temp13y, d_temp24x, d_temp24y, width, height );
cudaMemcpy(datoFFT_host, odata,sizeof(float)*width*height,cudaMemcpyDeviceToHost);
FILE *real;
FILE *imag;
real = fopen ( "real.txt", "w" );
imag = fopen ( "imag.txt", "w" );
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
fprintf(real,"%f ",datoFFT_host[j+(i*width)].x);
fprintf(imag,"%f ",datoFFT_host[j+(i*width)].y);
}
fprintf(real,"\n");
fprintf(imag,"\n");
}
fclose ( real );
fclose ( imag );
cufftExecC2C(plan, odata, tempdata, CUFFT_INVERSE);
cudaMemcpy(datoo_host, tempdata,sizeof(float)*width*height,cudaMemcpyDeviceToHost);
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
dato_pixeles_pre[j+(i*width)] = (int)(datoo_host[j+(i*width)].x);
}
}
for (unsigned int i = 0; i < height; ++i) {
for (unsigned int j = 0; j < width; ++j) {
dato_pixeles[j+(i*width)].x = (dato_pixeles_pre[j+(i*width)])/N;
}
}
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);
cudaThreadExit();
}
and kernel is this:
global void
fft_shift( cufftComplex *odata, float *d_temp13x, float *d_temp13y, float d_temp24x, float d_temp24y, int width, int height )
{
int col = blockIdx.xblockDim.x+threadIdx.x;
int fila = blockIdx.yblockDim.y+threadIdx.y;
int m2 = width/2;
int n2 = height/2;
int col2 = (blockIdx.x)*blockDim.x+threadIdx.x+m2;
int fila2 = (blockIdx.y)*blockDim.y+threadIdx.y+n2;
d_temp13x[ filawidth+col ] = odata[ filawidth+col ].x;
odata[ filawidth+col ].x = odata[ fila2width+col2 ].x;
odata[ fila2width+col2 ].x = d_temp13x[ filawidth+col ];
d_temp13y[ filawidth+col ] = odata[ filawidth+col ].y;
odata[ filawidth+col ].y = odata[ fila2width+col2 ].y;
odata[ fila2width+col2 ].y = d_temp13y[ filawidth+col ];
d_temp24x[ filawidth+col ] = odata[ fila2width+col ].x;
odata[ fila2width+col ].x = odata[ filawidth+col2 ].x;
odata[ filawidth+col2 ].x = d_temp24x[ filawidth+col ];
d_temp24y[ filawidth+col ] = odata[ fila2width+col ].y;
odata[ fila2width+col ].y = odata[ filawidth+col2 ].y;
odata[ filawidth+col2 ].y = d_temp24y[ filawidth+col ];
}
The problem is exactly in: cufftExecC2C(plan, odata, tempdata, CUFFT_INVERSE);
The program finish here.
If somebody knows what is happening, please tell me.