fftshift with a kernel An error ocurre

Hi everyone,

I’m doing a kernel for making the fftshift with CUDA. First I do a CUFFT 2D and then I call a kernel, this is my code:

extern “C” void
FFT_BMP(const int argc, const char** argv, uchar1 *dato_pixeles, int alto_BMP, int ancho_BMP)
{

int width=ancho_BMP;
int height=alto_BMP;
int N=width*height;

size_t size = sizeof(float)*N;

int* dato_pixeles_pre = (int*)malloc(sizeof(int) *height*width);

Complex* datoo_host = (Complex*)malloc(sizeof(Complex) * width*height);
Complex* datoi_host = (Complex*)malloc(sizeof(Complex) * width*height);
Complex* datoFFT_host = (Complex*)malloc(sizeof(Complex) * width*height);	

cufftHandle plan;
cufftComplex *odata;
cufftComplex *idata;
cufftComplex *tempdata;
float *d_temp13x;  
float *d_temp13y;
float *d_temp24x;
float *d_temp24y;

cudaMalloc((void**)&odata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&idata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&tempdata, sizeof(cufftComplex)*width*height);
cudaMalloc((void**)&d_temp13x, size);
cudaMalloc((void**)&d_temp13y, size);
cudaMalloc((void**)&d_temp24x, size);
cudaMalloc((void**)&d_temp24y, size);

for (unsigned int i = 0; i < height; ++i) {  
	for (unsigned int j = 0; j < width; ++j) {   
		datoi_host[j+(i*width)].x = (float)dato_pixeles[j+(i*width)].x; 
	               datoi_host[j+(i*height)].y = (float)0; 		}
}

cudaMemcpy(idata, datoi_host, sizeof(float)*width*height, cudaMemcpyHostToDevice);

cufftPlan2d(&plan, width, height, CUFFT_C2C);

cufftExecC2C(plan, idata, odata, CUFFT_FORWARD);

dim3 tamaño_bloque(512,512);
dim3 tamaño_grid(width/tamaño_bloque.x,height/tamaño_bloque.y);

fft_shift<<< tamaño_grid, tamaño_bloque >>>( odata, d_temp13x, d_temp13y, d_temp24x, d_temp24y, width, height );


cudaMemcpy(datoFFT_host, odata,sizeof(float)*width*height,cudaMemcpyDeviceToHost);

FILE *real;  	
FILE *imag; 

real = fopen ( "real.txt", "w" );  
imag = fopen ( "imag.txt", "w" );  


for (unsigned int i = 0; i < height; ++i) {  
	for (unsigned int j = 0; j < width; ++j) {   
		fprintf(real,"%f ",datoFFT_host[j+(i*width)].x);
		fprintf(imag,"%f ",datoFFT_host[j+(i*width)].y);

		}
	fprintf(real,"\n"); 
	fprintf(imag,"\n");
}

fclose ( real );   
fclose ( imag );   

cufftExecC2C(plan, odata, tempdata, CUFFT_INVERSE);

cudaMemcpy(datoo_host, tempdata,sizeof(float)*width*height,cudaMemcpyDeviceToHost);

for (unsigned int i = 0; i < height; ++i) {   
	for (unsigned int j = 0; j < width; ++j) {   

		dato_pixeles_pre[j+(i*width)] = (int)(datoo_host[j+(i*width)].x);
	}
}

  for (unsigned int i = 0; i < height; ++i) {   
	for (unsigned int j = 0; j < width; ++j) {   

		dato_pixeles[j+(i*width)].x = (dato_pixeles_pre[j+(i*width)])/N;
	}
}


cufftDestroy(plan);

cudaFree(idata); 
cudaFree(odata);

cudaThreadExit();

}

and kernel is this:

global void
fft_shift( cufftComplex *odata, float *d_temp13x, float *d_temp13y, float d_temp24x, float d_temp24y, int width, int height )
{
int col = blockIdx.x
blockDim.x+threadIdx.x;
int fila = blockIdx.y
blockDim.y+threadIdx.y;

int m2 = width/2;
int n2 = height/2;

int col2 = (blockIdx.x)*blockDim.x+threadIdx.x+m2;
int fila2 = (blockIdx.y)*blockDim.y+threadIdx.y+n2;

d_temp13x[ filawidth+col ] = odata[ filawidth+col ].x;
odata[ filawidth+col ].x = odata[ fila2width+col2 ].x;
odata[ fila2width+col2 ].x = d_temp13x[ filawidth+col ];

d_temp13y[ filawidth+col ] = odata[ filawidth+col ].y;
odata[ filawidth+col ].y = odata[ fila2width+col2 ].y;
odata[ fila2width+col2 ].y = d_temp13y[ filawidth+col ];

d_temp24x[ filawidth+col ] = odata[ fila2width+col ].x;
odata[ fila2width+col ].x = odata[ filawidth+col2 ].x;
odata[ filawidth+col2 ].x = d_temp24x[ filawidth+col ];

d_temp24y[ filawidth+col ] = odata[ fila2width+col ].y;
odata[ fila2width+col ].y = odata[ filawidth+col2 ].y;
odata[ filawidth+col2 ].y = d_temp24y[ filawidth+col ];

}

The problem is exactly in: cufftExecC2C(plan, odata, tempdata, CUFFT_INVERSE);
The program finish here.

If somebody knows what is happening, please tell me.