plz help me! cufft error 'code=6(CUFFT_EXEC_FAILED)'

Hi,
I just implement hilbert transform using cufft.

When I just tested with small data(width=16, height=8, total 128 elements), it worked well.
However, it doesn’t work when I used such a big data(width=2400, height=1024).
The error occurred Line #111 attached source code.

I attach the source code, plz help me!

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iomanip>


// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <windows.h>
#include "device_launch_parameters.h"


#define WIDTH 2400
#define HEIGHT 1024
#define INPUTSIZE WIDTH*HEIGHT
#define TILE_WIDTH 256

void File_Read(FILE *fp, cufftComplex* input_value);
void Generate_Pointwise_Coeff(int* pointwise_coeff);
__global__ void point_wise_product(cufftComplex *a, int *b, int numElements);



int main(int argc, char **argv)
{
	FILE *fp=NULL;	

	int complexSize = sizeof(cufftComplex) * INPUTSIZE;
	int normalSize = sizeof(int) * INPUTSIZE;
	int* h_pointwise_coeff = new int[INPUTSIZE];
	
	
	// Allocate the host memory set
	cufftComplex* h_input_value = new cufftComplex[INPUTSIZE];
	cufftComplex* MID_result = (cufftComplex *)malloc(complexSize);
	cufftComplex* Hilbert_result = (cufftComplex *)malloc(complexSize);

	//Read the input signal file(it should be the real time signal)
	File_Read(fp, h_input_value);
	//Pointwise-product
	Generate_Pointwise_Coeff(h_pointwise_coeff);

	printf("\n===============================================================\n");
	printf("===================  Pointwise coeff Result  ===================\n");	
	printf("================================================================\n");
//	for(int i=0; i<INPUTSIZE; i++)
//		printf("index %d: %d \n", i+1, h_pointwise_coeff[i]);
//	printf("\n");

	// Allocate the device memory set
	cufftComplex* d_input_value;
	int *d_pointwise_coeff;

	checkCudaErrors(cudaMalloc((void**)&d_input_value, complexSize));	
	checkCudaErrors(cudaMalloc((void**)&d_pointwise_coeff, normalSize));

	//Copy input value & pointwise coeff host memory to device
	checkCudaErrors(cudaMemcpy(d_input_value, h_input_value, complexSize, 
												cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_pointwise_coeff, h_pointwise_coeff, normalSize,
													cudaMemcpyHostToDevice));


	//cufft plan 
	cufftHandle plan;	
	checkCudaErrors(cufftPlan1d(&plan, INPUTSIZE, CUFFT_C2C, 1));
	
	
	//FFT the input signal
	printf("====================================================\n");
	printf("==========      FFT signal cufftexecR2C   ==========\n");
	printf("====================================================\n");
	checkCudaErrors(cufftExecC2C(plan, d_input_value, d_input_value, CUFFT_FORWARD));
	
	

	printf("\n====================================================\n");
	printf("===================  FFT Result  ===================\n");	
	printf("====================================================\n");
	
	


	printf("\n=====================================================\n");
	printf("=======Launching ComplexPointwiseAndScale<<< >>>======\n");
	printf("======================================================\n");

	
	dim3 dimGrids((WIDTH)/TILE_WIDTH , (HEIGHT)/TILE_WIDTH, 1);
	dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH, 1);
	
	point_wise_product<<<((WIDTH)/TILE_WIDTH)*((HEIGHT)/TILE_WIDTH) ,(TILE_WIDTH*TILE_WIDTH)>>>(d_input_value, d_pointwise_coeff,(int)INPUTSIZE);

	printf("\n=================================================================\n");
	printf("===================  PointWise-Product result  ===================\n");	
	printf("==================================================================\n");
    
		
	


	printf("\n====================================================\n");
	printf("===========    IFFT signal cufftexecR2C   ===========\n");
	printf("=====================================================\n");
	//Inverse FFT cufftHandle plan	
	checkCudaErrors(cufftExecC2C(plan, d_input_value, d_input_value, 1));



	//Copy Final result memory to host
	checkCudaErrors(cudaMemcpy(Hilbert_result, d_input_value, complexSize, cudaMemcpyDeviceToHost));

	
	printf("\n====================================================================\n");
	printf("===================  Result of Hilbert Transform  ===================\n");	
	printf("=====================================================================\n");
	
	for(unsigned int i = 0; i<INPUTSIZE; i++)
	{
		printf("Index %d:  Real-> %.2f,  imagi-> %.2f \n", i+1, Hilbert_result[i].x/((float)INPUTSIZE), Hilbert_result[i].y/((float)INPUTSIZE));
	}

	cufftDestroy(plan);
	free(h_pointwise_coeff);
	free(h_input_value);
	free(MID_result);
	free(Hilbert_result);
	cudaFree(d_input_value);
	cudaFree(d_pointwise_coeff);
	

	return 0;
}


void File_Read(FILE *fp, cufftComplex* input_value)
{
	fp=NULL;
	int i,j;
	float temp;	
	int offset=0;

	if((fp=fopen("input_full.dat","r"))==NULL)
	{
		fprintf(stderr,"Cannot open the file\n");
		exit(1);
	}

	for(j=0;j<WIDTH*HEIGHT;j++){
			fscanf(fp, "%f", &temp);			
			input_value[j].x = temp;
			input_value[j].y = 0;
	}	
	printf("============================================\n");
	printf("================ Input Data ================\n");
	printf("============================================\n");
	
}

void Generate_Pointwise_Coeff(int* pointwise_coeff)
{
	pointwise_coeff[0] = 1;
	pointwise_coeff[INPUTSIZE/2] = 1;

	for(unsigned int i = 0; i < INPUTSIZE/2 -1; i++)
	{
		pointwise_coeff[i+1] = 2;
		pointwise_coeff[INPUTSIZE/2 + 1 + i] = 0;
	}

}


__global__ void point_wise_product(cufftComplex *a, int *b, int numElements){

	
	int i = blockDim.x * blockIdx.x + threadIdx.x;	

	
		if(i < WIDTH*HEIGHT){		
		
		a[i].x = a[i].x * b[i];
		a[i].y = a[i].y * b[i];
	}
}

You’re not doing proper error checking at all.

CUFFT calls are not CUDA calls, and so it is not correct to use checkCudaErrors on a CUFFT call.
Furthermore, you are not doing proper cuda error checking on the kernel you are launching.

Your point_wise_product kernel launch is invalid and would not work for any GPU. You are requesting a threadblock size (number of threads per block) of TILE_WIDTHTILE_WIDTH. Since you’ve defined TILE_WIDTH to be 256, you are requesting 256256 threads per block, or 65536 threads per block.

This is not legal in CUDA.

In the future, if you want help, I suggest you demonstrate proper error checking throughout your code.

Since you are running on windows, you may also be running into a windows WDDM timeout when you try to run large data sets.