Getting Segmentation fault for higher sized input arrays for 2d CuFFT application.

Hii,
I was trying to develop a CUDA (with C) code for finding 2d fft of any input matrix. The problem that i am facing is the code is running well for smaller sized input like X[25][25] but as i am increasing the size and reaching a size of even X[1000][1000] , it is producing ‘Segmentation Fault’ on my terminal screen. I have checked the whole code several times but i am not able to find the bug. I thought that it could be a memory allocation or pointer dereferencing issue but i have checked it by alternate methods that these are not the issues. Through MATLAB, i checked that i am able to generate a matrix of size 1000x1000 (even 4096x4096 too) on GPU.
Herein, i am posting my code. I request you to please look into the code once and help me with the relevant solution.
Thanking you.

#define nx 750
#define ny 750
main(int argc, char** argv)
{
clock_t start;
double diff;
start = clock();
FILE* input;
FILE* output;

    cufftComplex h_signal[nx][ny];
cufftComplex h_out[nx][ny];
cufftHandle plan;	

int c=0;
for (int i = 0; i < nx; i++)
	{
	for(int j=0; j < ny; j++)
	{
    		h_signal[i][j].x = double(c)+1.0;
    		h_signal[i][j].y = double(c)+1.0;
		c++;
		}
}

//Device Arrays
cufftComplex *d_signal;
if(cudaSuccess != cudaMalloc(&d_signal,sizeof(cufftComplex)*nx*ny))
{ printf("Malloc for d_signal failed");}
if (cudaSuccess!=

cudaMemcpy(d_signal,h_signal,sizeof(cufftComplex)nxny,cudaMemcpyHostToDevice))
{ printf(“memcpy for h_signal to d_signal failed”);}

cufftComplex* d_out;

long int size_el=nx*ny*sizeof(cufftComplex);
if(cudaSuccess != cudaMalloc(&d_out,size_el))
{ printf("malloc for d_out failed");}

		
//Perform the cufft

cufftResult _cp = cufftPlan2d(&plan,ny,nx,CUFFT_C2C);
if(CUFFT_SUCCESS != _cp)
{ printf("CUFFT error: Plan creation failed,error code=%d",_cp);}	
cufftResult _ce = cufftExecC2C(plan,(cufftComplex *)d_signal,(cufftComplex *)d_out,CUFFT_FORWARD);
if(CUFFT_SUCCESS != _ce)
{ printf("CUFFT error: Plan execution failed,error code=%d",_ce);}
if(cudaSuccess != cudaThreadSynchronize())
{ printf("Failed to synchronize\n");}
cufftDestroy(plan);


if(cudaSuccess != cudaMemcpy(h_out,d_out,size_el,cudaMemcpyDeviceToHost))
{ printf("Memcpy of d_out to h_out failed");}

    input=fopen("Input2D.txt","w");

for (unsigned int i = 0; i < nx; i++)
	{
	for(unsigned int j=0; j < ny; j++)
	{        	
		fprintf(input,"%f+i*%f\t",h_signal[i][j].x,h_signal[i][j].y);
   		}
	fprintf(input,"\n");
	}
fclose(input);

output=fopen("Output2D.txt","w");

for (unsigned int i = 0; i < nx; i++)
	{
	for(unsigned int j=0; j < ny; j++)
	{        	
		fprintf(output,"%f+i*%f\t",h_out[i][j].x,h_out[i][j].y);
   		}
	fprintf(output,"\n");
	}
fclose(output);


cudaFree(d_signal);
//free(h_signal);
cudaFree(d_out);
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
    printf("Time Elapsed:%f",diff);
	printf("\n");
return 0;	

}

These create stack variables, which have limits as to size:

cufftComplex h_signal[nx][ny];
cufftComplex h_out[nx][ny];

When you make stack variables that are too large, and try to use them, you will corrupt the stack, leading to all sorts of odd program behavior.

You’ll need to re-cast your problem using dynamically allocated arrays (e.g. with malloc, or new)

When you do so, creating convenient 2D (i.e. doubly-subscripted) arrays will become somewhat harder, but if you know the dimensions (i.e. width) of your arrays at compile-time, you can use the compiler to help make it easy with some carefully chosen typedefs.

Something like this:

typedef cufftComplex my_array[ny];
my_array *h_signal = (my_array *)malloc(nx*sizeof(my_array));
my_array *h_out = (my_array *)malloc(nx*sizeof(my_array));

By the way, this error of yours has nothing to do with CUDA.

A few other comments about posting things. When you have long code blocks to paste into a posting, please mark the sections using the button provided at the top of the edit pane.

Also, don’t strip off the #include statements at the beginning of your program. This makes it harder for someone else to help you. Post a complete compilable program.

Alright sir. Thank you very much for your extremely useful comments and suggestions.
From now on, i will take care of everything and will definitely follow these trends while posting my problems.
Thanks again.

Sir,
I tried to rewrite the same code with some changes as per your instructions and the code is working well for any sized input matrix for 2d FFT but when i am trying to write a similar program for calculating 1D FFT then except for single row and column vector it is not working for any other square or rectangular matrix. The results obtained are very much different from the MATLAB fft() result, when operated over the similar matrix there. Here is the code:

#include"stdio.h"
#include"cuda_runtime.h"
#include"cufft.h"
#include"complex"
#include"cuda.h"
#include"ctime"

#define nx 5
#define ny 50
main(int argc, char** argv)
{
	clock_t start;
    	double diff;
	long int batch,N;
    	start = clock();
	FILE* input;
	FILE* output;
	typedef cufftComplex my_array[ny];
	if(nx==1)
        {batch=1;N=ny;}
	else
 	{batch=ny;N=nx;}
        my_array *h_signal = (my_array *)malloc(nx*sizeof(my_array));
        my_array *h_out = (my_array *)malloc(nx*sizeof(my_array));
	//cufftComplex *h_signal = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);		
	//cufftComplex *h_out = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);		
	cufftHandle plan;	

	int c=0;
	for (int i = 0; i < nx; i++)
    	{
		for(int j=0; j < ny; j++)
		{
        		h_signal[i][j].x = double(c)+1.0;
        		h_signal[i][j].y = double(c)+1.0;
			c++;
    		}
	}
	
	//Device Arrays
	my_array *d_signal;
	if(cudaSuccess != cudaMalloc(&d_signal,nx*sizeof(my_array)))
	{ printf("Malloc for d_signal failed");}
	if (cudaSuccess != cudaMemcpy(d_signal,h_signal,nx*sizeof(my_array),cudaMemcpyHostToDevice))
	{ printf("memcpy for h_signal to d_signal failed");}
	
	my_array *d_out;

	long int size_el=nx*sizeof(my_array);
	if(cudaSuccess != cudaMalloc(&d_out,size_el))
	{ printf("malloc for d_out failed");}

			
	//Perform the cufft
	
	cufftResult _cp = cufftPlan1d(&plan,N,CUFFT_C2C,batch);
	if(CUFFT_SUCCESS != _cp)
	{ printf("CUFFT error: Plan creation failed,error code=%d",_cp);}	
	cufftResult _ce = cufftExecC2C(plan,(cufftComplex *)d_signal,(cufftComplex *)d_out,CUFFT_FORWARD);
	if(CUFFT_SUCCESS != _ce)
	{ printf("CUFFT error: Plan execution failed,error code=%d",_ce);}
	if(cudaSuccess != cudaThreadSynchronize())
	{ printf("Failed to synchronize\n");}
	cufftDestroy(plan);
	
	
	if(cudaSuccess != cudaMemcpy(h_out,d_out,size_el,cudaMemcpyDeviceToHost))
	{ printf("Memcpy of d_out to h_out failed");}

        input=fopen("Input1D.txt","w");

	for (unsigned int i = 0; i < nx; i++)
    	{
		for(unsigned int j=0; j < ny; j++)
		{        	
			fprintf(input,"%f+i*%f\t",h_signal[i][j].x,h_signal[i][j].y);
       		}
		fprintf(input,"\n");
    	}
	fclose(input);
	
	output=fopen("Output1D.txt","w");

	for (unsigned int i = 0; i < nx; i++)
    	{
		for(unsigned int j=0; j < ny; j++)
		{        	
			fprintf(output,"%f+i*%f\t",h_out[i][j].x,h_out[i][j].y);
       		}
		fprintf(output,"\n");
    	}
	fclose(output); 
	
	
	cudaFree(d_signal);
	free(h_signal);
	cudaFree(d_out);
	diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
        printf("Time Elapsed:%f",diff);
    	printf("\n");
	return 0;	
}

Please help me with the necessary changes required.

You mixed up your array layout:
Your array has the layout h_signal[x*Ny+y], i.e. y elements are next to each other in memory. If you want to do a batch FFT you need to do Nx 1D FFTs of rows.

The error is in line 22: replace the line by

{batch=nx;N=ny;}

Yeah, according to my data layout it should be like this only and i am sorry for that but i tried using this also and output is still the same as the previous one.
Please comment!!

I used the code like this and I verified the result with Matlab. For me it works.

I am also running the code after making those changes but the results are not matching with those of the MATLAB.
Could you please paste your working code here…please!!

I did not change anything except the one line.

Now let’s verify an example: I set nx=50 and ny=5 (thats the second change I did in your code). That means we have a 50x5 input matrix and we are doing 50 1D fourier transformations of a vector of size 5.

If you have the whole matrix “Input” in Matlab then you get the first vector to transform by

Input(1,:)
ans = 1.0000 + 1.0000i   2.0000 + 2.0000i   3.0000 + 3.0000i   4.0000 + 4.0000i   5.0000 + 5.0000i

Applying the fft yields

fft(Input(1,:))
ans = 15.0000 +15.0000i  -5.9410 + 0.9410i  -3.3123 - 1.6877i  -1.6877 - 3.3123i   0.9410 - 5.9410i

And this is exactly the first line of the “Output1D.txt” file from your code.

I also verified the nx=5 and ny=50 example. Note that you now transform a 5x50 matrix which is NOT the transpose of the 50x5 matrix!

Okay. Now i got it. You are performing fft on each row of input matrix in MATLAB and then checking it with the result obtained from our CUDA code.
I have also verified the result in that sense now but i want to get the exact matrix as that obtained from MATLAB when we use this command :

>> y= fft(x);

where x is the 50 x 5 complex input matrix.
I tried to do this by changing typedef definition and making all other necessary changes but it is getting aborted abnormally. Here is my attempt:

#include"stdio.h"
#include"cuda_runtime.h"
#include"cufft.h"
#include"complex"
#include"cuda.h"
#include"ctime"

#define nx 50
#define ny 5
main(int argc, char** argv)
{
	clock_t start;
    	double diff;
	long int batch,N;
    	start = clock();
	FILE* input;
	FILE* output;
	typedef cufftComplex my_array[nx];
	if(nx==1)
        {batch=1;N=ny;}
	else
 	{batch=ny;N=nx;}
        my_array *h_signal = (my_array *)malloc(ny*sizeof(my_array));
        my_array *h_out = (my_array *)malloc(ny*sizeof(my_array));
	//cufftComplex *h_signal = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);		
	//cufftComplex *h_out = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);		
	cufftHandle plan;	

	int c=0;
	for (int i = 0; i < nx; i++)
    	{
		for(int j=0; j < ny; j++)
		{
        		h_signal[i][j].x = double(c)+1.0;
        		h_signal[i][j].y = double(c)+1.0;
			c++;
    		}
	}
	
	//Device Arrays
	my_array *d_signal;
	if(cudaSuccess != cudaMalloc(&d_signal,ny*sizeof(my_array)))
	{ printf("Malloc for d_signal failed");}
	if (cudaSuccess != cudaMemcpy(d_signal,h_signal,ny*sizeof(my_array),cudaMemcpyHostToDevice))
	{ printf("memcpy for h_signal to d_signal failed");}
	
	my_array *d_out;

	long int size_el=ny*sizeof(my_array);
	if(cudaSuccess != cudaMalloc(&d_out,size_el))
	{ printf("malloc for d_out failed");}

			
	//Perform the cufft
	
	cufftResult _cp = cufftPlan1d(&plan,N,CUFFT_C2C,batch);
	if(CUFFT_SUCCESS != _cp)
	{ printf("CUFFT error: Plan creation failed,error code=%d",_cp);}	
	cufftResult _ce = cufftExecC2C(plan,(cufftComplex *)d_signal,(cufftComplex *)d_out,CUFFT_FORWARD);
	if(CUFFT_SUCCESS != _ce)
	{ printf("CUFFT error: Plan execution failed,error code=%d",_ce);}
	if(cudaSuccess != cudaThreadSynchronize())
	{ printf("Failed to synchronize\n");}
	cufftDestroy(plan);
	
	
	if(cudaSuccess != cudaMemcpy(h_out,d_out,size_el,cudaMemcpyDeviceToHost))
	{ printf("Memcpy of d_out to h_out failed");}

        input=fopen("Input.txt","w");

	for (unsigned int i = 0; i < nx; i++)
    	{
		for(unsigned int j=0; j < ny; j++)
		{        	
			fprintf(input,"%f+i*%f\t",h_signal[i][j].x,h_signal[i][j].y);
       		}
		fprintf(input,"\n");
    	}
	fclose(input);
	
	output=fopen("Output.txt","w");

	for (unsigned int i = 0; i < nx; i++)
    	{
		for(unsigned int j=0; j < ny; j++)
		{        	
			fprintf(output,"%f+i*%f\t",h_out[i][j].x,h_out[i][j].y);
       		}
		fprintf(output,"\n");
    	}
	fclose(output); 
	
	
	cudaFree(d_signal);
	free(h_signal);
	cudaFree(d_out);
	diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
        printf("Time Elapsed:%f",diff);
    	printf("\n");
	return 0;	
}

Please help.
Thanks

Your code does 50 1D transforms of size 5. The Matlab code does a 2D transform of size 50x5. That is not the same. If you want to do a 2D transform then use the appropriate routines like in your first post.

In principle, it is possible to do a 2D transform with a 2 batched 1D transformations:
a) FFT y-direction (batch = 50, size = 5)
b) rearrange array (it’s a matrix transpose in 2D)
c) FFT x-direction (batch = 5, size = 50)
But that will be slower than the 2D transform. If you want to do a 4D FFT then such a procedure is necessary, since cuFFT does not support this natively.

Sir,
I am not talking about 2d fft. I have already written a code for that. Let me explain it thoroughly.
See, when we are taking an input array of size 50 x 5, the above code is doing 50 1D transforms (row wise) of size 5 and then it is storing them row wise too into an output array. Meaning,the first row in our output array is the fft of first row of input array and likewise.
But when we use the matlab fft() over any rectangular input array say 50 x 5 in our case, it takes a column say for example 1st column,then performs fft for that column and stores the resulting column vector as the 1st column of output matrix( 50 x 5 in our case ) and likewise. Similar thing i also want to do using CUDA with C with some alterations in my above code i.e. instead of doing 50 1D transforms of size 5, i want to do 5 transforms of size 50 and then storing them similarly into an output matrix. If we will be able to do so then our output matrix will exactly be equal to what obtained from MATLAB fft().
Looking for some advice.
Thank you.

Sorry! I did expect that Matlab uses automatically a 2D fft for Matrix input. You are absolutely right: fft(50x5) in Matlab does 5 x 1D transform of size 50.

To compare with your GPU code do the following:
choose

#define nx 5
#define ny 50

and run your program.

Take the data from Input1.txt and copy to Matlab. There you have now a 5x50 matrix. Apply transpose() to get a 50x5 matrix and apply fft() and transpose() back:

transpose(fft(transpose(Input)))

Now compare this 5x50 matrix to the output in Output1.txt.

That should work now.

The problem is: Matlab does the fft along the columns whereas your code does the fft along the rows.
The easiest solution is to reverse rows and cols in your file write routines.

Thank you for sharing the idea with me. I tried the idea of

y = (fft(x'))';

where x is our input array but the two results ( Output.txt and variable y) are different for some rows and columns. I tried the idea of reversing rows and columns too but that approach is giving correct solution ( same output as that of obtained using fft() ) only for square matrices and not for rectangular ones.
I think i have tried every possible method, therefore i am now thinking of dumping the idea of performing column wise 1D FFT for any matrix using CUDA with C. If you have any other idea or if you managed to do this by any other trick then i request you to please convey it to me otherwise thank you very much for the help till now.

Thanks.

x’ is not the transpose but the complex transpose.
There is no trick! You just need to do it the way I explained in the last post. It gives the correct result.

At the bottom you will find the code where I exchanged rows and columns in the output. Execute the code. Copy the Input and the Output to Matlab and compare the result. The error per element is of the order 10^-5 which is the precision of the output files. Here is the code to execute in Matlab:

Input = [... the data from Input1.txt ...];
Output = [... the data from Output1.txt ...];
MatlabResult = fft( Input );
norm( Output - MatlabResult )

The last line will give you the norm of the error which should be 3.8294e-04.

Here is the code:

#include"stdio.h"
#include"cuda_runtime.h"
#include"cufft.h"
#include"complex"
#include"cuda.h"
#include"ctime"

#define nx 5
#define ny 50
int main(int argc, char** argv)
{
	clock_t start;
	double diff;
	long int batch, N;
	start = clock();
	FILE* input;
	FILE* output;
	typedef cufftComplex my_array[ny];
	if (nx == 1)
	{
		batch = 1;
		N = ny;
	}
	else
	{
		batch = nx;
		N = ny;
	}
	my_array *h_signal = (my_array *) malloc(nx * sizeof(my_array));
	my_array *h_out = (my_array *) malloc(nx * sizeof(my_array));
	//cufftComplex *h_signal = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);
	//cufftComplex *h_out = (cufftComplex *)malloc(sizeof(cufftComplex) *nx*ny);
	cufftHandle plan;

	int c = 0;
	for (int i = 0; i < nx; i++)
	{
		for (int j = 0; j < ny; j++)
		{
			h_signal[i][j].x = double(c) + 1.0;
			h_signal[i][j].y = double(c) + 1.0;
			c++;
		}
	}

	//Device Arrays
	my_array *d_signal;
	if (cudaSuccess != cudaMalloc(&d_signal, nx * sizeof(my_array)))
	{
		printf("Malloc for d_signal failed");
	}
	if (cudaSuccess
			!= cudaMemcpy(d_signal, h_signal, nx * sizeof(my_array),
					cudaMemcpyHostToDevice))
	{
		printf("memcpy for h_signal to d_signal failed");
	}

	my_array *d_out;

	long int size_el = nx * sizeof(my_array);
	if (cudaSuccess != cudaMalloc(&d_out, size_el))
	{
		printf("malloc for d_out failed");
	}

	//Perform the cufft

	cufftResult _cp = cufftPlan1d(&plan, N, CUFFT_C2C, batch);
	if (CUFFT_SUCCESS != _cp)
	{
		printf("CUFFT error: Plan creation failed,error code=%d", _cp);
	}
	cufftResult _ce = cufftExecC2C(plan, (cufftComplex *) d_signal,
			(cufftComplex *) d_out, CUFFT_FORWARD);
	if (CUFFT_SUCCESS != _ce)
	{
		printf("CUFFT error: Plan execution failed,error code=%d", _ce);
	}
	if (cudaSuccess != cudaThreadSynchronize())
	{
		printf("Failed to synchronize\n");
	}
	cufftDestroy(plan);

	if (cudaSuccess
			!= cudaMemcpy(h_out, d_out, size_el, cudaMemcpyDeviceToHost))
	{
		printf("Memcpy of d_out to h_out failed");
	}

	input = fopen("Input1D.txt", "w");

	for (unsigned int j = 0; j < ny; j++)
	{
		for (unsigned int i = 0; i < nx; i++)
		{
			fprintf(input, "%f+i*%f\t", h_signal[i][j].x, h_signal[i][j].y);
		}
		fprintf(input, "\n");
	}
	fclose(input);

	output = fopen("Output1D.txt", "w");

	for (unsigned int j = 0; j < ny; j++)
	{
		for (unsigned int i = 0; i < nx; i++)
		{
			fprintf(output, "%f+i*%f\t", h_out[i][j].x, h_out[i][j].y);
		}
		fprintf(output, "\n");
	}
	fclose(output);

	cudaFree(d_signal);
	free(h_signal);
	cudaFree(d_out);
	diff = (std::clock() - start) / (double) CLOCKS_PER_SEC;
	printf("Time Elapsed:%f", diff);
	printf("\n");
	return 0;
}

Oh, i am really sorry for having made that silly mistake by using x’.
I have now verified everything and it’s working perfectly. By the way your new code better implements the thought which i had in my mind. The only thing is, i should always be careful while initializing the input array at both places for comparison or else i could also copy the inputs directly into MATLAB by reading ‘Input.txt’.
Anyways thank you very much for all your help. Thanks a lot.

Regards,
Niraj