Hii,
I was trying to develop a CUDA (with C) code for finding 2d fft of any input matrix. The problem that i am facing is the code is running well for smaller sized input like X[25][25] but as i am increasing the size and reaching a size of even X[1000][1000] , it is producing ‘Segmentation Fault’ on my terminal screen. I have checked the whole code several times but i am not able to find the bug. I thought that it could be a memory allocation or pointer dereferencing issue but i have checked it by alternate methods that these are not the issues. Through MATLAB, i checked that i am able to generate a matrix of size 1000x1000 (even 4096x4096 too) on GPU.
Herein, i am posting my code. I request you to please look into the code once and help me with the relevant solution.
Thanking you.
#define nx 750
#define ny 750
main(int argc, char** argv)
{
clock_t start;
double diff;
start = clock();
FILE* input;
FILE* output;
cufftComplex h_signal[nx][ny];
cufftComplex h_out[nx][ny];
cufftHandle plan;
int c=0;
for (int i = 0; i < nx; i++)
{
for(int j=0; j < ny; j++)
{
h_signal[i][j].x = double(c)+1.0;
h_signal[i][j].y = double(c)+1.0;
c++;
}
}
//Device Arrays
cufftComplex *d_signal;
if(cudaSuccess != cudaMalloc(&d_signal,sizeof(cufftComplex)*nx*ny))
{ printf("Malloc for d_signal failed");}
if (cudaSuccess!=
cudaMemcpy(d_signal,h_signal,sizeof(cufftComplex)nxny,cudaMemcpyHostToDevice))
{ printf(“memcpy for h_signal to d_signal failed”);}
cufftComplex* d_out;
long int size_el=nx*ny*sizeof(cufftComplex);
if(cudaSuccess != cudaMalloc(&d_out,size_el))
{ printf("malloc for d_out failed");}
//Perform the cufft
cufftResult _cp = cufftPlan2d(&plan,ny,nx,CUFFT_C2C);
if(CUFFT_SUCCESS != _cp)
{ printf("CUFFT error: Plan creation failed,error code=%d",_cp);}
cufftResult _ce = cufftExecC2C(plan,(cufftComplex *)d_signal,(cufftComplex *)d_out,CUFFT_FORWARD);
if(CUFFT_SUCCESS != _ce)
{ printf("CUFFT error: Plan execution failed,error code=%d",_ce);}
if(cudaSuccess != cudaThreadSynchronize())
{ printf("Failed to synchronize\n");}
cufftDestroy(plan);
if(cudaSuccess != cudaMemcpy(h_out,d_out,size_el,cudaMemcpyDeviceToHost))
{ printf("Memcpy of d_out to h_out failed");}
input=fopen("Input2D.txt","w");
for (unsigned int i = 0; i < nx; i++)
{
for(unsigned int j=0; j < ny; j++)
{
fprintf(input,"%f+i*%f\t",h_signal[i][j].x,h_signal[i][j].y);
}
fprintf(input,"\n");
}
fclose(input);
output=fopen("Output2D.txt","w");
for (unsigned int i = 0; i < nx; i++)
{
for(unsigned int j=0; j < ny; j++)
{
fprintf(output,"%f+i*%f\t",h_out[i][j].x,h_out[i][j].y);
}
fprintf(output,"\n");
}
fclose(output);
cudaFree(d_signal);
//free(h_signal);
cudaFree(d_out);
diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;
printf("Time Elapsed:%f",diff);
printf("\n");
return 0;
}