CudaMemcpy returns failed in loop while outside loop It does. loop is: for(INDEX=0; INDEX < time_...

// make this .c file not .cu. this should solve “warning: parsing restarts here after previous syntax error”.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <complex.h>

void evolve_inhom_unconstrained_ceta(int n_x, int n_y, double delta_x, double delta_y, double kappac,
double kappae, double A, double B, double P, double M, double L,
double delta_t, int time_steps,
double t0, cufftComplex *comp, cufftComplex *eta, double *Ceff,
double *DeltaC, double *sigma_Tc, double *epsilon_Tc,
double *sigma_Te, double *epsilon_Te,
double ave_comp, double *sig_app, int STEPS, int MAXITR, double MAXERR,
int n_alpha, int n_beta, double *alpha, double *beta, double *alpha_prime, double *beta_prime,
int plan_indicator, int SNA, int NOISEADDER, double noise_str, int factor_alpha, int factor_beta){

FILE *fpd, *fpmu, *fptmp;

FILE *fit;

cudaDeviceReset();

int INDEX = 0;
int i1,i2,i3,i4,i5,i6;
int J;

int half_nx, half_ny;
double *kx, *ky;

kx = (double *)malloc(n_x* sizeof(double));
ky = (double *)malloc(n_y* sizeof(double));

double delta_kx, delta_ky;
double k2, k4;
double inv_denom;

double rr;
double s11, s12, s21, s22;
int i11, i22;

int nxny;
int inv_nxny;

char NAME[50];

double epsstr11c, epsstr22c, epsstr12c;
double epsstr11e, epsstr22e, epsstr12e;

double *c;
double *e;
double *Del_sigma_Tc;
double *Del_sigma_Te;
double *Ec;
double *strainc;
double *Omega11;
double *Omega12;
double *Omega21;
double *Omega22;
double *Ee;
double *straine;

double a, b;
double ap, bp;
double realc;
double reale;

double W, Wp;

unsigned FLAG;

size_t tmp;

cufftComplex *g;
cufftComplex *u1_oldc, *u2_oldc;
cufftComplex *u1_newc, *u2_newc;
cufftComplex *eps_star11c, *eps_star12c, *eps_star22c;
cufftComplex *mu_elc;

cufftComplex *h;
cufftComplex *u1_olde, *u2_olde;
cufftComplex *u1_newe, *u2_newe;
cufftComplex *eps_star11e, *eps_star12e, *eps_star22e;
cufftComplex *mu_ele;

cufftHandle plan;

nxny = n_x*n_y;
inv_nxny = 1.0/nxny;

g = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));  // nxny is grid size
u1_oldc = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u2_oldc = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u1_newc = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u2_newc = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star11c = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star12c = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star22c = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
mu_elc = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));

h = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u1_olde = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u2_olde = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u1_newe = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
u2_newe = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star11e = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star12e = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
eps_star22e = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));
mu_ele = (cufftComplex *)malloc(nxny* sizeof(cufftComplex));

double *d_kx;
double *d_ky;
cudaMalloc((void**)&d_kx, n_x*sizeof(double));
cudaMalloc((void**)&d_ky, n_y*sizeof(double));

cufftComplex *d_g;
if(cudaMalloc((void**)&d_g, nxny*sizeof(cufftComplex)) != cudaSuccess) printf("d_g allocation failed");
cufftComplex *d_h;
if(cudaMalloc((void**)&d_h, n_x*n_y*sizeof(cufftComplex)) != cudaSuccess) printf("d_h allocation failed");
cufftComplex *d_comp;
cudaMalloc((void**)&d_comp, nxny*sizeof(cufftComplex));
cufftComplex *d_eta;
cudaMalloc((void**)&d_eta, nxny*sizeof(cufftComplex));
double *d_Ceff;
cudaMalloc((void**)&d_Ceff, 3*3*3*3*sizeof(double));
double *d_Omega11, *d_Omega22, *d_Omega21, *d_Omega12;
cudaMalloc((void**)&d_Omega11, nxny*sizeof(double));
cudaMalloc((void**)&d_Omega22, nxny*sizeof(double));
cudaMalloc((void**)&d_Omega21, nxny*sizeof(double));
cudaMalloc((void**)&d_Omega12, nxny*sizeof(double));

cufftComplex *d_eps_star11c;
cudaMalloc((void**)&d_eps_star11c, nxny*sizeof(cufftComplex));
cufftComplex *d_eps_star12c;
cudaMalloc((void**)&d_eps_star12c, nxny*sizeof(cufftComplex));
cufftComplex *d_eps_star22c;
cudaMalloc((void**)&d_eps_star22c, nxny*sizeof(cufftComplex));
cufftComplex *d_eps_star11e;
cudaMalloc((void**)&d_eps_star11e, nxny*sizeof(cufftComplex));
cufftComplex *d_eps_star12e;
cudaMalloc((void**)&d_eps_star12e, nxny*sizeof(cufftComplex));
cufftComplex *d_eps_star22e;
cudaMalloc((void**)&d_eps_star22e, nxny*sizeof(cufftComplex));
cufftComplex *d_mu_elc;
cudaMalloc((void**)&d_mu_elc, nxny*sizeof(cufftComplex));
cufftComplex *d_mu_ele;
cudaMalloc((void**)&d_mu_ele, nxny*sizeof(cufftComplex));


cufftPlan2d(&plan, n_x, n_y, CUFFT_C2C);  //

/* Declare and initialize acoustic tensor */

half_nx = (int)n_x/2;
half_ny = (int)n_y/2;


delta_kx = 2.0*3.14/(n_x*delta_x);
delta_ky = 2.0*3.14/(n_y*delta_y);  // PI is upto two digis here.

Omega11 = (double *)malloc((nxny)*sizeof(double));
Omega12 = (double *)malloc((nxny)*sizeof(double));
Omega21 = (double *)malloc((nxny)*sizeof(double));
Omega22 = (double *)malloc((nxny)*sizeof(double));

for(i1=0; i1 < n_x; ++i1){
	if(i1 < half_nx){
		kx[i1] = i1*delta_kx;
	}
	else{
		kx[i1] = (i1-n_x)*delta_kx;
	}
}
for(i2=0; i2 < n_y; ++i2){
	if(i2 < half_ny){
		ky[i2] = i2*delta_ky;
	}
	else{
		ky[i2] = (i2-n_y)*delta_ky;
	}
}

for(i1=0; i1 < n_x; ++i1){
	for(i2=0; i2 < n_y; ++i2){
		J = i2+n_y*i1;		// for some reason
		Omega11[J] = 0.0;
		Omega12[J] = 0.0;
		Omega21[J] = 0.0;
		Omega22[J] = 0.0;
	}
}


/* Write the initial configuration */

c = (double *)malloc( n_x*n_y*sizeof(double));
e = (double *)malloc( n_x*n_y*sizeof(double));
for(i1=0; i1 < n_x; ++i1){
	for(i2=0; i2 < n_y; ++i2){
		J = i2+n_y*i1;
		c[J] = comp[J].x;
		e[J] = eta[J].x;
		if(c[J] <= -0.5 || c[J] >= 1.5){
			printf("The composition goes out of bounds1\n");
			printf("5 %d %le\n",J,comp[J].x);
			printf("Exiting\n");
			exit(0);
		}
	}
}


sprintf(NAME,"../output/data/time%dc.dat",   
(int) (INDEX + t0));
fpd = fopen(NAME,"w");
tmp = fwrite(&c[0],sizeof(double), nxny,fpd);
fclose(fpd);

sprintf(NAME,"../output/data/time%de.dat",
(int) (INDEX + t0));
fpd = fopen(NAME,"w");
tmp = fwrite(&e[0],sizeof(double), nxny,fpd);
fclose(fpd);


/* Calculate the Del_sigma_T tensor */

Del_sigma_Tc = (double *)malloc(4*sizeof(double)); // dmatrix(1,2,1,2);  // this variable has 4 integers in it. maybe can use cudaMAlloc afterall, eh?
Del_sigma_Te = (double *)malloc(4*sizeof(double));
calculate_Del_sigma_T(DeltaC,epsilon_Tc,Del_sigma_Tc);  // using __global__ here will battleneck as x,y,z is upto 3 or whatever!
calculate_Del_sigma_T(DeltaC,epsilon_Te,Del_sigma_Te);
Ec = (double *)malloc(4*sizeof(double));
strainc = (double *)malloc(4*sizeof(double));
Ee = (double *)malloc(4*sizeof(double));
straine = (double *)malloc(4*sizeof(double));


/* Evolve */

/* calculate g and its fourier transformation */

fptmp = fopen("../output/noise_info","a");  // "a" appends to the file.

if(cudaMemcpy(d_g, g, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice) != cudaSuccess) 
	printf("mem cpy of d_g failed ..................................... on 248");

//time_steps = 15;     //////////////////////////////////////////////////////////////////////////////
//printf("%d", time_steps);  //////////////////////////////////////////////////////////////////////////

for(INDEX=0; INDEX < time_steps+1; ++INDEX){   
	printf("\nenetering loop\n");
	
	//printf("\n%f", g[5].x); 
	
	if(cudaMemcpy(d_g, g, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice) != cudaSuccess) 
		printf("mem cpy of g failed on 248");
	if(cudaMemcpy(d_h, h, n_x*n_y*sizeof(cufftComplex), cudaMemcpyHostToDevice) != cudaSuccess)
		printf("memcpy of h failed on line 258");
	if(cudaMemcpy(d_comp, comp, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice) != cudaSuccess)
		printf("memcpy of comp failed on line 260");
	if(cudaMemcpy(d_eta, eta, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice) != cudaSuccess)
		printf("memcpy failed of eta on line 262");

	g_and_h_values <<<(nxny/256), 256>>> (d_g, d_h, d_comp, d_eta, n_x, n_y, A, B, P);
	

	cufftExecC2C(plan, d_g, d_g, CUFFT_FORWARD);
	cufftExecC2C(plan, d_h, d_h, CUFFT_FORWARD);

	
	cudaMemcpy(d_kx, kx, n_x*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_ky, ky, n_y*sizeof(double), cudaMemcpyHostToDevice);
	// Ceff to device
	cudaMemcpy(d_Ceff, Ceff, 3*3*3*3*sizeof(double),cudaMemcpyHostToDevice);  // are you sure!?
	// dvector omega11, 12, 22, 21
	cudaMemcpy(d_Omega11, Omega11, nxny*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Omega21, Omega21, nxny*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Omega22, Omega22, nxny*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Omega12, Omega12, nxny*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(kx, d_kx, n_x*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(ky, d_ky, n_y*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Ceff, d_Ceff, 3*3*3*3*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Omega11, d_Omega11, nxny*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Omega22, d_Omega22, nxny*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Omega21, d_Omega21, nxny*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Omega12, d_Omega12, nxny*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(comp, d_comp, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eta, d_eta, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	

	if(INDEX == 0){
		 printf("\nentering loop if INDEX\n");
		/* The acoustic tensor omega and the homogenous base solution */

		/* The acoustic tensor Omega and the (homogeneous) base solution */

		calculate_Omega <<<(nxny/256),256>>> (n_x, n_y, half_nx, half_ny, d_kx, d_ky, d_Ceff,
			 d_Omega11, d_Omega12, d_Omega21, d_Omega22);


		calculate_uzero(n_x,n_y,half_nx,half_ny,delta_kx,delta_ky,ave_comp,  // comp
		comp,Ceff,sigma_Tc,u1_oldc,u2_oldc,n_beta,beta,Omega11,Omega12,Omega21,
		Omega22);  // using cufft inside __global__ does no work.


		calculate_uzero(n_x,n_y,half_nx,half_ny,delta_kx,delta_ky,ave_comp,  // comp
		eta,Ceff,sigma_Te,u1_olde,u2_olde,n_beta,beta,Omega11,Omega12,Omega21,
		Omega22);
	}


	#ifdef APPROX_CALC
		printf("I am doing approximate calculation\n");
		printf("So I am calculating uzero everytime\n");
		if(INDEX != 0){
			calculate_uzero(n_x,n_y,half_nx,half_ny,delta_kx,delta_ky,ave_comp,  // comp
			comp,Ceff,sigma_Tc,u1_oldc,u2_oldc,n_beta,beta,Omega11,Omega12,Omega21,
			Omega22);
			calculate_uzero(n_x,n_y,half_nx,half_ny,delta_kx,delta_ky,ave_comp,  // comp
			eta,Ceff,sigma_Te,u1_olde,u2_olde,n_beta,beta,Omega11,Omega12,Omega21,
			Omega22);
		}
	#endif

	/* Refine the displacements */
		//printf("enterinf refine\n");
	refine_u(n_x, n_y, half_nx, half_ny, delta_x, delta_y, epsilon_Tc,   //comp
	delta_kx, delta_ky, comp, ave_comp, Ceff, DeltaC, sigma_Tc,
	u1_oldc, u2_oldc, u1_newc, u2_newc, Del_sigma_Tc, sig_app, Ec, MAXITR, MAXERR,
	n_alpha, n_beta, alpha, beta, Omega11, Omega12, Omega21, Omega22, plan, factor_alpha, factor_beta);

	refine_u(n_x, n_y, half_nx, half_ny, delta_x, delta_y, epsilon_Te,	// eta
	delta_kx, delta_ky, eta, ave_comp, Ceff, DeltaC, sigma_Te,
	u1_olde, u2_olde, u1_newe, u2_newe, Del_sigma_Te, sig_app, Ee, MAXITR, MAXERR,
	n_alpha, n_beta, alpha, beta, Omega11, Omega12, Omega21, Omega22, plan, factor_alpha, factor_beta);

	for(i1=0; i1 < n_x; ++i1){
		for(i2=0; i2 < n_y; ++i2){
			J = i2+n_y*i1;
			eps_star11c[J].x =  kx[i1]*u1_newc[J].y;
			eps_star11c[J].y = -kx[i1]*u1_newc[J].x;
			eps_star22c[J].x =  ky[i2]*u2_newc[J].y;
			eps_star22c[J].y = -ky[i2]*u2_newc[J].x;
			eps_star12c[J].x =  0.5*(kx[i1]*u2_newc[J].y + ky[i2]*u1_newc[J].y);
			eps_star12c[J].y = -0.5*(kx[i1]*u2_newc[J].x + ky[i2]*u1_newc[J].x);
			eps_star11e[J].x =  kx[i1]*u1_newe[J].x;
			eps_star11e[J].y = -kx[i1]*u1_newe[J].y;
			eps_star22e[J].x =  ky[i2]*u2_newe[J].x;
			eps_star22e[J].y = -ky[i2]*u2_newe[J].y;
			eps_star12e[J].x =  0.5*(kx[i1]*u2_newe[J].y + ky[i2]*u1_newe[J].y);
			eps_star12e[J].y = -0.5*(kx[i1]*u2_newe[J].x + ky[i2]*u1_newe[J].x);
		}
	}

	/* Get the strain back to real space */
	/*  C */
	cudaMemcpy(d_eps_star11c, eps_star11c, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_eps_star12c, eps_star12c, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_eps_star22c, eps_star22c, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	/* E */
	cudaMemcpy(d_eps_star11e, eps_star11e, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_eps_star12e, eps_star12e, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_eps_star22e, eps_star22e, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);

	/* exec */
	cufftExecC2C(plan, d_eps_star11c, d_eps_star11c, CUFFT_INVERSE);  // INVERSE!? is it?
	cufftExecC2C(plan, d_eps_star12c, d_eps_star12c, CUFFT_INVERSE);
	cufftExecC2C(plan, d_eps_star22c, d_eps_star22c, CUFFT_INVERSE);
	cufftExecC2C(plan, d_eps_star11e, d_eps_star11e, CUFFT_INVERSE);
	cufftExecC2C(plan, d_eps_star12e, d_eps_star12e, CUFFT_INVERSE);
	cufftExecC2C(plan, d_eps_star22e, d_eps_star22e, CUFFT_INVERSE);




	/* Calculate mu_el and take it to Fourier Space */

	fit = fopen("numerical.dat","w");

	//printf("%d  %d", n_x, n_y);

	cudaMemcpy(d_comp, comp, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_eta, eta, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_mu_elc, mu_elc, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(d_mu_ele, mu_ele, nxny*sizeof(cufftComplex), cudaMemcpyHostToDevice);

	double *d_strainc, *d_straine;
	cudaMalloc((void**)&d_strainc, 4*sizeof(double));
	cudaMalloc((void**)&d_straine, 4*sizeof(double));
	cudaMemcpy(d_strainc, strainc, 4*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_straine, straine, 4*sizeof(double), cudaMemcpyHostToDevice);

	double *d_Ec, *d_Ee;
	cudaMalloc((void**)&d_Ec, 4*sizeof(double));
	cudaMalloc((void**)&d_Ee, 4*sizeof(double));
	cudaMemcpy(d_Ec, Ec, 4*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_Ee, Ee, 4*sizeof(double), cudaMemcpyHostToDevice);


	double *d_epsilon_Tc, *d_epsilon_Te;
	cudaMalloc((void**)&d_epsilon_Tc, 4*sizeof(double));
	cudaMalloc((void**)&d_epsilon_Te, 4*sizeof(double));
	cudaMemcpy(d_epsilon_Tc, epsilon_Tc, 4*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(d_epsilon_Te, epsilon_Te, 4*sizeof(double), cudaMemcpyHostToDevice);

	double *d_DeltaC;
	cudaMalloc((void**)&d_DeltaC, 3*3*3*3*sizeof(double));
	cudaMemcpy(d_DeltaC, DeltaC, 3*3*3*3*sizeof(double), cudaMemcpyHostToDevice);

	cudaMemcpy(d_Ceff, Ceff, 3*3*3*3*sizeof(double),cudaMemcpyHostToDevice);

	double *d_alpha;
	cudaMalloc((void**) &d_alpha, (n_alpha+2*factor_alpha+1)*sizeof(double));
	cudaMemcpy(d_alpha, alpha, (n_alpha+2*factor_alpha+1)*sizeof(double), cudaMemcpyHostToDevice);

	double* d_beta;
	cudaMalloc((void**) &d_beta, (n_beta+2*factor_beta+1)*sizeof(double));
	cudaMemcpy(d_beta, beta, (n_beta+2*factor_beta+1)*sizeof(double), cudaMemcpyHostToDevice);

	double *d_alpha_prime;
	cudaMalloc((void**) &d_alpha_prime, (n_alpha+2*factor_alpha+1)*sizeof(double));
	cudaMemcpy(d_alpha_prime, alpha_prime, (n_alpha+2*factor_alpha+1)*sizeof(double), cudaMemcpyHostToDevice);
	
	double* d_beta_prime;
	cudaMalloc((void**) &d_beta_prime, (n_beta+2*factor_beta+1)*sizeof(double));
	cudaMemcpy(d_beta_prime, beta_prime, (n_beta+2*factor_beta+1)*sizeof(double), cudaMemcpyHostToDevice);


	/*************************************************************************/
	calculate_mu_ele<<<(nxny/256),256>>>(n_x, n_y, n_alpha, n_beta, d_mu_elc, d_mu_ele, d_comp, d_eta, d_strainc, d_straine,
	 d_Ec, d_Ee, d_epsilon_Tc, d_epsilon_Te, d_DeltaC, d_Ceff, d_alpha, d_beta, d_alpha_prime, d_beta_prime,
	  d_eps_star11c, d_eps_star12c, d_eps_star22c, d_eps_star11e, d_eps_star12e, d_eps_star22e);
/*double *s11, double *s12, double *s21, double *s22*/

	/*****************************************************************************/


	cudaMemcpy(eps_star11c, d_eps_star11c, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eps_star12c, d_eps_star12c, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eps_star22c, d_eps_star22c, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eps_star11e, d_eps_star11e, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eps_star12e, d_eps_star12e, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eps_star22e, d_eps_star22e, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(DeltaC, d_DeltaC, 3*3*3*3*sizeof(double), cudaMemcpyDeviceToHost);
	cudaMemcpy(Ceff, d_Ceff, 3*3*3*3*sizeof(double),cudaMemcpyDeviceToHost);

	//printf("\nok");


	cufftExecC2C(plan, d_mu_elc, d_mu_elc, CUFFT_FORWARD);
	cufftExecC2C(plan, d_mu_ele, d_mu_ele, CUFFT_FORWARD);
	cufftExecC2C(plan, d_comp, d_comp, CUFFT_FORWARD);
	cufftExecC2C(plan, d_eta, d_eta, CUFFT_FORWARD);



	/* Evolve the composition and eta profiles */
	evolve_composition_eta_profiles<<<(nxny/256) , 256>>>(n_x, n_y, d_kx, d_ky, M, L, kappac, kappae,
									 delta_t, d_comp, d_g, d_mu_elc, d_eta, d_h, d_mu_ele);


	/* Get the composition and eta back to real space */

	cufftExecC2C(plan, d_comp, d_comp, CUFFT_INVERSE);
	cufftExecC2C(plan, d_eta, d_eta, CUFFT_INVERSE);
	
	cudaMemcpy(comp, d_comp, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
	cudaMemcpy(eta, d_eta, nxny*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

	for(i1=0;i1<n_x;++i1){
		for(i2=0;i2<n_y;++i2){
			J = i2 + n_y*i1;
			comp[J].x = comp[J].x*inv_nxny;
			comp[J].y = comp[J].y*inv_nxny;
			eta[J].x = eta[J].x*inv_nxny;
			eta[J].y = eta[J].y*inv_nxny;
		}
	}

	/* Once in a while check the composition is within bounds
	And write the output */

	if(INDEX != 0 && INDEX%STEPS==0){
		for(i1=0;i1<n_x;++i1){
			for(i2=0;i2<n_y;++i2){
				J = i2 + n_y*i1;
				c[J] = comp[J].x;
				e[J] = eta[J].x;
				if(c[J] <= -0.5 || c[J] >= 1.5){
					printf("The composition is out of bounds2\n");
					printf("Exiting\n");

					exit(0);
				}
			}
		}

		sprintf(NAME,"../output/data/time%dc.dat",
			(int) (INDEX + t0));
		fpd = fopen(NAME,"w");
		tmp = fwrite(&c[0],sizeof(double), nxny,fpd);
		fclose(fpd);
		sprintf(NAME,"../output/data/time%de.dat",
			(int) (INDEX + t0));
		fpd = fopen(NAME,"w");
		tmp = fwrite(&e[0],sizeof(double), nxny,fpd);
		fclose(fpd);
	}

	if(INDEX != 0 && INDEX%NOISEADDER == 0 && SNA == 1){
		fprintf(fptmp, "Steps %d: Noise added\n", INDEX);
		for(i1=0;i1<n_x;++i1){
			for(i2=0;i2<n_y;++i2){
				J = i2 + n_y*i1;
				c[J] = comp[J].x;
			}
		}
		add_noise(n_x, n_y, c, noise_str);  // add add_noise function to the src folder.
		for(i1=0;i1<n_x;++i1){
			for(i2=0;i2<n_y;++i2){
				J = i2 + n_y*i1;
				comp[J].x = c[J];
				comp[J].y = 0.0;
			}
		}
	}
}
fclose(fptmp);

/* Free the variables and destroy the plan */

free(g);
free(u1_oldc);
free(u2_oldc);
free(u1_newc);
free(u2_newc);
free(eps_star11c);
free(eps_star12c);
free(eps_star22c);
free(mu_elc);
free(h);
free(u1_olde);
free(u2_olde);
free(u1_newe);
free(u2_newe);
free(eps_star11e);
free(eps_star12e);
free(eps_star22e);
free(mu_ele);
free(kx);
free(ky);
free(c);
free(e);

free(Omega11);    
free(Omega12);
free(Omega21);
free(Omega22);


free(Del_sigma_Te);
free(Del_sigma_Tc);
free(Ee);
free(Ec);
free(straine);
free(strainc);

cudaFree(d_kx);
cudaFree(d_ky);
cudaFree(d_g);
cudaFree(d_h);
cudaFree(d_comp);
cudaFree(d_eta);
cudaFree(d_Ceff);
cudaFree(d_Omega11);
cudaFree(d_Omega12);
cudaFree(d_Omega21);
cudaFree(d_Omega22);
cudaFree(d_eps_star11e);
cudaFree(d_eps_star12e);
cudaFree(d_eps_star22e);
cudaFree(d_eps_star11c);
cudaFree(d_eps_star12c);
cudaFree(d_eps_star22c);
cudaFree(d_mu_ele);
cudaFree(d_mu_elc);


cufftDestroy(plan);

}