The size is limited by the memory. Here is a code which does a convolution for real matrix , but I have few comments. I use in-place transforms. I can answer to your questions:
#include <cstdio>
#include <cmath>
#include <iostream>
#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
__global__ void kexcess(cufftDoubleComplex *A,const double *B, int totsize_inspa,int totsize,
const double ddB,const double BBx)
{
// int ind = blockIdx.x * blockDim.x + threadIdx.x;
int ibl=blockIdx.y+blockIdx.x*gridDim.y;
int ind=threadIdx.x+blockDim.x*ibl;
double ccc,qq;
if(ind<totsize_inspa)
{
qq=B[ind];
ccc=(ddB+BBx+BBx*(2.0*qq+qq*qq))/((double) totsize);
A[ind].x=A[ind].x*ccc;
A[ind].y=A[ind].y*ccc;
}
}
__global__ void kupdt(cufftDoubleComplex *A,cufftDoubleComplex *B,const double *C,
int totsize_inspa,int totsize,const double ddB,const double BBx,double ddt)
{
// int ind = blockIdx.x * blockDim.x + threadIdx.x;
int ibl=blockIdx.y+blockIdx.x*gridDim.y;
int ind=threadIdx.x+blockDim.x*ibl;
double qq,fff,ccc;
if(ind<totsize_inspa)
{
qq=C[ind];
ccc=ddB+BBx+BBx*(2.0*qq+qq*qq);
fff=(1.0/(1.0-ddt*ccc*qq))/((double) totsize);
A[ind].x=A[ind].x*fff+B[ind].x*qq*ddt*fff;
A[ind].y=A[ind].y*fff+B[ind].y*qq*ddt*fff;
}
}
__global__ void totene(cufftDoubleReal *A,cufftDoubleReal *C, int totsize_pad,
double VVnot,cufftDoubleReal *P)
{
// int ind = blockIdx.x * blockDim.x + threadIdx.x;
int ibl=blockIdx.y+blockIdx.x*gridDim.y;
int ind=threadIdx.x+blockDim.x*ibl;
double ppp;
if(ind<totsize_pad)
{
ppp=C[ind];
A[ind]=0.5*A[ind]*ppp+ppp*ppp*ppp*ppp/12.0-ppp*ppp*ppp/6.0+VVnot*P[ind]*ppp;
}
}
__global__ void nonlinterm(double *A,double *C, int totsize_pad,
double VVnot,cufftDoubleReal *P)
{
// int ind = blockIdx.x * blockDim.x + threadIdx.x;
int ibl=blockIdx.y+blockIdx.x*gridDim.y;
int ind=threadIdx.x+blockDim.x*ibl;
double ppp;
if(ind<totsize_pad)
{
ppp=C[ind];
A[ind]=ppp*ppp*ppp/3.0-ppp*ppp/2.0+VVnot*P[ind];
}
}
__host__ double energy(cufftDoubleReal *dbbff,cufftDoubleReal *dppsi, double *ddqq,
cufftDoubleReal *hbbff,cufftDoubleReal *hppsi,
int llx,int lly,int totsize,int totsize_pad,int totsize_invspa,
double ddB,double BBx,cufftHandle pprc,cufftHandle ppcr,dim3 ggrid, dim3 tthreads,
double VVnot,cufftDoubleReal *dppin)
{
CUDA_CHECK( cudaMemcpy(dbbff, dppsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToDevice) );
cudaThreadSynchronize();
cufftExecD2Z(pprc,dbbff,(cufftDoubleComplex*)dbbff);
cudaThreadSynchronize();
kexcess<<<ggrid,tthreads>>>((cufftDoubleComplex*)dbbff,ddqq, totsize_invspa,totsize,ddB,BBx);
cufftExecZ2D(ppcr,(cufftDoubleComplex*)dbbff, dbbff);
totene<<<ggrid,tthreads>>>(dbbff,dppsi, totsize_pad,VVnot,dppin);
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(hbbff, dbbff, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) );
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(hppsi, dppsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) );
double enne=0;
double ppmm=0;
int count=0;
for(int i=0;i<llx;i++)
{
for(int j=0;j<2*(lly/2+1);j++)
{
if(j<lly)
{
enne=enne+hbbff[count];
ppmm=ppmm+hppsi[count];
}
count=count+1;
}}
// printf(" %66.60lf %66.60lf\n", enne/(totsize), ppmm/(totsize));
// printf("Average density is %66.60lf\n", ppmm/(totsize));
return enne/(double) totsize;
}
__host__ void update(cufftDoubleReal *dbbff,cufftDoubleReal *dppsi, double *ddqq,
int llx,int lly,int totsize,int totsize_pad,int totsize_invspa,
double ddB,double BBx,double ddt,
cufftHandle pprc,cufftHandle ppcr,dim3 ggrid, dim3 tthreads,
double VVnot,cufftDoubleReal *dppin)
{
CUDA_CHECK( cudaMemcpy(dbbff, dppsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToDevice) );
cudaThreadSynchronize();
nonlinterm<<<ggrid,tthreads>>>(dbbff,dppsi, totsize_pad,VVnot,dppin);
cudaThreadSynchronize();
cufftExecD2Z(pprc,dbbff,(cufftDoubleComplex*)dbbff);
cudaThreadSynchronize();
cufftExecD2Z(pprc,dppsi,(cufftDoubleComplex*)dppsi);
cudaThreadSynchronize();
kupdt<<<ggrid,tthreads>>>((cufftDoubleComplex*)dppsi,(cufftDoubleComplex*)dbbff,
ddqq,totsize_invspa,totsize,ddB,BBx,ddt);
cudaThreadSynchronize();
cufftExecZ2D(ppcr,(cufftDoubleComplex*)dppsi,dppsi);
cudaThreadSynchronize();
}
int main(void)
{
cudaSetDevice(1);
size_t free, total;
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB at the beginning\n",free/1024,total/1024);
const int lcellx=16;
const int ncellx=128; //384; //,ncelly=ncellx,ncellz=ncellx;
const int lx=ncellx*lcellx,ly=lx;
const int totsize=lx*ly,totsize_pad=lx*2*(ly/2+1),totsize_invspa=lx*(ly/2+1);
const double Pi=acos(-1.0);
const double dt=0.5;
const double Vmax=0.013,Vmin=0.0;
const double at=1.0023048*7.255197456936871; //7.255197456936871;
const double alpha=0.945; //60.0/64.0; // (double)362/(double)384;
const double ap=at/alpha;
const double qt=2*acos(-1.0)/at, qp=2*Pi/ap;
const double dx=ap/((double)lcellx),dy=dx*sqrt(3.0)/2.0;
// const double dx=at/((double)lcellx),dy=dx*sqrt(3.0)/2.0;
const double Bx=1.0,delB=0.02;
const double radi=30.0;
const int nend=3000,nmes=500;
const int nsteps=60;
int ret;
static double hppp[lx][ly];
double pos[ncellx*ncellx*4][2];
char conf[11],part[11],struc[10];
FILE * pFile;
double tmpp;
int iip,iim,jjp,jjm;
int cpart;
const double g1x=(2*Pi/ap);
const double g1y=-(2*Pi/ap)/sqrt(3);
const double g2x=-(2*Pi/ap);
const double g2y=-(2*Pi/ap)/sqrt(3);
const double g3x=0;
const double g3y=(2*Pi/ap)/(sqrt(3)/2);
double kx,ky;
double ene[nmes+1],dpp[nmes+1],ppmm,eeii;
clock_t start, end;
double cpu_time_used;
double Vnot;
static cufftDoubleReal hbff[totsize_pad],hpsi[totsize_pad],hpin[totsize_pad];
static cufftDoubleComplex hkpsi[lx][ly/2+1];
static double hqq[totsize_invspa];
double *dqq;
cufftDoubleReal *dpsi,*dbff,*dpin;
cufftHandle prc,pcr;
start = clock();
printf("\n");
printf(" %d KB memory which will be allocated direct by the user on the device. lx= %d ly= %d\n",
(3*totsize_pad+totsize_invspa)*sizeof(double)/1024,lx,ly);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB before the cufft plans are made .\n",free/1024,total/1024);
cufftPlan2d(&prc,lx,ly,CUFFT_D2Z);
cufftPlan2d(&pcr,lx,ly,CUFFT_Z2D);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB after the cufft plans are made.\n",free/1024,total/1024);
dim3 grid,threads;
threads.x=1024;
threads.y=1;
threads.z=1;
grid.x=(int)ceil((double)lx/sqrt((double)threads.x));
grid.y=(int)ceil((double)2*(ly/2+1)/sqrt((double)threads.x));
grid.z=1;
printf("\n");
printf("%d %d %d %d\n",grid.x,grid.y,threads.x*grid.x*grid.y,totsize_pad);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB before allocations.\n",free/1024,total/1024);
CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dpin, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dbff, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dqq, sizeof(double)*totsize_invspa) );
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB after allocations.\n",free/1024,total/1024);
int count;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
hpin[count]=2*(cos(g1x*i*dx+g1y*j*dy)+cos(g2x*i*dx+g2y*j*dy)+cos(g3x*i*dx+g3y*j*dy));
// hpin[count]=4*(cos(qp*i*dx)*cos(qp*j*dy/sqrt(3.0))+0.5*cos(2*qp*j*dy/sqrt(3.0)));
}
else
{
hpsi[count]=0;
}
count=count+1;
}}
// printf("%66.60lf\n", ppmm/(double)totsize);
count=0;
for(int i=0;i<lx;i++)
{
if(i<=lx/2)
{kx=i*2*Pi/((double)lx*dx);}
else
{kx=(i-lx)*2*Pi/((double)lx*dx);}
for(int j=0;j<=ly/2;j++)
{
ky=j*2*Pi/((double)ly*dy);
hqq[count]=-(kx*kx+ky*ky);
count=count+1;
}}
CUDA_CHECK( cudaMemcpy(dqq, hqq, sizeof(double)*totsize_invspa,cudaMemcpyHostToDevice) );
CUDA_CHECK( cudaMemcpy(dpin, hpin, sizeof(double)*totsize_pad,cudaMemcpyHostToDevice) );
double ReA;
double ImA;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
if(pow((i-lx/2)*dx,2)+pow((j-ly/2)*dy,2)<=at*at*radi*radi)
{
ReA=0.2067*cos(2*Pi/3); // 0.185558839341884 //15 //0.221536478409509
ImA=0.2067*sin(2*Pi/3);
}
else
{
ReA=0.2067*cos(2*Pi/3);
ImA=0.2067*sin(-2*Pi/3);
}
// hpsi[count]=-hpin[count];
hpsi[count]=2*(ReA*(cos(g1x*i*dx+g1y*j*dy)+cos(g2x*i*dx+g2y*j*dy)+cos(g3x*i*dx+g3y*j*dy))
-ImA*(sin(g1x*i*dx+g1y*j*dy)+sin(g2x*i*dx+g2y*j*dy)+sin(g3x*i*dx+g3y*j*dy)));
ppmm=ppmm+hpsi[count];
}
else
{
hpsi[count]=0;
}
count=count+1;
}}
ppmm=ppmm/(double)totsize;
printf("%66.60lf\n",ppmm);
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
hpsi[count]=hpsi[count]-ppmm;
}
else
{
hpsi[count]=0;
}
count=count+1;
}}
pFile=fopen("initpsione.txt","w");
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
fprintf(pFile,"%61.56lf\n",hpsi[count]);
}
count=count+1;
}}
fclose(pFile);
CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(double)*totsize_pad,cudaMemcpyHostToDevice) );
ret=sprintf(conf,"psi000.txt");
pFile=fopen(conf,"w");
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
fprintf(pFile,"%61.56lf\n",hpsi[count]);
}
count=count+1;
}}
fclose(pFile);
Vnot=Vmax;
eeii=energy(dbff,dpsi,dqq,hbff,hpsi,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,
prc,pcr,grid,threads,Vnot,dpin);
printf("At starting energy is %66.60lf\n",eeii);
ene[0]=energy(dbff,dpsi,dqq,hbff,hpsi,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,
prc,pcr,grid,threads,Vnot,dpin);
pFile=fopen("enespeed.txt","a");
fprintf(pFile,"%66.60lf %66.60lf %66.60lf\n",Vnot,ene[0],dpp[0]);
fclose(pFile);
for(int sss=1;sss<=nmes;sss++)
{
// Vnot=Vmin+(double)sss*(Vmax-Vmin)/(double)nsteps;
for(int n=1;n<=nend;n++)
{
/*
eeii=energy(dbff,dpsi,dqq,hbff,hpsi,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,
prc,pcr,grid,threads,Vnot,dpin);
printf("Before starting %d energy is %66.60lf\n",sss,eeii);
*/
// Start update
update(dbff,dpsi,dqq,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,dt,prc,pcr,grid,threads,
Vnot,dpin);
// End update
}
CUDA_CHECK( cudaMemcpy(hpsi, dpsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) );
dpp[sss]=0;
/*
for(int n=1;n<=nav;n++)
{
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
hbff[count]=hpsi[count];
}
count=count+1;
}}
update(dbff,dpsi,dqq,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,dt,prc,pcr,grid,threads,
Vnot,dpin);
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(hpsi, dpsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToHost) );
cudaThreadSynchronize();
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
dpp[sss]=dpp[sss]+fabs(hbff[count]-hpsi[count]);
}
count=count+1;
}}
}
*/
// Start of energy function
ene[sss]=energy(dbff,dpsi,dqq,hbff,hpsi,lx,ly,totsize,totsize_pad,totsize_invspa,delB,Bx,
prc,pcr,grid,threads,Vnot,dpin);
// dpp[sss]=dpp[sss]/(dt*(double)nav*(double)totsize);
pFile=fopen("enespeed.txt","a");
fprintf(pFile,"%66.60lf %66.60lf %66.60lf\n",Vnot,ene[sss],dpp[sss]);
fclose(pFile);
// End energy calculation
// printf("%d \n",sss);
if(sss<10)
{
ret=sprintf(part,"ppp00%d.txt",sss);
ret=sprintf(conf,"psi00%d.txt",sss);
ret=sprintf(struc,"sk00%d.txt",sss);
}
else if(sss<100)
{
ret=sprintf(part,"ppp0%d.txt",sss);
ret=sprintf(conf,"psi0%d.txt",sss);
ret=sprintf(struc,"sk0%d.txt",sss);
}
else if(sss<1000)
{
ret=sprintf(part,"ppp%d.txt",sss);
ret=sprintf(conf,"psi%d.txt",sss);
ret=sprintf(struc,"sk%d.txt",sss);
}
// printf("%s %s\n", conf,struc);
pFile=fopen(conf,"w");
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
fprintf(pFile,"%61.56lf\n",hpsi[count]);
hppp[i][j]=hpsi[count];
}
count=count+1;
}}
fclose(pFile);
cpart=0;
pFile=fopen(part,"w");
for(int i=0;i<lx;i++)
{
iip=i+1;
if(iip<0)
{
iip=iip+lx;
}
else if(iip>=lx)
{
iip=iip-lx;
}
iim=i-1;
if(iim<0)
{
iim=iim+lx;
}
else if(iip>=lx)
{
iim=iim-lx;
}
for(int j=0;j<ly;j++)
{
jjp=j+1;
if(jjp<0)
{
jjp=jjp+lx;
}
else if(jjp>=ly)
{
jjp=jjp-ly;
}
jjm=j-1;
if(jjm<0)
{
jjm=jjm+lx;
}
else if(jjm>=ly)
{
jjm=jjm-ly;
}
if(hppp[i][j]>=hppp[iip][j] & hppp[i][j]>=hppp[iim][j] &
hppp[i][j]>=hppp[i][jjp] & hppp[i][j]>=hppp[i][jjm]&
hppp[i][j]>=hppp[iim][jjm] & hppp[i][j]>=hppp[iim][jjp] &
hppp[i][j]>=hppp[iip][jjm] & hppp[i][j]>=hppp[iip][jjp] )
{
pos[cpart][0]=i*dx;
pos[cpart][1]=j*dy;
cpart=cpart+1;
fprintf(pFile,"%66.60lf %66.60lf\n",i*dx,j*dy);
}
}
}
fclose(pFile);
printf("%d %d\n",sss,cpart);
CUDA_CHECK( cudaMemcpy(dbff, dpsi, sizeof(double)*totsize_pad,cudaMemcpyDeviceToDevice) );
cudaThreadSynchronize();
cufftExecD2Z(prc,dbff,(cufftDoubleComplex*)dbff);
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(hkpsi,(cufftDoubleComplex*)dbff, sizeof(cufftDoubleComplex)*totsize_invspa,cudaMemcpyDeviceToHost) );
/*
pFile=fopen(struc,"w");
count=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<(ly/2+1);j++)
{
fprintf(pFile,"%61.56lf\n",(hkpsi[i][j].x*hkpsi[i][j].x+hkpsi[i][j].y*hkpsi[i][j].y)/((double)totsize*(double)totsize));
count=count+1;
}}
fclose(pFile);
*/
// printf("%s %s %66.60lf \n", conf,struc);
printf("%s %s %66.60lf \n", conf,struc,ene[sss]);
// printf(" %66.60lf %66.60lf\n", ene[sss],dpp[sss]);
}
CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dbff) );
CUDA_CHECK( cudaFree((void*)dqq) );
cufftDestroy(prc);
cufftDestroy(pcr);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%66.60lf \n",cpu_time_used);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%d KB free of total %d KB at the end.\n",free/1024,total/1024);
/*
pFile=fopen("enespeed.txt","w");
for(int sss=0;sss<=nsteps;sss++)
{
Vnot=Vmin+(double)sss*(Vmax-Vmin)/(double)nsteps;
fprintf(pFile,"%66.60lf %66.60lf %66.60lf\n",Vnot,ene[sss],dpp[sss]/(dt*(double)nav*(double)totsize));
}
fclose(pFile);
*/
return 0;
}
So for inplace transforms: you have
const int lx=ncellx*lcellx,ly=lx;
const int totsize=lx*ly,totsize_pad=lx*2*(ly/2+1),totsize_invspa=lx*(ly/2+1);
static cufftDoubleReal hbff[totsize_pad],hpsi[totsize_pad],hpin[totsize_pad];
static cufftDoubleComplex hkpsi[lx][ly/2+1];
static double hqq[totsize_invspa];
double *dqq;
cufftDoubleReal *dpsi,*dbff,*dpin;
cufftHandle prc,pcr;
The allocation is done like this on the gpu:
CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dpin, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dbff, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dqq, sizeof(double)*totsize_invspa) );
Here is an example to initilize correct with padding:
double ReA;
double ImA;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
if(pow((i-lx/2)*dx,2)+pow((j-ly/2)*dy,2)<=at*at*radi*radi)
{
ReA=0.2067*cos(2*Pi/3); // 0.185558839341884 //15 //0.221536478409509
ImA=0.2067*sin(2*Pi/3);
}
else
{
ReA=0.2067*cos(2*Pi/3);
ImA=0.2067*sin(-2*Pi/3);
}
// hpsi[count]=-hpin[count];
hpsi[count]=2*(ReA*(cos(g1x*i*dx+g1y*j*dy)+cos(g2x*i*dx+g2y*j*dy)+cos(g3x*i*dx+g3y*j*dy))
-ImA*(sin(g1x*i*dx+g1y*j*dy)+sin(g2x*i*dx+g2y*j*dy)+sin(g3x*i*dx+g3y*j*dy)));
ppmm=ppmm+hpsi[count];
}
else
{
hpsi[count]=0;
}
count=count+1;
}}
ppmm=ppmm/(double)totsize;
and Here is an sxample how to prepare the k filter:
count=0;
for(int i=0;i<lx;i++)
{
if(i<=lx/2)
{kx=i*2*Pi/((double)lx*dx);}
else
{kx=(i-lx)*2*Pi/((double)lx*dx);}
for(int j=0;j<=ly/2;j++)
{
ky=j*2*Pi/((double)ly*dy);
hqq[count]=-(kx*kx+ky*ky);
count=count+1;
}}