Why I can`t set the in the “setParam” Options “int __nx, int __ny, int __mx, int __my” arbitrary numbers, such __nx = __ ny = __ mx = __ my = 100?
The program displays an error (for example, “kernel launch timed out and was terminated”) or an incorrect result.
Maybe someone have any ideas?
project source:
#include <stdio.h>
#include <stdlib.h>
#include
#include <math.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include “diff2.h”//prototype function
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#define e2 1973.269631494949494949/137.035999679949494949494949494
#define pi2 6.2831853071795864769252867665590057683944
__inline host void gpuAssert(cudaError_t code, char *file, int line,
bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,“GPUassert: %s %s %d\n”, cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), FILE, LINE); }
host device inline float square(const float &x){ return x*x; }
//device:
constant float d6,sigma2;
constant int Atoms, Z1, Z2, Nx2,Ny2,Mx2,My2;
float *tmpBuf__dev;
float *Xj_dev,*Yj_dev;
float *res_dev;
float *Pi2dnx_ax__dev, *Pi2dny_ay__dev, *Pi2dmx_ax__dev, *Pi2dmy_ay__dev,
*Pi2nx_ax__dev, *Pi2ny_ay__dev, *Pi2mx_ax__dev, *Pi2my_ay__dev;
//host:
int blocksDimX,blocksDimY,maxThreadsPerBlock;
float ax,ay,az,d6_;
void setParam(const float *xj, const float *yj, float __d, float __sigma2, int __Z1, int __Z2, int atoms,
float __ax, float __ay, float __az,
int __nx, int __ny,
int __mx, int __my)
{
cudaDeviceProp devProp;
cudaGetDeviceProperties ( &devProp, 0);
printf ( “Compute capability : %d.%d\n”, devProp.major, devProp.minor );
printf ( “Name : %s\n”, devProp.name );
printf ( “Total Global Memory : %u\n”, devProp.totalGlobalMem );
printf ( “Shared memory per block: %d\n”, devProp.sharedMemPerBlock );
printf ( “Registers per block : %d\n”, devProp.regsPerBlock );
printf ( “Warp size : %d\n”, devProp.warpSize );
printf ( “Max threads per block : %d\n”, devProp.maxThreadsPerBlock );
printf ( “Total constant memory : %d\n”, devProp.totalConstMem );
printf(“Max threads dimensions: x = %d, y = %d, z = %d\n”,
devProp.maxThreadsDim[0],
devProp.maxThreadsDim[1],
devProp.maxThreadsDim[2]);
printf("Max grid size: x = %d, y = %d, z = %d\n",
devProp.maxGridSize[0],
devProp.maxGridSize[1],
devProp.maxGridSize[2]);
int __nx2 = 2*__nx,__ny2 = 2*__ny;
int __mx2 = 2*__mx,__my2 = 2*__my;
d6_ = powf(__d,6.0);
ax = __ax;
ay = __ay;
az = __az;
float *__Pi2dnx_ax = new float[__nx2]; float *__Pi2nx_ax = new float[__nx2];
float *__Pi2dny_ay = new float[__ny2]; float *__Pi2ny_ay = new float[__ny2];
float *__Pi2dmx_ax = new float[__mx2]; float *__Pi2mx_ax = new float[__mx2];
float *__Pi2dmy_ay = new float[__my2]; float *__Pi2my_ay = new float[__my2];
for(int i=0; i<__nx2; i++) {
__Pi2nx_ax[i] = float(pi2) * (i-__nx)/ax;
__Pi2dnx_ax[i] = __Pi2nx_ax[i]*__d;
}
for(int i=0; i<__ny2; i++) {
__Pi2ny_ay[i] = float(pi2) * (i-__ny)/ay;
__Pi2dny_ay[i] = __Pi2ny_ay[i]*__d;
}
for(int i=0; i<__mx2; i++) {
__Pi2mx_ax[i] = float(pi2) * (i-__mx)/ax;
__Pi2dmx_ax[i] = __Pi2mx_ax[i]*__d;
}
for(int i=0; i<__my2; i++) {
__Pi2my_ay[i] = float(pi2) * (i-__my)/ay;
__Pi2dmy_ay[i] = __Pi2my_ay[i]*__d;
}
//alloc in device
gpuErrchk(cudaMalloc((void**)&Xj_dev,sizeof(float)*atoms));
gpuErrchk(cudaMalloc((void**)&Yj_dev,sizeof(float)*atoms));
gpuErrchk(cudaMalloc((void**)&res_dev,sizeof(float)));
gpuErrchk(cudaMalloc((void**)&Pi2dnx_ax__dev,sizeof(float)*__nx2));
gpuErrchk(cudaMalloc((void**)&Pi2dny_ay__dev,sizeof(float)*__ny2));
gpuErrchk(cudaMalloc((void**)&Pi2dmx_ax__dev,sizeof(float)*__mx2));
gpuErrchk(cudaMalloc((void**)&Pi2dmy_ay__dev,sizeof(float)*__my2));
gpuErrchk(cudaMalloc((void**)&Pi2nx_ax__dev,sizeof(float)*__nx2));
gpuErrchk(cudaMalloc((void**)&Pi2ny_ay__dev,sizeof(float)*__ny2));
gpuErrchk(cudaMalloc((void**)&Pi2mx_ax__dev,sizeof(float)*__mx2));
gpuErrchk(cudaMalloc((void**)&Pi2my_ay__dev,sizeof(float)*__my2));
//copy mem to device
gpuErrchk(cudaMemcpy(Xj_dev,xj,sizeof(float)*atoms,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Yj_dev,yj,sizeof(float)*atoms,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dnx_ax__dev,__Pi2dnx_ax,sizeof(float)*__nx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dny_ay__dev,__Pi2dny_ay,sizeof(float)*__ny2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dmx_ax__dev,__Pi2dmx_ax,sizeof(float)*__mx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dmy_ay__dev,__Pi2dmy_ay,sizeof(float)*__my2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2nx_ax__dev,__Pi2nx_ax,sizeof(float)*__nx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2ny_ay__dev,__Pi2ny_ay,sizeof(float)*__ny2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2mx_ax__dev,__Pi2mx_ax,sizeof(float)*__mx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2my_ay__dev,__Pi2my_ay,sizeof(float)*__my2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(d6,&d6_,sizeof(float),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(sigma2,&__sigma2,sizeof(float),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Atoms,&atoms,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Nx2,&__nx2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Ny2,&__ny2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Mx2,&__mx2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(My2,&__my2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Z1,&__Z1,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Z2,&__Z2,sizeof(int),0,cudaMemcpyHostToDevice));
maxThreadsPerBlock = __nx*2;
blocksDimX = ((__nx2+maxThreadsPerBlock-1)/maxThreadsPerBlock);
blocksDimY = ((__ny2+maxThreadsPerBlock-1)/maxThreadsPerBlock);
std::cout<<"Grid: "<<blocksDimX<<" "<<blocksDimY<<std::endl;
//gpuErrchk(cudaMalloc((void**)&tmpBuf__dev,sizeof(float)*blocksDimX*blocksDimY));
}
device float a_tf(int __Z1, int __Z2)
{
return 0.88534137700011350.529177208593636363636powf(__Z2,-1.0/3.0);
}
device float V(float __gg, int __Z1, int __Z2) // Molier
{
float a = a_tf(__Z1,__Z2);
return 2.0pi2__Z1*__Z2e2(0.35 / (square(0.3/a) + __gg) + 0.55 / (square(1.2/a) + __gg) + 0.1 / (square(6.0/a) + __gg) );
}
device float VV(float __gg, float __qq, int __Z1, int __Z2) // Molier
{
float a = a_tf(__Z1,__Z2);
return square(2.0pi2__Z1*__Z2e2)(0.35 / (square(0.3/a) + __gg) + 0.55 / (square(1.2/a) + __gg) + 0.1 / (square(6.0/a) + __gg) ) *
(0.35 / (square(0.3/a) + __qq) + 0.55 / (square(1.2/a) + __qq) + 0.1 / (square(6.0/a) + __qq) );
}
#define LOCAL_1D_SUM(npos, count, tmpBuf)
for (unsigned int i = count; i>>1; i -= i>>1)
{
if (npos < i>>1)
{
tmpBuf[npos] += tmpBuf[i-1-npos];
}
__syncthreads();
}
global void kernelxx(float2 xy, float *res, float *xj , float *yj,
float *Pi2dnx_ax, float *Pi2dny_ay, float *Pi2dmx_ax, float *Pi2dmy_ay,
float *Pi2nx_ax, float Pi2ny_ay, float Pi2mx_ax, float Pi2my_ay, float __tmpBuf)
{
unsigned int npos = threadIdx.yblockDim.x+threadIdx.x;
/unsigned int x = threadIdx.x + blockIdx.xblockDim.x;
unsigned int y = threadIdx.y + blockIdx.yblockDim.y;
unsigned int npos = x + yblockDim.x+gridDim.x;/
unsigned int count = blockDim.x*blockDim.y;
extern __shared__ float tmpBuf[];
tmpBuf[npos] = 0.0;
//printf("threadIdx.x = %i blockDim.x=%i %i %i\n",threadIdx.x,blockDim.x,threadIdx.y,blockDim.y);
//printf("%i %i %i %i\n",x,y,npos,count);
for(int i = threadIdx.x; i<Nx2; i+=blockDim.x)
for(int j = threadIdx.y; j<Ny2; j+=blockDim.y)
for(int k = 0; k<Mx2; k++)
for(int l = 0; l<My2; l++)
for(int a = 0; a <Atoms; a++)
{
float gg = square(Pi2nx_ax[i]) + square(Pi2ny_ay[j]);
float qq = square(Pi2mx_ax[k]) + square(Pi2my_ay[l]);
float squaregq = square(Pi2nx_ax[i] - Pi2mx_ax[k]) + square(Pi2ny_ay[j] - Pi2my_ay[l]);
float VgVqexp = V(gg,Z1,Z2)*V(qq,Z1,Z2)*expf(-sigma2*squaregq/2.);
tmpBuf[npos] += Pi2dnx_ax[i]*Pi2dmx_ax[k]*VgVqexp *
sinf(Pi2dnx_ax[i]*(xy.x-xj[a]) + Pi2dny_ay[j] * (xy.y-yj[a])) *
sinf(Pi2dmx_ax[k]*(xy.x-xj[a]) + Pi2dmy_ay[l] * (xy.y-yj[a]));
}
__syncthreads();
LOCAL_1D_SUM(npos, count, tmpBuf)
if (!npos) res[0] = tmpBuf[0];
}
float Dnuclxx(const float &x, const float &y)
{
dim3 threads(maxThreadsPerBlock,maxThreadsPerBlock);
dim3 blocks(blocksDimX,blocksDimY);
size_t len = (maxThreadsPerBlock*maxThreadsPerBlock)*sizeof(float);
kernelxx<<<blocks, threads,len>>>(make_float2(x,y), res_dev, Xj_dev , Yj_dev,
Pi2dnx_ax__dev, Pi2dny_ay__dev, Pi2dmx_ax__dev, Pi2dmy_ay__dev,
Pi2nx_ax__dev, Pi2ny_ay__dev, Pi2mx_ax__dev, Pi2my_ay__dev, tmpBuf__dev);
cudaThreadSynchronize();
float res = 0.0;
cudaMemcpy(&res,res_dev,sizeof(float),cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
return az*res/d6_;
}
void destroyAll()
{
gpuErrchk(cudaFree(res_dev));
gpuErrchk(cudaFree(Xj_dev));
gpuErrchk(cudaFree(Yj_dev));
gpuErrchk(cudaFree(Pi2dnx_ax__dev));
gpuErrchk(cudaFree(Pi2dny_ay__dev));
gpuErrchk(cudaFree(Pi2dmx_ax__dev));
gpuErrchk(cudaFree(Pi2dmy_ay__dev));
gpuErrchk(cudaFree(Pi2nx_ax__dev));
gpuErrchk(cudaFree(Pi2ny_ay__dev));
gpuErrchk(cudaFree(Pi2mx_ax__dev));
gpuErrchk(cudaFree(Pi2my_ay__dev));
cudaDeviceReset();
}