Problem with kernel settings

Why can't I set the "setParam" arguments int __nx, int __ny, int __mx, int __my to arbitrary values, such as __nx = __ny = __mx = __my = 100?
The program either reports an error (for example, "kernel launch timed out and was terminated") or produces an incorrect result.
Does anyone have any ideas?

project source:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include "diff2.h" // function prototypes

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define e2 1973.269631/137.035999679 // e^2 = hbar*c*alpha ~ 14.4 eV*Angstrom
#define pi2 6.2831853071795864769252867665590057683944

inline __host__ void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
__host__ __device__ inline float square(const float &x){ return x*x; }

//device:
__constant__ float d6,sigma2;
__constant__ int Atoms, Z1, Z2, Nx2,Ny2,Mx2,My2;
float *tmpBuf__dev;
float *Xj_dev,*Yj_dev;
float *res_dev;
float *Pi2dnx_ax__dev, *Pi2dny_ay__dev, *Pi2dmx_ax__dev, *Pi2dmy_ay__dev,
*Pi2nx_ax__dev, *Pi2ny_ay__dev, *Pi2mx_ax__dev, *Pi2my_ay__dev;

//host:
int blocksDimX,blocksDimY,maxThreadsPerBlock;
float ax,ay,az,d6_;

void setParam(const float *xj, const float *yj, float __d, float __sigma2, int __Z1, int __Z2, int atoms,
float __ax, float __ay, float __az,
int __nx, int __ny,
int __mx, int __my)
{
cudaDeviceProp devProp;
cudaGetDeviceProperties ( &devProp, 0);
printf("Compute capability : %d.%d\n", devProp.major, devProp.minor);
printf("Name : %s\n", devProp.name);
printf("Total global memory : %zu\n", devProp.totalGlobalMem);
printf("Shared memory per block: %zu\n", devProp.sharedMemPerBlock);
printf("Registers per block : %d\n", devProp.regsPerBlock);
printf("Warp size : %d\n", devProp.warpSize);
printf("Max threads per block : %d\n", devProp.maxThreadsPerBlock);
printf("Total constant memory : %zu\n", devProp.totalConstMem);
printf("Max threads dimensions: x = %d, y = %d, z = %d\n",
    devProp.maxThreadsDim[0],
    devProp.maxThreadsDim[1],
    devProp.maxThreadsDim[2]);

 printf("Max grid size: x = %d, y = %d, z = %d\n", 
   devProp.maxGridSize[0], 
   devProp.maxGridSize[1], 
   devProp.maxGridSize[2]);


int __nx2 = 2*__nx,__ny2 = 2*__ny;
int __mx2 = 2*__mx,__my2 = 2*__my;
d6_ = powf(__d,6.0);
ax = __ax;
ay = __ay;
az = __az;

float *__Pi2dnx_ax = new float[__nx2];	 float *__Pi2nx_ax = new float[__nx2];
float *__Pi2dny_ay = new float[__ny2];    float *__Pi2ny_ay = new float[__ny2];
float *__Pi2dmx_ax = new float[__mx2];    float *__Pi2mx_ax = new float[__mx2];
float *__Pi2dmy_ay = new float[__my2];    float *__Pi2my_ay = new float[__my2];

for(int i=0; i<__nx2; i++) {
    __Pi2nx_ax[i] = float(pi2) * (i-__nx)/ax;
    __Pi2dnx_ax[i] = __Pi2nx_ax[i]*__d;
}

for(int i=0; i<__ny2; i++) {
    __Pi2ny_ay[i] = float(pi2) * (i-__ny)/ay;
    __Pi2dny_ay[i] = __Pi2ny_ay[i]*__d;
}

for(int i=0; i<__mx2; i++) {
    __Pi2mx_ax[i] = float(pi2) * (i-__mx)/ax;
    __Pi2dmx_ax[i] = __Pi2mx_ax[i]*__d;
}

for(int i=0; i<__my2; i++) {
    __Pi2my_ay[i] = float(pi2) * (i-__my)/ay;
    __Pi2dmy_ay[i] = __Pi2my_ay[i]*__d;
}

//alloc in device
gpuErrchk(cudaMalloc((void**)&Xj_dev,sizeof(float)*atoms));
gpuErrchk(cudaMalloc((void**)&Yj_dev,sizeof(float)*atoms));
gpuErrchk(cudaMalloc((void**)&res_dev,sizeof(float)));
gpuErrchk(cudaMalloc((void**)&Pi2dnx_ax__dev,sizeof(float)*__nx2));
gpuErrchk(cudaMalloc((void**)&Pi2dny_ay__dev,sizeof(float)*__ny2));
gpuErrchk(cudaMalloc((void**)&Pi2dmx_ax__dev,sizeof(float)*__mx2));
gpuErrchk(cudaMalloc((void**)&Pi2dmy_ay__dev,sizeof(float)*__my2));

gpuErrchk(cudaMalloc((void**)&Pi2nx_ax__dev,sizeof(float)*__nx2));
gpuErrchk(cudaMalloc((void**)&Pi2ny_ay__dev,sizeof(float)*__ny2));
gpuErrchk(cudaMalloc((void**)&Pi2mx_ax__dev,sizeof(float)*__mx2));
gpuErrchk(cudaMalloc((void**)&Pi2my_ay__dev,sizeof(float)*__my2));

//copy mem to device
gpuErrchk(cudaMemcpy(Xj_dev,xj,sizeof(float)*atoms,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Yj_dev,yj,sizeof(float)*atoms,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dnx_ax__dev,__Pi2dnx_ax,sizeof(float)*__nx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dny_ay__dev,__Pi2dny_ay,sizeof(float)*__ny2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dmx_ax__dev,__Pi2dmx_ax,sizeof(float)*__mx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2dmy_ay__dev,__Pi2dmy_ay,sizeof(float)*__my2,cudaMemcpyHostToDevice));

gpuErrchk(cudaMemcpy(Pi2nx_ax__dev,__Pi2nx_ax,sizeof(float)*__nx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2ny_ay__dev,__Pi2ny_ay,sizeof(float)*__ny2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2mx_ax__dev,__Pi2mx_ax,sizeof(float)*__mx2,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(Pi2my_ay__dev,__Pi2my_ay,sizeof(float)*__my2,cudaMemcpyHostToDevice));

gpuErrchk(cudaMemcpyToSymbol(d6,&d6_,sizeof(float),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(sigma2,&__sigma2,sizeof(float),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Atoms,&atoms,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Nx2,&__nx2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Ny2,&__ny2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Mx2,&__mx2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(My2,&__my2,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Z1,&__Z1,sizeof(int),0,cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(Z2,&__Z2,sizeof(int),0,cudaMemcpyHostToDevice));

maxThreadsPerBlock = __nx*2; // NB: the block edge is derived from __nx, so Dnuclxx() launches (2*__nx)^2 threads per block
blocksDimX = ((__nx2+maxThreadsPerBlock-1)/maxThreadsPerBlock);
blocksDimY = ((__ny2+maxThreadsPerBlock-1)/maxThreadsPerBlock);
std::cout<<"Grid: "<<blocksDimX<<"  "<<blocksDimY<<std::endl;

//gpuErrchk(cudaMalloc((void**)&tmpBuf__dev,sizeof(float)*blocksDimX*blocksDimY));

}

__device__ float a_tf(int __Z1, int __Z2) // Thomas-Fermi screening radius
{
    return 0.8853413770001135*0.52917720859*powf(__Z2,-1.0/3.0);
}

__device__ float V(float __gg, int __Z1, int __Z2) // Moliere potential
{
    float a = a_tf(__Z1,__Z2);
    return 2.0*pi2*__Z1*__Z2*e2*(0.35 / (square(0.3/a) + __gg) + 0.55 / (square(1.2/a) + __gg) + 0.1 / (square(6.0/a) + __gg));
}

__device__ float VV(float __gg, float __qq, int __Z1, int __Z2) // Moliere potential
{
    float a = a_tf(__Z1,__Z2);
    return square(2.0*pi2*__Z1*__Z2*e2) * (0.35 / (square(0.3/a) + __gg) + 0.55 / (square(1.2/a) + __gg) + 0.1 / (square(6.0/a) + __gg)) *
           (0.35 / (square(0.3/a) + __qq) + 0.55 / (square(1.2/a) + __qq) + 0.1 / (square(6.0/a) + __qq));
}

#define LOCAL_1D_SUM(npos, count, tmpBuf)         \
for (unsigned int i = count; i>>1; i -= i>>1)     \
{                                                 \
    if (npos < i>>1)                              \
    {                                             \
        tmpBuf[npos] += tmpBuf[i-1-npos];         \
    }                                             \
    __syncthreads();                              \
}

__global__ void kernelxx(float2 xy, float *res, float *xj, float *yj,
    float *Pi2dnx_ax, float *Pi2dny_ay, float *Pi2dmx_ax, float *Pi2dmy_ay,
    float *Pi2nx_ax, float *Pi2ny_ay, float *Pi2mx_ax, float *Pi2my_ay, float *__tmpBuf)
{
unsigned int npos = threadIdx.y*blockDim.x + threadIdx.x;
/*unsigned int x = threadIdx.x + blockIdx.x*blockDim.x;
unsigned int y = threadIdx.y + blockIdx.y*blockDim.y;
unsigned int npos = x + y*blockDim.x + gridDim.x;
*/

unsigned int count = blockDim.x*blockDim.y;

extern __shared__ float tmpBuf[];
tmpBuf[npos] = 0.0;
//printf("threadIdx.x = %i blockDim.x=%i %i %i\n",threadIdx.x,blockDim.x,threadIdx.y,blockDim.y);
//printf("%i %i %i %i\n",x,y,npos,count);

for(int i = threadIdx.x; i<Nx2; i+=blockDim.x)
    for(int j = threadIdx.y; j<Ny2; j+=blockDim.y)
        for(int k = 0; k<Mx2; k++)
            for(int l = 0; l<My2; l++)
                for(int a = 0; a<Atoms; a++)
                {
                    float gg = square(Pi2nx_ax[i]) + square(Pi2ny_ay[j]);
                    float qq = square(Pi2mx_ax[k]) + square(Pi2my_ay[l]);
                    float squaregq = square(Pi2nx_ax[i] - Pi2mx_ax[k]) + square(Pi2ny_ay[j] - Pi2my_ay[l]);
                    float VgVqexp = V(gg,Z1,Z2)*V(qq,Z1,Z2)*expf(-sigma2*squaregq/2.);

                    tmpBuf[npos] += Pi2dnx_ax[i]*Pi2dmx_ax[k]*VgVqexp *
                                    sinf(Pi2dnx_ax[i]*(xy.x-xj[a]) + Pi2dny_ay[j]*(xy.y-yj[a])) *
                                    sinf(Pi2dmx_ax[k]*(xy.x-xj[a]) + Pi2dmy_ay[l]*(xy.y-yj[a]));
                }

__syncthreads();
LOCAL_1D_SUM(npos, count, tmpBuf)
if (!npos) res[0] = tmpBuf[0];

}

float Dnuclxx(const float &x, const float &y)
{
dim3 threads(maxThreadsPerBlock,maxThreadsPerBlock); // NB: requests (2*__nx)^2 threads per block
dim3 blocks(blocksDimX,blocksDimY);
size_t len = (maxThreadsPerBlock*maxThreadsPerBlock)*sizeof(float);

kernelxx<<<blocks, threads,len>>>(make_float2(x,y), res_dev, Xj_dev , Yj_dev, 
					 Pi2dnx_ax__dev, Pi2dny_ay__dev, Pi2dmx_ax__dev, Pi2dmy_ay__dev,
					 Pi2nx_ax__dev,  Pi2ny_ay__dev,  Pi2mx_ax__dev,  Pi2my_ay__dev, tmpBuf__dev);
cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
float res = 0.0;
cudaMemcpy(&res,res_dev,sizeof(float),cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
return az*res/d6_;

}

void destroyAll()
{
gpuErrchk(cudaFree(res_dev));
gpuErrchk(cudaFree(Xj_dev));
gpuErrchk(cudaFree(Yj_dev));

gpuErrchk(cudaFree(Pi2dnx_ax__dev));
gpuErrchk(cudaFree(Pi2dny_ay__dev));
gpuErrchk(cudaFree(Pi2dmx_ax__dev));
gpuErrchk(cudaFree(Pi2dmy_ay__dev));

gpuErrchk(cudaFree(Pi2nx_ax__dev));
gpuErrchk(cudaFree(Pi2ny_ay__dev));
gpuErrchk(cudaFree(Pi2mx_ax__dev));
gpuErrchk(cudaFree(Pi2my_ay__dev));
cudaDeviceReset();

}

It looks like you are running on Windows. The message:

"kernel launch timed out and was terminated"

usually indicates you have hit a Windows TDR timeout. You can read this thread:

https://devtalk.nvidia.com/default/topic/459869/cuda-programming-and-performance/-quot-display-driver-stopped-responding-and-has-recovered-quot-wddm-timeout-detection-and-recovery-/

for more info.
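
As a quick sanity check, you can also ask the runtime whether the watchdog applies to your device, and catch the timeout explicitly after a launch instead of getting garbage results. A minimal sketch (the dummy kernel is purely illustrative):

#include <cstdio>
#include <cuda_runtime.h>

// Trivial kernel, just to have something to launch
__global__ void dummy() {}

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // Non-zero when the OS watchdog (e.g. Windows WDDM TDR) limits kernel run time
    printf("kernelExecTimeoutEnabled: %d\n", prop.kernelExecTimeoutEnabled);

    dummy<<<1, 32>>>();
    // A watchdog kill typically surfaces as cudaErrorLaunchTimeout from one of these calls
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    printf("launch status: %s\n", cudaGetErrorString(err));
    return 0;
}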

Thanks! The TDR timeout fix solved part of my problem.

Why does the kernel work when launched with these parameters:
dim3 threads(32,32);
dim3 blocks(1,1);
kernelxx<<<blocks, threads>>>();

but not when launched with these:
dim3 threads(64,64);
dim3 blocks(1,1);
kernelxx<<<blocks, threads>>>();

How can I fix it? Any ideas?

64,64 implies 4096 threads per block. That is not supported on any current CUDA GPU. 32,32 implies 1024 threads per block, which is the maximum on cc2.0 and newer GPUs (currently).
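
If you want the failure to be loud instead of silent, compare the requested block against devProp.maxThreadsPerBlock before launching, and check the error state afterwards; an oversized block is rejected at launch time. A minimal sketch:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy() {}

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    dim3 threads(64, 64); // 4096 threads per block: over the limit on current hardware
    unsigned int requested = threads.x * threads.y * threads.z;
    if ((int)requested > prop.maxThreadsPerBlock)
        printf("block of %u threads exceeds maxThreadsPerBlock = %d\n",
               requested, prop.maxThreadsPerBlock);

    dummy<<<1, threads>>>();
    // An oversized block is rejected at launch with "invalid configuration argument"
    printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}

Note that since your setParam() sets maxThreadsPerBlock = __nx*2 and Dnuclxx() launches a (maxThreadsPerBlock, maxThreadsPerBlock) block, any __nx above 16 pushes the block past 1024 threads, which is exactly why __nx = 100 fails.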