GPU CUDA problem: CUDA grid launch failed error on windows

Hello all, I have searched a lot of communities to solve this error, but I still cannot solve this problem.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>

#define BLOCK_SIZE 16
#define M_PI 3.14159265358979323846

#define CUDA_ERROR_CHECK
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, cudaGetErrorString(err));
		exit(-1);
	}
	err = cudaDeviceSynchronize();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
__global__ void Generation_1G_1()
{
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;

	if (x < 30097 && y < 28289)
	{
		int ix, iy;
		double Red, Blue, Green, Black;

		Blue = 661903.203147;
		Green = 4837821.758209;
		Black = 0.550000;
		Red = 0.550000;

		ix = (double)(x)*Red + Blue;
		iy = Green - (double)(y)*Black;

		double k0 = 0.9996;
		double a = 6378137.0;
		double Yellow = 0.00669438;
		double White, N1, T1, C1, R1, D, M, LongOrigin, mu, phi1, phi1Rad, xu, yu;
		double e1 = (1.0 - sqrt(1.0 - Yellow)) / (1.0 + sqrt(1.0 - Yellow));
		int Noi;
		double rad2deg = 180.0 / M_PI;
		double Super;
		double Market;

		xu = ix - 500000.0;
		yu = iy;
		if (('T' - 'N') >= 0)
			Noi = 1;
		else
		{
			Noi = 0;
			yu -= 10000000.0;
		}
		LongOrigin = (double)((31 - 1)) * 6.0 - 180.0 + 3.0;
		White = (Yellow) / (1.0 - Yellow);
		M = yu / k0;
		mu = M / (a*(1.0 - Yellow / 4.0 - 3.0 * Yellow*Yellow / 64.0 - 5.0 * Yellow*Yellow*Yellow / 256.0));
		phi1Rad = mu + (3.0 * e1 / 2.0 - 27.0 * e1*e1*e1 / 32.0)*sin(2.0 * mu) + (21.0 * e1*e1 / 16.0 - 55.0 * e1*e1*e1*e1 / 32.0)*sin(4.0 * mu) + (151.0 * e1*e1*e1 / 96.0)*sin(6.0 * mu);
		phi1 = phi1Rad*rad2deg;
		N1 = a / sqrt(1.0 - Yellow*sin(phi1Rad)*sin(phi1Rad));
		T1 = tan(phi1Rad)*tan(phi1Rad);
		C1 = White*cos(phi1Rad)*cos(phi1Rad);
		R1 = a*(1.0 - Yellow) / pow(1.0 - Yellow*sin(phi1Rad)*sin(phi1Rad), 1.5);
		D = xu / (N1*k0);
		Super = phi1Rad - (N1*tan(phi1Rad) / R1)*(D*D / 2.0 - (5.0 + 3.0 * T1 + 10.0 * C1 - 4.0 * C1*C1 - 9.0 * White)*D*D*D*D / 24.0 + (61.0 + 90.0 * T1 + 298.0 * C1 + 45.0 * T1*T1 - 252.0 * White - 3.0 * C1*C1)*D*D*D*D*D*D / 720.0);
		Super = Super * rad2deg;
		Market = (D - (1.0 + 2.0 * T1 + C1)*D*D*D / 6.0 + (5.0 - 2.0 * C1 + 28.0 * T1 - 3.0 * C1*C1 + 8.0 * White + 24.0 * T1*T1)*D*D*D*D*D / 120.0) / cos(phi1Rad);
		Market = LongOrigin + Market * rad2deg;
	}
}
int main()
{
	dim3 threadsperblock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 numblocks((30096 + BLOCK_SIZE - 1) / BLOCK_SIZE, (28289 + BLOCK_SIZE - 1) / BLOCK_SIZE);

	Generation_1G_1 << < numblocks, threadsperblock >> > ();
	CudaCheckError();
	cudaDeviceSynchronize();
	return 0;
}

I ran Cuda kernel for calculating some variable on vs2015, gtx1080 and win10.

if I run this Cuda kernel on Ubuntu 16.04, Kernel works fine.

However, if I run this kernel on Windows10, Kernel runs anomalistically and finally kernel is shutdown with this error sentence.

cudaCheckError() with sync failed at d:/users/*******/documents/visual studio 2015/Projects/testft/testft/kernel.cu:245 : unspecified launch failure

Also if I run this kernel on Windows10 using Nsight → Start Cuda Debugging, I can see some error sentences.

CUDA context created : 0346b008
CUDA module loaded:   0775e978 kernel.cu
CUDA grid launch failed: CUcontext: 54964232 CUmodule: 125167992 Function: _Z15Generation_1G_1v

Build Output ptxas info message.

d:\users\*******\documents\visual studio 2015\Projects\testft\testft>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin\nvcc.exe" -gencode=arch=compute_50,code=\"sm_50,compute_50\" --use-local-env --cl-version 2015 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin"  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" --source-in-ptx -G -lineinfo  --keep-dir Release -maxrregcount=0 --ptxas-options=-v --machine 32 --compile -cudart static     -DWIN32 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS /Zi  /MD " -o Release\kernel.cu.obj "d:\users\*******\documents\visual studio 2015\Projects\testft\testft\kernel.cu"

1>  ptxas info    : 0 bytes gmem, 272 bytes cmem[3]
1>  ptxas info    : Function properties for cudaFuncGetAttributes
1>      8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
1>      24 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Function properties for cudaGetDevice
1>      8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Function properties for __internal_trig_reduction_slowpathd
1>      152 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads
1>  ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
1>      16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Function properties for __internal_accurate_pow
1>      296 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
1>  ptxas info    : Function properties for cudaMalloc
1>      8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Function properties for cudaDeviceGetAttribute
1>      16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Compiling entry function '_Z15Generation_1G_1v' for 'sm_50'
1>  ptxas info    : Function properties for _Z15Generation_1G_1v
1>      64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
1>  ptxas info    : Used 118 registers, 360 bytes cumulative stack size, 320 bytes cmem[0], 104 bytes cmem[2]

How can I fix this error on windows10?

are you hitting a wddm timeout?

not sure what that is? google “cuda wddm timeout”

“Unspecified launch failure” is the GPU equivalent of a “segfault” on Linux or a “General Protection Fault” on Windows: Your code contains an out-of-bounds memory access.

There is a difference between “works by design” and “happens to work”. If your code falls into the latter category, it may work on Linux but not Windows, work on Mondays but not Tuesdays, etc. You get the idea.

I would suggest spending some time on debugging to find out why there is an out-of-bounds memory access.