Strange Error in cuda

Hello all ;-)

I am trying to write my first CUDA/C++ program to test performance, and I have run into a problem with it…

Kernel code:

// Brute-force nearest-neighbour search: for every point, find the index of
// the closest other point.
//
// Launch layout: 1D grid of 1D blocks, one thread per point; the launch must
// provide at least `size` threads in total (surplus threads exit immediately).
//
// dev_points  - device array of `size` points (read only)
// dev_indices - device array of `size` ints (written): dev_indices[p] receives
//               the index of the point closest to point p, or -1 when there is
//               no other point (size == 1)
// size        - number of points
//
// Every thread scans all `size` points, so the total work grows quadratically
// with `size`.
__global__ void krnFindClosestGpu(float3 *dev_points, int *dev_indices, int size)
{
	int currentIndex = threadIdx.x + blockDim.x * blockIdx.x;

	// Threads past the end of the data (including every thread when size <= 0)
	// have nothing to do.
	if (currentIndex >= size) return;

	// Load this thread's own point once instead of re-reading it from global
	// memory for every candidate.
	const float3 self = dev_points[currentIndex];

	// Squared distances order the same way as distances, so the square root
	// is not needed to pick the minimum.
	float distSqToClosest = FLT_MAX;
	int closestIndex = -1;

	for (int i = 0; i < size; ++i)
	{
		if (currentIndex == i) continue;

		const float3 other = dev_points[i];
		const float dx = self.x - other.x;
		const float dy = self.y - other.y;
		const float dz = self.z - other.z;
		const float distSq = dx * dx + dy * dy + dz * dz;

		if (distSq < distSqToClosest)
		{
			distSqToClosest = distSq;
			closestIndex = i;
		}
	}

	// Single store per thread; the running best is kept in a local variable
	// so global memory is not written inside the loop.
	dev_indices[currentIndex] = closestIndex;
}

Shortened code from main:

// Entry point: fills a host array with random points, runs the GPU search
// (elided in the post) and releases the host buffers.
int main () 
{
	const int size = 1500000;
	
	srand((unsigned)time(NULL));

	// Array forms of new: the loop below writes `size` elements and the
	// buffers are released with delete[], so each must hold `size` elements
	// (a scalar `new float3` / `new int` would allocate only one).
	float3 *points = new float3[size];
	int *indices = new int[size];

	// Each coordinate is a whole number in the range [100, 1100].
	for (int i = 0; i < size; ++i)
	{
		points[i].x = (float)((rand() % 1001) + 100);
		points[i].y = (float)((rand() % 1001) + 100);
		points[i].z = (float)((rand() % 1001) + 100);
	}

	// Placeholder from the original post: the call that runs the GPU function goes here.
RUN_GPU_FUNCTION....

	delete[] points;
	delete[] indices;

	cudaDeviceReset();
}

Function that launches the kernel:

// Runs the kernel `fun` over `size` points on the GPU and copies the resulting
// index array back to the host.
//
// fun     - kernel to launch; it receives (device points, device indices, size)
// points  - host array of `size` points (input)
// indices - host array of `size` ints (output)
// size    - number of points; 0 is a no-op, negative values are rejected
//
// Returns cudaSuccess, or the status of the first CUDA call that failed (a
// message is also printed to cout). Device buffers are released on every path.
cudaError_t runGpu(void(*fun)(float3*, int*, int), float3 *points, int *indices, int size)
{
	// Nothing to do for an empty input; a negative count would turn into a
	// huge unsigned byte count below.
	if (size <= 0)
	{
		return (size == 0) ? cudaSuccess : cudaErrorInvalidValue;
	}

	// Start as NULL so the cudaFree calls at the end are harmless no-ops when
	// an early failure breaks out before the buffers were allocated.
	float3 *dev_points = NULL;
	int *dev_indices = NULL;

	const size_t pointsBytes = (size_t)size * sizeof(float3);
	const size_t indicesBytes = (size_t)size * sizeof(int);

	// Ceiling division: enough blocks to give every point its own thread.
	int nBlock = (size + threadsPerBlock - 1) / threadsPerBlock;

	cudaError_t status = cudaSuccess;

	// do { ... } while (false): lets any failing step `break` straight to the
	// shared cleanup code.
	do
	{

		status = cudaSetDevice(DEVICE_INDEX);
		if (status != cudaSuccess)
		{
			cout << "Unable to set device index. Check if your GPU is supporting CUDA." << endl;
			break;
		}
		status = cudaMalloc((void**)&dev_points, pointsBytes);
		if (status != cudaSuccess)
		{
			cout << "cudaMalloc failed for dev_points" << endl;
			break;
		}
		status = cudaMalloc((void**)&dev_indices, indicesBytes);
		if (status != cudaSuccess)
		{
			cout << "cudaMalloc failed for dev_indices" << endl;
			break;
		}
		status = cudaMemcpy(dev_points, points, pointsBytes, cudaMemcpyHostToDevice);
		if (status != cudaSuccess)
		{
			cout << "cudaMemcpy failed for dev_points" << endl;
			break;
		}

		(*fun) <<< nBlock, threadsPerBlock >>> (dev_points, dev_indices, size);

		// Launch-configuration errors are reported here ...
		status = cudaGetLastError();
		if (status != cudaSuccess)
		{
			cout << "Kernel startup failed. " << cudaGetErrorString(status) << endl;
			break;
		}
		// ... while errors raised during kernel execution surface at the
		// synchronisation point.
		status = cudaDeviceSynchronize();
		if (status != cudaSuccess)
		{
			cout << "cudaDeviceSynchronize failed with code: " << (int)status << endl;
			break;
		}
		status = cudaMemcpy(indices, dev_indices, indicesBytes, cudaMemcpyDeviceToHost);
		if (status != cudaSuccess)
		{
			cout << "cudaMemcpy failed for dev_indices" << endl;
			break;
		}

	} while (false);

	cudaFree(dev_points);
	cudaFree(dev_indices);

	return status;
}

Where the problem is:

When “size” is more than 100 000, I have a problem with the write “dev_indices[currentIndex] = i;”
If I comment out this line, the problem disappears.
When “size” is more than 1 000 000, I have a problem with the write “distToClosest = distance;”
If I comment out this line, the problem disappears.

Result of problem:
cudaDeviceSynchronize failed with code: 4
cudaMalloc failed for dev_points

Info:
MSI GE62 6QC, with GTX960M (2gb)
Win 10, prof 64bit
MS Visual Studio 2015 prof.
In Visual Studio I changed the target compute capability from CC 2.0 to CC 5.0

I tried building and running this program as both x86 and x64; that did not resolve the problem.
I tried to find a similar problem on Google, but no one seems to have had this problem ;-(

What is wrong?

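You may be hitting a WDDM TDR timeout: on Windows, a kernel that runs longer than the driver's watchdog limit (about 2 seconds by default) on a GPU using the WDDM driver model is killed and the device is reset.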

Hi, you are right - on Saturday I changed this configuration (disabled TDR) and it resolved the problem ;-)