cudaDeviceSynchronize() returns error code 6

When I launch my program, it crashes: the screen flickers and the graphics card driver restarts. This only happens when the calculation takes a long time. cudaDeviceSynchronize() returns error 6.

I found this in the documentation: cudaDeviceSynchronize() returns an error if one of the preceding tasks has failed.

What does that mean in my case?

Here is the code:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "device_functions.h"

#include "auxilary.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <ctime>

#include <math.h>

#define M_E        2.71828182845904523536

__host__ cudaError_t PrimesWithCuda(int first, int last,int nofb,int noft,int* res,int size);

__host__ double logN(double n);

__host__ int numberOfPrimes(int first,int last);

__host__ int* allocateOutput(int first,int last, int* size);

__host__ void Primes(int first, int last,int* result);

__global__ void PrimesKernel(int first, int last,int* result);

double logN(double n)

{

	return log(n)/log(M_E);

}

int numberOfPrimes(int first,int last)

{

	if ( first < 0 && last < 0 )

		return -1;

	if ( last < 10000 || last-first < 10000 ) 

			return 1500;

	double nop = (0.997344779 * last/(logN(last) - 1.110432659) - 2.0)*1.01 - 0.997344779 * first/(logN(first) - 1.110432659) - 2.0;

	return nop;

}

int* allocateOutput(int first,int last,int* size)

{

	int* output;

	int nop = numberOfPrimes(first,last);

	*size = nop;

	if ( nop <= 0 )

	{

		fprintf(stderr, "Wrong parameters: first & last\n");

        return NULL;

	}

	output = (int*)malloc(sizeof(int)*nop);

	if ( output == NULL )

	{

		fprintf(stderr, "output: malloc failed\n");

        return NULL;

	}

	memset(output,0,nop*sizeof(int));

	output[0]=1;

	return output;

}

void Primes(int first, int last,int* result)

{

	bool prime = false;

	int ij = 1;

	if (first % 2 == 0)

		first++;

	for ( int i=first ; i <= last ; i+=2 )

	{

		prime = true;

		for ( int j = 3; j*j <= i && prime ; j+=2 )

		{

			if ( i % j == 0 )

				prime = false;

		}

		if ( prime)

			result[ij++] = i;

	}

	result[0] = ij-1;

}

__global__ void PrimesKernel(int first, int last,int* result)

{

	int id = blockDim.x*blockIdx.x + threadIdx.x + first;

	int onoft = gridDim.x*blockDim.x;

	if ( id % 2 == 0 ) id += (onoft % 2 == 0 ? onoft-1 : onoft);

	bool prime = false;

	int i,ij=0;

	for ( i=id  ; i <=last  ; i+=2*onoft )

	{

		prime = true;

		for ( int j = 3; j*j <= i && prime ; j+=2 )

		{

			if ( i % j == 0 )

				prime = false;

		}

		if ( prime)

		{

		//	ij++;

			while (atomicCAS(result+result[0]++,0,i));

		}

	}

	//atomicAdd(result,ij);

}

__host__ void DisplayPrimes(int* primes)

{

	int i =0;

	printf("Liczba liczb pierwszych: %d ",primes[i++]);

	while (  primes[i] || i < 100)

	{

		printf("%d ",primes[i++]);

	}

	printf("\n");

}

int main(int argc, char** argv)

{

	double elapsed;

	clock_t start, end;

	cudaError_t cudaStatus;

	int* res;

	int size;

	if ( argc != 6 )

		return -1;

	start = clock();

	int first= atoi(argv[1]);

	int last = atoi(argv[2]);

	int nofb = atoi(argv[3]);

	int noft = atoi(argv[4]);

	int parallel = atoi(argv[5]);

	printf("nop: %d\n",numberOfPrimes(first,last));

	res = allocateOutput(first,last,&size);

	if ( parallel )

	{

		cudaStatus = PrimesWithCuda(first,last,nofb,noft,res,size);

		if (cudaStatus != cudaSuccess) {

			fprintf(stderr, "LevenstheinWithCuda failed!\n");

			return -1;

		}

		// cudaDeviceReset must be called before exiting in order for profiling and

		// tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.

	    cudaStatus = cudaDeviceReset();

		if (cudaStatus != cudaSuccess) {

			fprintf(stderr, "cudaDeviceReset failed!\n");

			return -1;

		}

	}

	else

	{

		Primes(first,last,res);

	}

	end = clock();

	elapsed = ((double)(end-start))/CLOCKS_PER_SEC;

	printf("Commercial time: %lf s\n",elapsed);

	DisplayPrimes(res);

	printf("Liczba liczb pierwszych: %d\n",res[0]);

	printf("Hit any key to terminate\n");

    getchar();

	

	free (res);

    return 0;

}

// Helper function for using CUDA to add vectors in parallel.

cudaError_t PrimesWithCuda(int first,int last,int nofb,int noft,int* output,int size)

{

    // Choose which GPU to run on, change this on a multi-GPU system.

    cudaError_t cudaStatus = cudaSetDevice(0);

    if (cudaStatus != cudaSuccess) {

        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");

        goto Error;

    }

	/*int * dev_first;

	int * dev_last;*/

	int * dev_output;

	/*cudaStatus = cudaMalloc((void**)&dev_first,sizeof(int));

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_first: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };

	cudaStatus = cudaMalloc((void**)&dev_last,sizeof(int));

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_last: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };*/

	cudaStatus = cudaMalloc((void**)&dev_output,sizeof(int)*size);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };

	

	cudaStatus = cudaMemcpy(dev_output,output,sizeof(int)*size,cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: HtD cudaMemcpy failed , code: %d\n", cudaStatus);

		goto Error;

    };

	/*cudaStatus = cudaMemcpy(dev_first,&first,sizeof(int),cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_first: cudaMemcpy failed\n", cudaStatus);

		goto Error;

    };

	cudaStatus = cudaMemcpy(dev_last,&last,sizeof(int),cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_last: cudaMemcpy failed\n", cudaStatus);

		goto Error;

    };*/

	if (first % 2 == 0)

		first++;

    // Launch a kernel on the GPU with one thread for each element.

    PrimesKernel<<<nofb, noft>>>(first,last,dev_output);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns

    // any errors encountered during the launch.

    cudaStatus = cudaDeviceSynchronize();

    if (cudaStatus != cudaSuccess) {

        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PrimesKernel!\n", cudaStatus);

        goto Error;

    }

	cudaStatus = cudaMemcpy(output,dev_output,sizeof(int)*size,cudaMemcpyDeviceToHost);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: DtH cudaMemcpy failed: code %d\n", cudaStatus);

		goto Error;

    };

	cudaFree(dev_output);

Error:

	return cudaStatus;

}

I have also found the description of the error, but I don't fully understand it.

/**
 * This indicates that the device kernel took too long to execute. This can
 * only occur if timeouts are enabled - see the device property
 * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
 * for more information. The device cannot be used until ::cudaThreadExit()
 * is called. All existing device memory allocations are invalid and must be
 * reconstructed if the program is to continue using CUDA.
 */
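
If it helps, here is a minimal sketch (assuming a single GPU, device 0, and the plain CUDA runtime API; not tied to my project) of how I could check whether that kernelExecTimeoutEnabled property is set on my card:

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	// Query the properties of device 0, the same device the program
	// selects with cudaSetDevice(0).
	cudaError_t cudaStatus = cudaGetDeviceProperties(&prop, 0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaGetDeviceProperties failed: %d\n", (int)cudaStatus);
		return -1;
	}
	// Non-zero means the OS watchdog will kill kernels that run too long.
	printf("%s: kernelExecTimeoutEnabled = %d\n",
	       prop.name, prop.kernelExecTimeoutEnabled);
	return 0;
}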


Hi,

The watchdog timer kicks in. You only have a certain amount of time to execute a kernel; if you exceed that timeout, you get this error.

You could split the problem into several kernel calls, if possible. If you are using a Tesla card, you can use the TCC driver, which disables the timeout.
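
Something like this, for example. It is only a rough sketch (the helper name and the chunk size are made up and would need tuning for your card); it reuses your PrimesKernel and dev_output and launches one kernel per sub-range, so that no single launch runs long enough to hit the watchdog:

// Rough sketch: split [first, last] into chunks and launch the kernel
// once per chunk. Pick chunkSize so that one launch finishes well under
// the watchdog limit (a couple of seconds).
cudaError_t LaunchPrimesInChunks(int first, int last, int nofb, int noft,
                                 int* dev_output)
{
	const int chunkSize = 1000000;          // assumption: tune per card
	cudaError_t cudaStatus = cudaSuccess;

	for (int chunkFirst = first; chunkFirst <= last; chunkFirst += chunkSize)
	{
		int chunkLast = chunkFirst + chunkSize - 1;
		if (chunkLast > last)
			chunkLast = last;

		int start = chunkFirst;
		if (start % 2 == 0)                 // same convention as your code:
			start++;                        // the kernel gets an odd 'first'

		PrimesKernel<<<nofb, noft>>>(start, chunkLast, dev_output);

		// Wait for this chunk before launching the next one; the watchdog
		// only measures a single kernel, not the total run time.
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "chunk [%d, %d] failed, code: %d\n",
			        start, chunkLast, (int)cudaStatus);
			return cudaStatus;
		}
	}
	return cudaStatus;
}

You would call it from PrimesWithCuda in place of the single PrimesKernel<<<nofb, noft>>> launch, after the host-to-device cudaMemcpy into dev_output.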


I use a GeForce GT 9600M, the mobile version, in my laptop. I need to make this project for classes. So the only way to solve it is to call the kernel function inside a loop?


Maybe you can disable the watchdog.

Google it and such… there could be solutions, but doing so might be risky: if a game or anything else hangs the display driver, you are probably stuck and will need a reboot.

Perhaps terminating such applications blindly might work… or perhaps not, because the driver hangs. I don't know much about such problems. :)

The way I disabled the watchdog was via NVIDIA Parallel NSight (and developer drivers).

Then open NVIDIA Parallel Nsight via the tray icon, go to its options, and set the following:

WDDM TDR Enabled = false

(WDDM = Windows Display Driver Model)
(TDR = Timeout Detection and Recovery)

More advanced control information at:

This is all for Windows Vista or Windows 7, I think… what OS do you have on your laptop?
