cudaDeviceSynchronize() returns error code 6

When I launch my program, it crashes: the screen flickers and the graphics card driver restarts. This only happens when the calculation takes a long time. cudaDeviceSynchronize() returns error 6.

I found this in the documentation: cudaDeviceSynchronize() returns an error if one of the preceding tasks has failed.

What does that mean in my case?

Here is the code:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "device_functions.h"

#include "auxilary.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <ctime>

#include <math.h>

#define M_E        2.71828182845904523536

__host__ cudaError_t PrimesWithCuda(int first, int last,int nofb,int noft,int* res,int size);

__host__ double logN(double n);

__host__ int numberOfPrimes(int first,int last);

__host__ int* allocateOutput(int first,int last, int* size);

__host__ void Primes(int first, int last,int* result);

__global__ void PrimesKernel(int first, int last,int* result);

double logN(double n)

{

	return log(n)/log(M_E);

}

int numberOfPrimes(int first,int last)

{

	if ( first < 0 && last < 0 )

		return -1;

	if ( last < 10000 || last-first < 10000 ) 

			return 1500;

	double nop = (0.997344779 * last/(logN(last) - 1.110432659) - 2.0)*1.01 - 0.997344779 * first/(logN(first) - 1.110432659) - 2.0;

	return nop;

}

int* allocateOutput(int first,int last,int* size)

{

	int* output;

	int nop = numberOfPrimes(first,last);

	*size = nop;

	if ( nop <= 0 )

	{

		fprintf(stderr, "Wrong parameters: first & last\n");

        return NULL;

	}

	output = (int*)malloc(sizeof(int)*nop);

	if ( output == NULL )

	{

		fprintf(stderr, "output: malloc failed\n");

        return NULL;

	}

	memset(output,0,nop*sizeof(int));

	output[0]=1;

	return output;

}

void Primes(int first, int last,int* result)

{

	bool prime = false;

	int ij = 1;

	if (first % 2 == 0)

		first++;

	for ( int i=first ; i <= last ; i+=2 )

	{

		prime = true;

		for ( int j = 3; j*j <= i && prime ; j+=2 )

		{

			if ( i % j == 0 )

				prime = false;

		}

		if ( prime)

			result[ij++] = i;

	}

	result[0] = ij-1;

}

__global__ void PrimesKernel(int first, int last,int* result)

{

	int id = blockDim.x*blockIdx.x + threadIdx.x + first;

	int onoft = gridDim.x*blockDim.x;

	if ( id % 2 == 0 ) id += (onoft % 2 == 0 ? onoft-1 : onoft);

	bool prime = false;

	int i,ij=0;

	for ( i=id  ; i <=last  ; i+=2*onoft )

	{

		prime = true;

		for ( int j = 3; j*j <= i && prime ; j+=2 )

		{

			if ( i % j == 0 )

				prime = false;

		}

		if ( prime)

		{

		//	ij++;

			while (atomicCAS(result+result[0]++,0,i));

		}

	}

	//atomicAdd(result,ij);

}

__host__ void DisplayPrimes(int* primes)

{

	int i =0;

	printf("Liczba liczb pierwszych: %d ",primes[i++]);

	while (  primes[i] || i < 100)

	{

		printf("%d ",primes[i++]);

	}

	printf("\n");

}

int main(int argc, char** argv)

{

	double elapsed;

	clock_t start, end;

	cudaError_t cudaStatus;

	int* res;

	int size;

	if ( argc != 6 )

		return -1;

	start = clock();

	int first= atoi(argv[1]);

	int last = atoi(argv[2]);

	int nofb = atoi(argv[3]);

	int noft = atoi(argv[4]);

	int parallel = atoi(argv[5]);

	printf("nop: %d\n",numberOfPrimes(first,last));

	res = allocateOutput(first,last,&size);

	if ( parallel )

	{

		cudaStatus = PrimesWithCuda(first,last,nofb,noft,res,size);

		if (cudaStatus != cudaSuccess) {

			fprintf(stderr, "LevenstheinWithCuda failed!\n");

			return -1;

		}

		// cudaDeviceReset must be called before exiting in order for profiling and

		// tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.

	    cudaStatus = cudaDeviceReset();

		if (cudaStatus != cudaSuccess) {

			fprintf(stderr, "cudaDeviceReset failed!\n");

			return -1;

		}

	}

	else

	{

		Primes(first,last,res);

	}

	end = clock();

	elapsed = ((double)(end-start))/CLOCKS_PER_SEC;

	printf("Commercial time: %lf s\n",elapsed);

	DisplayPrimes(res);

	printf("Liczba liczb pierwszych: %d\n",res[0]);

	printf("Hit any key to terminate\n");

    getchar();

	

	free (res);

    return 0;

}

// Helper function for using CUDA to add vectors in parallel.

cudaError_t PrimesWithCuda(int first,int last,int nofb,int noft,int* output,int size)

{

    // Choose which GPU to run on, change this on a multi-GPU system.

    cudaError_t cudaStatus = cudaSetDevice(0);

    if (cudaStatus != cudaSuccess) {

        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");

        goto Error;

    }

	/*int * dev_first;

	int * dev_last;*/

	int * dev_output;

	/*cudaStatus = cudaMalloc((void**)&dev_first,sizeof(int));

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_first: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };

	cudaStatus = cudaMalloc((void**)&dev_last,sizeof(int));

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_last: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };*/

	cudaStatus = cudaMalloc((void**)&dev_output,sizeof(int)*size);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: cudaMalloc failed\n", cudaStatus);

		goto Error;

    };

	

	cudaStatus = cudaMemcpy(dev_output,output,sizeof(int)*size,cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: HtD cudaMemcpy failed , code: %d\n", cudaStatus);

		goto Error;

    };

	/*cudaStatus = cudaMemcpy(dev_first,&first,sizeof(int),cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_first: cudaMemcpy failed\n", cudaStatus);

		goto Error;

    };

	cudaStatus = cudaMemcpy(dev_last,&last,sizeof(int),cudaMemcpyHostToDevice);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_last: cudaMemcpy failed\n", cudaStatus);

		goto Error;

    };*/

	if (first % 2 == 0)

		first++;

    // Launch a kernel on the GPU with one thread for each element.

    PrimesKernel<<<nofb, noft>>>(first,last,dev_output);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns

    // any errors encountered during the launch.

    cudaStatus = cudaDeviceSynchronize();

    if (cudaStatus != cudaSuccess) {

        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PrimesKernel!\n", cudaStatus);

        goto Error;

    }

	cudaStatus = cudaMemcpy(output,dev_output,sizeof(int)*size,cudaMemcpyDeviceToHost);

	if (cudaStatus != cudaSuccess) {

		fprintf(stderr, "dev_output: DtH cudaMemcpy failed: code %d\n", cudaStatus);

		goto Error;

    };

	cudaFree(dev_output);

Error:

	return cudaStatus;

}

I have also found the description of the error, but I don't fully understand it.

/**
 * This indicates that the device kernel took too long to execute. This can
 * only occur if timeouts are enabled - see the device property
 * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
 * for more information. The device cannot be used until ::cudaThreadExit()
 * is called. All existing device memory allocations are invalid and must be
 * reconstructed if the program is to continue using CUDA.
 */
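
If it helps, here is a minimal sketch (assuming a single GPU, device 0, and the plain CUDA runtime API; not tied to my project) of how I could check whether that kernelExecTimeoutEnabled property is set on my card:

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	// Query the properties of device 0, the same device the program
	// selects with cudaSetDevice(0).
	cudaError_t cudaStatus = cudaGetDeviceProperties(&prop, 0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaGetDeviceProperties failed: %d\n", (int)cudaStatus);
		return -1;
	}
	// Non-zero means the OS watchdog will kill kernels that run too long.
	printf("%s: kernelExecTimeoutEnabled = %d\n",
	       prop.name, prop.kernelExecTimeoutEnabled);
	return 0;
}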


Hi,

The watchdog timer kicks in. You only have a certain amount of time to execute a kernel; if you exceed that timeout, you get this error.

You could split the problem into several kernel calls, if possible. If you are using a Tesla card, you can use the TCC driver, which disables the timeout.
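
Something like this, for example. It is only a rough sketch (the helper name and the chunk size are made up and would need tuning for your card); it reuses your PrimesKernel and dev_output and launches one kernel per sub-range, so that no single launch runs long enough to hit the watchdog:

// Rough sketch: split [first, last] into chunks and launch the kernel
// once per chunk. Pick chunkSize so that one launch finishes well under
// the watchdog limit (a couple of seconds).
cudaError_t LaunchPrimesInChunks(int first, int last, int nofb, int noft,
                                 int* dev_output)
{
	const int chunkSize = 1000000;          // assumption: tune per card
	cudaError_t cudaStatus = cudaSuccess;

	for (int chunkFirst = first; chunkFirst <= last; chunkFirst += chunkSize)
	{
		int chunkLast = chunkFirst + chunkSize - 1;
		if (chunkLast > last)
			chunkLast = last;

		int start = chunkFirst;
		if (start % 2 == 0)                 // same convention as your code:
			start++;                        // the kernel gets an odd 'first'

		PrimesKernel<<<nofb, noft>>>(start, chunkLast, dev_output);

		// Wait for this chunk before launching the next one; the watchdog
		// only measures a single kernel, not the total run time.
		cudaStatus = cudaDeviceSynchronize();
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "chunk [%d, %d] failed, code: %d\n",
			        start, chunkLast, (int)cudaStatus);
			return cudaStatus;
		}
	}
	return cudaStatus;
}

You would call it from PrimesWithCuda in place of the single PrimesKernel<<<nofb, noft>>> launch, after the host-to-device cudaMemcpy into dev_output.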


I use a GeForce GT 9600M, the mobile version, in my laptop. I need to make this project for classes. So the only way to solve it is to call the kernel function inside a loop?


Maybe you can disable the watchdog.

Google it and such… there could be solutions, but doing so might be risky: if a game or anything else hangs the display driver, you are probably stuck and will need a reboot.

Perhaps terminating such applications blindly might work… or perhaps not, because the driver hangs. I don't know much about such problems. :)

The way I disabled the watchdog was via NVIDIA Parallel NSight (and developer drivers).

Then open NVIDIA Parallel Nsight via the tray icon, go to its options, and set the following:

WDDM TDR Enabled = false

(WDDM = Windows Display Driver Model)
(TDR = Timeout Detection and Recovery)

More advanced control information at:

This is all for Windows Vista or Windows 7, I think… what OS do you have on your laptop?
