"the launch timed out and was terminated" error I'm getting this error but not every t

Hi All!

My problem is that I'm getting the "the launch timed out and was terminated" error during kernel execution, but only when my array is large enough. The strange thing is that I only get the error about 1 time out of every 2-4 runs.

Before, it was a constant error, but I increased "numThreadsPerBlock" from 4 to 32 and now it behaves as I described above: the error appears about 1 time out of every 2-4 executions. I have no idea what the matter is. Can you tell me what's wrong with my code?

a: my array to calculate from,

eredmenytomb: my array that holds the calculated numbers

stuktura_tipus: my structure that looks like this:

in my header file:

struct stuktura_tipus
{
    char *cuda_err;
};

extern "C" {
    __declspec(dllexport) stuktura_tipus* DLLalgoritm(int*, int*, int, int, int, int);
}
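For reference, the exported function gets called from the host application roughly like this (a minimal sketch with placeholder sizes and filter/offset values, not my actual caller):

[codebox]
// Hypothetical caller sketch; dimX/dimY/filter/offset values are placeholders.
int dimX = 512, dimY = 512;
int dimA = dimX * dimY + 1;
int *a            = (int *) malloc(dimA * sizeof(int));
int *eredmenytomb = (int *) malloc(dimA * sizeof(int));
/* ... fill 'a' with the input data ... */

stuktura_tipus *res = DLLalgoritm(a, eredmenytomb, dimX, dimY, 0, 0);
if (strcmp(res->cuda_err, "noerr") != 0)
    printf("CUDA error: %s\n", res->cuda_err);

free(res);
free(a);
free(eredmenytomb);
[/codebox]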

the code:

[codebox]

char *checkCUDAError(const char *msg);   // forward declaration

stuktura_tipus* DLLalgoritm(int *a, int *eredmenytomb, int dimX, int dimY, int filter, int offset)
{
    char *error;
    stuktura_tipus *my_stuktura;
    my_stuktura = (stuktura_tipus*) malloc(sizeof(stuktura_tipus));

    int dimA = (dimX*dimY) + 1;

    // select device 0
    cudaSetDevice(0);
    error = checkCUDAError(" CUDA setDevice ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        cudaThreadExit();
        return my_stuktura;
    }

    // define grid and block size
    int numThreadsPerBlock = 32;
    int n_blocks = dimA/numThreadsPerBlock + (dimA % numThreadsPerBlock == 0 ? 0 : 1);  // round up

    // pointers for device memory
    int *d_a, *d_b;

    // allocate device memory
    size_t memSize = dimA * sizeof(int);
    cudaMalloc( (void **) &d_a, memSize );
    cudaMalloc( (void **) &d_b, memSize );

    // copy host array to device array
    cudaMemcpy( d_a, a, memSize, cudaMemcpyHostToDevice );
    error = checkCUDAError(" CUDA memcpy_todevice ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // launch kernel
    MarakovCUDAKernel<<< n_blocks, numThreadsPerBlock >>>( d_b, d_a, dimA, dimX, dimY, offset );

    // block until the device has completed
    cudaThreadSynchronize();

    // check if kernel execution generated an error
    error = checkCUDAError(" CUDA kernel invocation ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // device to host copy
    cudaMemcpy( eredmenytomb, d_b, memSize, cudaMemcpyDeviceToHost );

    // check for any CUDA errors
    error = checkCUDAError(" CUDA memcpy_backtoHost: ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    my_stuktura->cuda_err = error;   // "noerr" at this point
    cudaThreadExit();
    return my_stuktura;
}

char *checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        char *error_message;
        int mem = strlen(msg) + strlen(cudaGetErrorString(err)) + 1;
        size_t memSizechar = mem * sizeof(char);
        error_message = (char *) malloc(memSizechar);
        strcpy(error_message, "");
        strcat(error_message, msg);
        strcat(error_message, cudaGetErrorString(err));

        return error_message;
        //exit(EXIT_FAILURE);
    }
    else return "noerr";
}

[/codebox]
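Since the error is a timeout, it probably also makes sense to measure how long the kernel actually runs. A minimal sketch with CUDA events, assuming the same d_a/d_b and launch configuration as in the code above:

[codebox]
// Time the kernel launch with CUDA events (milliseconds).
cudaEvent_t evStart, evStop;
float elapsed_ms = 0.0f;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
MarakovCUDAKernel<<< n_blocks, numThreadsPerBlock >>>( d_b, d_a, dimA, dimX, dimY, offset );
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);                  // blocks until the kernel has finished
cudaEventElapsedTime(&elapsed_ms, evStart, evStop);
printf("kernel time: %.1f ms\n", elapsed_ms);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);
[/codebox]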

Hi guys! I found the solution. It seems that the problem is related to the operating system and display refreshing.
When you run a long kernel (in my case a big array to calculate), after some time (between 22-45 sec) Windows XP stops getting refresh frames from the graphics card, because it is too busy to respond, so the OS thinks the application has died and terminates the running kernel.

:pirate:

THE SOLUTION IS: you have to cut your kernel work apart and run it in small pieces, so the display has a few milliseconds to refresh between two kernel launches (basically, work with for loops); see the sketch below.
I will test it under Win7 today and get back to you with some results.
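Something like this is what I mean (a rough sketch, not my exact code): launch the kernel several times over sub-ranges and synchronize between launches so the watchdog gets serviced. MarakovCUDAKernelChunk and its start/count parameters are placeholders for the real kernel reworked to process only part of the array; everything else is as in the original DLLalgoritm. You can also query whether the driver enforces a run-time limit at all via cudaGetDeviceProperties.

[codebox]
// Check whether the watchdog (kernel run-time limit) is active on device 0.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// prop.kernelExecTimeoutEnabled == 1 means the driver enforces a run-time limit

// Split the work into chunks and launch the kernel once per chunk,
// synchronizing in between so the display driver can refresh.
int chunkSize = 64 * 1024;                     // elements per launch, tune as needed
for (int start = 0; start < dimA; start += chunkSize)
{
    int count = (dimA - start < chunkSize) ? (dimA - start) : chunkSize;
    int chunkBlocks = count/numThreadsPerBlock + (count % numThreadsPerBlock == 0 ? 0 : 1);

    MarakovCUDAKernelChunk<<< chunkBlocks, numThreadsPerBlock >>>( d_b, d_a, start, count, dimX, dimY, offset );

    cudaThreadSynchronize();                   // short pause: watchdog/display gets serviced
    error = checkCUDAError(" CUDA kernel invocation ");
    if (strcmp(error, "noerr") != 0) break;
}
[/codebox]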
