"the launch timed out and was terminated" error I'm getting this error but not every t

Hi All!

My problem is that I'm getting the "the launch timed out and was terminated" error during kernel execution, but only when my array is large enough. The strange thing is that I only get the error about 1 time out of every 2-4 runs.

Before, it was a constant error, but I increased "numThreadsPerBlock" from 4 to 32 and now it behaves as I described above: the error appears about 1 time out of every 2-4 executions. I have no idea what the matter is. Can you tell me what's wrong with my code?

a: my array to calculate from,

eredmenytomb: my array that holds the calculated numbers

stuktura_tipus: my structure that looks like this:

in my header file:

struct stuktura_tipus
{
    char *cuda_err;
};

extern "C" {
    __declspec(dllexport) stuktura_tipus* DLLalgoritm(int*, int*, int, int, int, int);
}
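For reference, the exported function gets called from the host application roughly like this (a minimal sketch with placeholder sizes and filter/offset values, not my actual caller):

[codebox]
// Hypothetical caller sketch; dimX/dimY/filter/offset values are placeholders.
int dimX = 512, dimY = 512;
int dimA = dimX * dimY + 1;
int *a            = (int *) malloc(dimA * sizeof(int));
int *eredmenytomb = (int *) malloc(dimA * sizeof(int));
/* ... fill 'a' with the input data ... */

stuktura_tipus *res = DLLalgoritm(a, eredmenytomb, dimX, dimY, 0, 0);
if (strcmp(res->cuda_err, "noerr") != 0)
    printf("CUDA error: %s\n", res->cuda_err);

free(res);
free(a);
free(eredmenytomb);
[/codebox]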

the code:

[codebox]

char *checkCUDAError(const char *msg);   // forward declaration

stuktura_tipus* DLLalgoritm(int *a, int *eredmenytomb, int dimX, int dimY, int filter, int offset)
{
    char *error;
    stuktura_tipus *my_stuktura;
    my_stuktura = (stuktura_tipus*) malloc(sizeof(stuktura_tipus));

    int dimA = (dimX*dimY) + 1;

    // select device 0
    cudaSetDevice(0);
    error = checkCUDAError(" CUDA setDevice ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        cudaThreadExit();
        return my_stuktura;
    }

    // define grid and block size
    int numThreadsPerBlock = 32;
    int n_blocks = dimA/numThreadsPerBlock + (dimA % numThreadsPerBlock == 0 ? 0 : 1);  // round up

    // pointers for device memory
    int *d_a, *d_b;

    // allocate device memory
    size_t memSize = dimA * sizeof(int);
    cudaMalloc( (void **) &d_a, memSize );
    cudaMalloc( (void **) &d_b, memSize );

    // copy host array to device array
    cudaMemcpy( d_a, a, memSize, cudaMemcpyHostToDevice );
    error = checkCUDAError(" CUDA memcpy_todevice ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // launch kernel
    MarakovCUDAKernel<<< n_blocks, numThreadsPerBlock >>>( d_b, d_a, dimA, dimX, dimY, offset );

    // block until the device has completed
    cudaThreadSynchronize();

    // check if kernel execution generated an error
    error = checkCUDAError(" CUDA kernel invocation ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // device to host copy
    cudaMemcpy( eredmenytomb, d_b, memSize, cudaMemcpyDeviceToHost );

    // check for any CUDA errors
    error = checkCUDAError(" CUDA memcpy_backtoHost: ");
    if (strcmp(error, "noerr") != 0){
        my_stuktura->cuda_err = error;
        // free device memory on error
        cudaFree(d_a);
        cudaFree(d_b);
        cudaThreadExit();
        return my_stuktura;
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    my_stuktura->cuda_err = error;   // "noerr" at this point
    cudaThreadExit();
    return my_stuktura;
}

char *checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        char *error_message;
        int mem = strlen(msg) + strlen(cudaGetErrorString(err)) + 1;
        size_t memSizechar = mem * sizeof(char);
        error_message = (char *) malloc(memSizechar);
        strcpy(error_message, "");
        strcat(error_message, msg);
        strcat(error_message, cudaGetErrorString(err));

        return error_message;
        //exit(EXIT_FAILURE);
    }
    else return "noerr";
}

[/codebox]
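Since the error is a timeout, it probably also makes sense to measure how long the kernel actually runs. A minimal sketch with CUDA events, assuming the same d_a/d_b and launch configuration as in the code above:

[codebox]
// Time the kernel launch with CUDA events (milliseconds).
cudaEvent_t evStart, evStop;
float elapsed_ms = 0.0f;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
MarakovCUDAKernel<<< n_blocks, numThreadsPerBlock >>>( d_b, d_a, dimA, dimX, dimY, offset );
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);                  // blocks until the kernel has finished
cudaEventElapsedTime(&elapsed_ms, evStart, evStop);
printf("kernel time: %.1f ms\n", elapsed_ms);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);
[/codebox]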

Hi guys! I found the solution. It seems that the problem is related to the operating system and display refreshing.
When you run a long kernel (in my case a big array to calculate), after some time (between 22-45 sec) Windows XP stops getting refresh frames from the graphics card, because it is too busy to respond, so the OS thinks the application has died and terminates the running kernel.

:pirate:

THE SOLUTION IS: you have to cut your kernel work apart and run it in small pieces, so the display has a few milliseconds to refresh between two kernel launches (basically, work with for loops); see the sketch below.
I will test it under Win7 today and get back to you with some results.
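Something like this is what I mean (a rough sketch, not my exact code): launch the kernel several times over sub-ranges and synchronize between launches so the watchdog gets serviced. MarakovCUDAKernelChunk and its start/count parameters are placeholders for the real kernel reworked to process only part of the array; everything else is as in the original DLLalgoritm. You can also query whether the driver enforces a run-time limit at all via cudaGetDeviceProperties.

[codebox]
// Check whether the watchdog (kernel run-time limit) is active on device 0.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// prop.kernelExecTimeoutEnabled == 1 means the driver enforces a run-time limit

// Split the work into chunks and launch the kernel once per chunk,
// synchronizing in between so the display driver can refresh.
int chunkSize = 64 * 1024;                     // elements per launch, tune as needed
for (int start = 0; start < dimA; start += chunkSize)
{
    int count = (dimA - start < chunkSize) ? (dimA - start) : chunkSize;
    int chunkBlocks = count/numThreadsPerBlock + (count % numThreadsPerBlock == 0 ? 0 : 1);

    MarakovCUDAKernelChunk<<< chunkBlocks, numThreadsPerBlock >>>( d_b, d_a, start, count, dimX, dimY, offset );

    cudaThreadSynchronize();                   // short pause: watchdog/display gets serviced
    error = checkCUDAError(" CUDA kernel invocation ");
    if (strcmp(error, "noerr") != 0) break;
}
[/codebox]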
