Hi All!
My problem is that I'm getting a "the launch timed out and was terminated" error during kernel execution, but only when my array is large enough. The strange thing is that the error is intermittent: it occurs in only about one out of every 2-4 runs.
Previously the error occurred on every run, but after I increased "numThreadsPerBlock" from 4 to 32 it now behaves as described above: it fails about one run in every 2-4. I have no idea what the problem is. Can you tell me what's wrong with my code?
a: my array to calculate from,
eredmenytom: my array that holds the calculated numbers
stuktura_tipus: my structure that looks like this:
in my header file:
// Result record handed back by DLLalgoritm.
struct stuktura_tipus
{
    // Status text produced by checkCUDAError: either the literal "noerr"
    // or a message made of a step label followed by the CUDA error string.
    char *cuda_err;
};
// Exported C-linkage entry point of the DLL.
// The prototype must match the definition exactly: it takes two int pointers
// (input array, output array) followed by four ints, and returns a pointer
// to a heap-allocated stuktura_tipus.
extern "C" {
    __declspec(dllexport) stuktura_tipus* DLLalgoritm(int*, int*, int, int, int, int);
}
the code:
[codebox]
// Defined further down in this file; redeclared here because it is used
// before its definition.
char *checkCUDAError(const char *msg);

// True when a status string returned by checkCUDAError reports a failure.
// The contents are compared, not the pointers: comparing against the
// literal "noerr" with != only works if the compiler merges identical
// string literals.
static bool cudaStepFailed(const char *status)
{
    return strcmp(status, "noerr") != 0;
}

// Copies `a` to the device, runs MarakovCUDAKernel over it, and copies the
// result back into `eredmenytomb`.
//
// Parameters:
//   a            - host input array; dimX*dimY + 1 ints are read from it
//   eredmenytomb - host output array; dimX*dimY + 1 ints are written to it
//                  NOTE(review): the "+ 1" comes from the original code;
//                  confirm both host buffers really hold that many elements.
//   dimX, dimY   - dimensions forwarded to the kernel
//   filter       - currently unused
//   offset       - forwarded to the kernel
//
// Returns a malloc'ed stuktura_tipus whose cuda_err is "noerr" on success or
// a message naming the failing step otherwise. Returns NULL if the result
// structure itself cannot be allocated.
// NOTE(review): failure messages are heap-allocated by checkCUDAError while
// "noerr" is a literal, so the caller must not free cuda_err blindly.
//
// NOTE(review): "the launch timed out and was terminated" is usually raised
// by the display driver's watchdog when a single kernel runs too long on a
// GPU that also drives a display; that would have to be addressed in the
// kernel or by splitting the work into several launches - TODO confirm.
stuktura_tipus* DLLalgoritm(int *a, int *eredmenytomb, int dimX, int dimY, int filter, int offset)
{
    (void) filter; // accepted for interface compatibility, not used here

    stuktura_tipus *my_stuktura = (stuktura_tipus*) malloc(sizeof(stuktura_tipus));
    if (my_stuktura == NULL) {
        return NULL;
    }

    const int dimA = (dimX * dimY) + 1;
    const size_t memSize = (size_t) dimA * sizeof(int);

    // Launch configuration: one thread per element, rounded up to whole blocks.
    const int numThreadsPerBlock = 32;
    const int n_blocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;

    // Device buffers start as NULL so the cleanup below is safe on every path.
    int *d_a = NULL;
    int *d_b = NULL;

    // Each step runs only if every previous step succeeded; the first
    // failure message is kept in `error` and reported to the caller.
    cudaSetDevice(0);
    char *error = checkCUDAError(" CUDA setDevice ");

    if (!cudaStepFailed(error)) {
        cudaMalloc((void **) &d_a, memSize);
        cudaMalloc((void **) &d_b, memSize);
        error = checkCUDAError(" CUDA malloc ");
    }

    if (!cudaStepFailed(error)) {
        // Copy host array to device array.
        cudaMemcpy(d_a, a, memSize, cudaMemcpyHostToDevice);
        error = checkCUDAError(" CUDA memcpy_todevice ");
    }

    if (!cudaStepFailed(error)) {
        MarakovCUDAKernel<<< n_blocks, numThreadsPerBlock >>>(d_b, d_a, dimA, dimX, dimY, offset);
        // Block until the device has completed so that execution errors
        // are visible to the check below.
        cudaThreadSynchronize();
        error = checkCUDAError(" CUDA kernel invocation ");
    }

    if (!cudaStepFailed(error)) {
        // Device to host copy.
        cudaMemcpy(eredmenytomb, d_b, memSize, cudaMemcpyDeviceToHost);
        error = checkCUDAError(" CUDA memcpy_backtoHost: ");
    }

    // Single cleanup path shared by success and all failures.
    if (d_a != NULL) {
        cudaFree(d_a);
    }
    if (d_b != NULL) {
        cudaFree(d_b);
    }
    my_stuktura->cuda_err = error;
    cudaThreadExit();
    return my_stuktura;
}
// Reads (and clears) the last CUDA runtime error via cudaGetLastError().
//
// Parameters:
//   msg - label for the step being checked; it is placed in front of the
//         CUDA error text. NULL is treated as an empty label.
//
// Returns:
//   - the string literal "noerr" when no error is pending;
//   - otherwise a malloc'ed string "<msg><cudaGetErrorString(err)>";
//   - a fixed literal message if that string cannot be allocated.
// Only the second kind of result is heap-allocated.
char *checkCUDAError(const char *msg)
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Must stay the literal "noerr": callers test for this value.
        return const_cast<char *>("noerr");
    }

    const char *label = (msg != NULL) ? msg : "";
    const char *detail = cudaGetErrorString(err);
    const size_t labelLen = strlen(label);
    const size_t detailLen = strlen(detail);

    // One byte per character plus the terminating NUL.
    char *error_message = (char *) malloc(labelLen + detailLen + 1);
    if (error_message == NULL) {
        return const_cast<char *>("CUDA error (out of memory while building the error message)");
    }

    memcpy(error_message, label, labelLen);
    memcpy(error_message + labelLen, detail, detailLen + 1); // copies the NUL too
    return error_message;
}
[/codebox]