Hi guys,
I have a tricky problem with my CUDA code. I wrote a kernel and it works fine with a small number of data/threads. With 38 it works fine, if I use more I receive an error “cudaCheckError() with sync failed at kernel_nomm.cu:962 : the launch timed out and was terminated.”. The data of the 39th entry is ok, it works on a lower position. Anyway, I know that the kernel-source itself works, I make a printf which shows me the kernel results, and they are ok. But if I copy my result array back to the host, the data is lost (all zero). With 38 threads, i get my results to the host.
I removed some lines from the following code, to make it more easily to read. The full source is attached. The critical array is resultlist. I allocated space on host an device for it, calculate on the device and try to copy it back to host, but it doesn’t work. I use X blocks and only one thread/block, because I read that the error my come if the registers are full.
With 38 threads the code takes about 200ms to run (whole application), so the mysterious 5sec (for the launch time out error) border is not reached. The same error also occurs if I run the code without X11 (init 3). I use a Centos 6 and CudaToolkit 4.0. I compile with the -arch = sm_20 option and use a Q600 card.
void kernel_wrapper_nomm(bool direction, uint32_t *bwt, uint32_t *checkpoints, uint32_t *markedrows, uint32_t length, uint32_t numcheckpoints, char *SRs, char *phreds, int numsr, int srlength, uint32_t *fchr, uint32_t dollarpos)
{
//create List with offests for solution
uint32_t *resultlist = (uint32_t *)malloc(sizeof(uint32_t)*numsr);
//create a list of random numbers
int rnum = 100; //number of random values generated
uint32_t randomnum[rnum];
//initialize
memset(resultlist, 0, sizeof(uint32_t)*numsr);
for (int i=0; i< rnum; i++){
randomnum[i] = random();
}
//calc sizes
size_t s_bwt = sizeof(uint32_t)*((length+1)/16+1); //length + 1 because $ is also added
...
size_t s_resultlist = sizeof(uint32_t)*numsr;
//CPU Timer (only seconds)
timeval start, end; //Variables to get time for copy to device
gettimeofday(&start, 0); //set Walltime start Clock
//set CUDA-Timers
uint kernelTime;
cutCreateTimer(&kernelTime);
cutStartTimer(kernelTime);
//------------------ALLOCATE BEGIN--------------------
CudaSafeCall(cudaMalloc((void**)&d_bwt, s_bwt)); //allocate memory for bwt
....
CudaSafeCall(cudaMalloc((void**)&d_phreds, s_phred)); //allocate memory for phredscores
CUDA_CHECK_ERROR
CudaSafeCall(cudaMalloc((void**)&d_resultlist, s_resultlist)); //allocate memory for resultlist
CudaSafeCall(cudaMemset(d_resultlist, 0, s_resultlist));
CUDA_CHECK_ERROR
cutStopTimer(kernelTime);
gettimeofday(&end, 0); //get Walltime end time
cutResetTimer(kernelTime);
//SetTimer
gettimeofday(&start, 0); //set Walltime start Clock
cutStartTimer(kernelTime); //set GPU-Timer
cudaMemcpy(d_bwt, bwt, s_bwt, cudaMemcpyHostToDevice); //copy to Device
...
cudaMemcpy(d_phreds, phreds, s_phred, cudaMemcpyHostToDevice); //copy to Device
uint32_t dollarint = dollarpos/16;
cudaMemcpyToSymbol(c_length, &length, sizeof(uint32_t),0, cudaMemcpyHostToDevice);
....
cudaMemcpyToSymbol(c_dollarint, &dollarint,sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
cutStopTimer(kernelTime); //stop GPU-Timer
gettimeofday(&end, 0); //get Walltime end time
cutResetTimer(kernelTime);
uint32_t blocks = 0; //store number of blocks
uint32_t threads = MAXTHREADS;
if(numsr < 14*MAXTHREADS) threads = 1; //14 is the number of multiprocessors
if(numsr%threads==0){ //if SRs are a multiple of 1024
blocks = numsr/threads;
}else{ //if SRs are not a multiple of 1024
blocks = numsr/threads +1;
}
if(blocks == 1) threads = numsr;
#ifdef VERBOSE
printf("Start Kernel with: %i blocks, %i threads per block\n", blocks, threads);
#endif
MemKernel<<<1,1>>>(d_bwt, d_checkpoints, d_markedrows, d_resultlist, d_sr); //set pointers
ExactKernel<<<blocks,threads>>>();
cutStopTimer(kernelTime); //stop GPU-Timer
gettimeofday(&end, 0); //get Walltime end time
//copy results back
//##################### CRITICAL PART ############################
CudaSafeCall(cudaMemcpy( resultlist, d_resultlist, s_resultlist, cudaMemcpyDeviceToHost));
CudaCheckError();
//print result
printf("RESULTS: \n");
for(int i=0; i<numsr; i++){
//if(resultlist[i]!=0)
printf("%i: %u \n", i, resultlist[i]);
}
//##################### CRITICAL PART ############################
//Clear device Memory
if(GPUVERBOSE) printf("Free CUDA memory\n");
CudaSafeCall(cudaFree(d_bwt));
CudaSafeCall(cudaFree(d_checkpoints));
CudaSafeCall(cudaFree(d_markedrows));
CudaSafeCall(cudaFree(d_direction));
CudaSafeCall(cudaFree(d_resultlist));
// CudaSafeCall(cudaFree(&c_length)); //makes error for illegal adress
// CudaSafeCall(cudaFree(&c_numcheckpoints));
// CudaSafeCall(cudaFree(&c_numsr));
// CudaSafeCall(cudaFree(&c_srlength));
// CudaSafeCall(cudaFree(&c_dollarpos));
CudaCheckError();
}
The output is as following:
Read offsets from file...
Wallclock Index HDD->RAM: 55 ms
Wallclock SRs HDD->RAM: 13 ms
s_resultlist: 204
Allocate Memory on device
CUDAMALLOC: resultlist reserved size 204
CUDAMEMSET: resultlist == 0 with size 204
Memory allocated; CPU time = % ms.; GPU time = 338.661987 ms;
Copy Ewbt to Device:
bwt copied
Checkpoints copied
marked rows copied
SRs (DNA) copied
phreds copied
copy constants to device
GPU-cudaMemcpy-time: 1.741000 ms
Walltime-cudaMemcpy-time: 1 ms
Start Kernel with: 51 blocks, 1 threads per block
GPU-kernel launch time: 1.432000 ms
Walltime-kernel launch time: % ms
Copy back results with size 204
cudaCheckError() with sync failed at kernel_nomm.cu:962 : the launch timed out and was terminated.
cudaSafeCall() failed at kernel_nomm.cu:973 : the launch timed out and was terminated
cudaSafeCall() failed at kernel_nomm.cu:974 : the launch timed out and was terminated
cudaSafeCall() failed at kernel_nomm.cu:975 : the launch timed out and was terminated
cudaSafeCall() failed at kernel_nomm.cu:976 : the launch timed out and was terminated
cudaSafeCall() failed at kernel_nomm.cu:977 : the launch timed out and was terminated
cudaCheckError() failed at kernel_nomm.cu:983 : the launch timed out and was terminated.
cudaCheckError() with sync failed at kernel_nomm.cu:983 : the launch timed out and was terminated.
{13} k_resultlist[13] = 958824 //####### the following lines show me that the calculation is done
{5} k_resultlist[5] = 4249841
{19} k_resultlist[19] = 2849230
{34} k_resultlist[34] = 4582962
{24} k_resultlist[24] = 3979632
{7} k_resultlist[7] = 4086913
{42} k_resultlist[42] = 4379768
{32} k_resultlist[32] = 2097232
{20} k_resultlist[20] = 396703
{48} k_resultlist[48] = 3640348
{50} k_resultlist[50] = 4191309
RESULTS: //but on host side, there only zeros
0: 0
1: 0
2: 0
3: 0
4: 0
5: 0
6: 0
.... //cut for reading purposes
49: 0
50: 0
Free CUDA memory
Programruntime: 11295 ms
Bowtie GPU end
Thank you for any suggestions…