problem with more data

Hi guys,

I have a tricky problem with my CUDA code. I wrote a kernel and it works fine with a small number of data/threads. With 38 it works fine, if I use more I receive an error “cudaCheckError() with sync failed at kernel_nomm.cu:962 : the launch timed out and was terminated.”. The data of the 39th entry is ok, it works on a lower position. Anyway, I know that the kernel-source itself works, I make a printf which shows me the kernel results, and they are ok. But if I copy my result array back to the host, the data is lost (all zero). With 38 threads, i get my results to the host.

I removed some lines from the following code, to make it more easily to read. The full source is attached. The critical array is resultlist. I allocated space on host an device for it, calculate on the device and try to copy it back to host, but it doesn’t work. I use X blocks and only one thread/block, because I read that the error my come if the registers are full.

With 38 threads the code takes about 200ms to run (whole application), so the mysterious 5sec (for the launch time out error) border is not reached. The same error also occurs if I run the code without X11 (init 3). I use a Centos 6 and CudaToolkit 4.0. I compile with the -arch = sm_20 option and use a Q600 card.

void kernel_wrapper_nomm(bool direction, uint32_t *bwt, uint32_t *checkpoints, uint32_t *markedrows, uint32_t length, uint32_t numcheckpoints, char *SRs, char *phreds, int numsr, int srlength, uint32_t *fchr, uint32_t dollarpos)

{

    //create List with offests for solution

    uint32_t *resultlist = (uint32_t *)malloc(sizeof(uint32_t)*numsr);

    //create a list of random numbers

    int rnum = 100; //number of random values generated

    uint32_t randomnum[rnum];

//initialize

    memset(resultlist, 0, sizeof(uint32_t)*numsr);

    for (int i=0; i< rnum; i++){

        randomnum[i] = random();

    }

//calc sizes

    size_t s_bwt = sizeof(uint32_t)*((length+1)/16+1); //length + 1 because $ is also added

...

    size_t s_resultlist = sizeof(uint32_t)*numsr;

//CPU Timer (only seconds)

    timeval start, end;    //Variables to get time for copy to device

    gettimeofday(&start, 0); //set Walltime start Clock

//set CUDA-Timers

    uint kernelTime;

    cutCreateTimer(&kernelTime);

    cutStartTimer(kernelTime);

//------------------ALLOCATE BEGIN--------------------

    CudaSafeCall(cudaMalloc((void**)&d_bwt, s_bwt));                     //allocate memory for bwt

....

    CudaSafeCall(cudaMalloc((void**)&d_phreds, s_phred));             //allocate memory for phredscores

CUDA_CHECK_ERROR

CudaSafeCall(cudaMalloc((void**)&d_resultlist, s_resultlist));     //allocate memory for resultlist

    CudaSafeCall(cudaMemset(d_resultlist, 0, s_resultlist));

CUDA_CHECK_ERROR

cutStopTimer(kernelTime);

    gettimeofday(&end, 0); //get Walltime end time

    cutResetTimer(kernelTime);

//SetTimer

    gettimeofday(&start, 0); //set Walltime start Clock

    cutStartTimer(kernelTime); //set GPU-Timer

cudaMemcpy(d_bwt, bwt, s_bwt, cudaMemcpyHostToDevice); //copy to Device

...

    cudaMemcpy(d_phreds, phreds, s_phred, cudaMemcpyHostToDevice); //copy to Device

uint32_t dollarint = dollarpos/16;

cudaMemcpyToSymbol(c_length, &length, sizeof(uint32_t),0, cudaMemcpyHostToDevice);

....

    cudaMemcpyToSymbol(c_dollarint, &dollarint,sizeof(uint32_t), 0, cudaMemcpyHostToDevice);

cutStopTimer(kernelTime); //stop GPU-Timer

    gettimeofday(&end, 0); //get Walltime end time

cutResetTimer(kernelTime);

uint32_t blocks = 0;        //store number of blocks

    uint32_t threads = MAXTHREADS;

    if(numsr < 14*MAXTHREADS) threads = 1; //14 is the number of multiprocessors

    if(numsr%threads==0){    //if SRs are a multiple of 1024

        blocks = numsr/threads;

    }else{                        //if SRs are not a multiple of 1024

        blocks = numsr/threads +1;

    }

    if(blocks == 1) threads = numsr;

#ifdef VERBOSE

    printf("Start Kernel with: %i blocks, %i threads per block\n", blocks, threads);

#endif

MemKernel<<<1,1>>>(d_bwt, d_checkpoints, d_markedrows, d_resultlist, d_sr); //set pointers

    ExactKernel<<<blocks,threads>>>();

cutStopTimer(kernelTime); //stop GPU-Timer

    gettimeofday(&end, 0); //get Walltime end time

//copy results back

//##################### CRITICAL PART ############################

CudaSafeCall(cudaMemcpy( resultlist, d_resultlist, s_resultlist, cudaMemcpyDeviceToHost));

    CudaCheckError();

//print result

    printf("RESULTS: \n");

    for(int i=0; i<numsr; i++){

        //if(resultlist[i]!=0)

            printf("%i: %u \n", i, resultlist[i]);

    }

//##################### CRITICAL PART ############################

//Clear device Memory

    if(GPUVERBOSE) printf("Free CUDA memory\n");

    CudaSafeCall(cudaFree(d_bwt));

    CudaSafeCall(cudaFree(d_checkpoints));

    CudaSafeCall(cudaFree(d_markedrows));

    CudaSafeCall(cudaFree(d_direction));

    CudaSafeCall(cudaFree(d_resultlist));

//    CudaSafeCall(cudaFree(&c_length)); //makes error for illegal adress

//    CudaSafeCall(cudaFree(&c_numcheckpoints));

//    CudaSafeCall(cudaFree(&c_numsr));

//    CudaSafeCall(cudaFree(&c_srlength));

//    CudaSafeCall(cudaFree(&c_dollarpos));

    CudaCheckError();

}

The output is as following:

Read offsets from file...

Wallclock Index HDD->RAM: 55 ms

Wallclock SRs HDD->RAM: 13 ms

	 s_resultlist: 	 204 

Allocate Memory on device 

CUDAMALLOC: resultlist reserved size 204

CUDAMEMSET: resultlist == 0 with size 204

Memory allocated; CPU time = % ms.; GPU time = 338.661987 ms;

Copy Ewbt to Device:

	 bwt copied 

	 Checkpoints copied 

	 marked rows copied 

	 SRs (DNA) copied 

	 phreds copied 

copy constants to device

GPU-cudaMemcpy-time: 1.741000 ms

Walltime-cudaMemcpy-time: 1 ms

Start Kernel with: 51 blocks, 1 threads per block

GPU-kernel launch time: 1.432000 ms

Walltime-kernel launch time: % ms

Copy back results with size 204

cudaCheckError() with sync failed at kernel_nomm.cu:962 : the launch timed out and was terminated.

cudaSafeCall() failed at kernel_nomm.cu:973 : the launch timed out and was terminated

cudaSafeCall() failed at kernel_nomm.cu:974 : the launch timed out and was terminated

cudaSafeCall() failed at kernel_nomm.cu:975 : the launch timed out and was terminated

cudaSafeCall() failed at kernel_nomm.cu:976 : the launch timed out and was terminated

cudaSafeCall() failed at kernel_nomm.cu:977 : the launch timed out and was terminated

cudaCheckError() failed at kernel_nomm.cu:983 : the launch timed out and was terminated.

cudaCheckError() with sync failed at kernel_nomm.cu:983 : the launch timed out and was terminated.

{13} k_resultlist[13] = 958824     //####### the following lines show me that the calculation is done

{5} k_resultlist[5] = 4249841

{19} k_resultlist[19] = 2849230

{34} k_resultlist[34] = 4582962

{24} k_resultlist[24] = 3979632

{7} k_resultlist[7] = 4086913

{42} k_resultlist[42] = 4379768

{32} k_resultlist[32] = 2097232

{20} k_resultlist[20] = 396703

{48} k_resultlist[48] = 3640348

{50} k_resultlist[50] = 4191309

RESULTS: //but on host side, there only zeros

0: 0 

1: 0 

2: 0 

3: 0 

4: 0 

5: 0 

6: 0 

.... //cut for reading purposes

49: 0 

50: 0 

Free CUDA memory

Programruntime: 11295 ms

Bowtie GPU end

Thank you for any suggestions…

Hi, after a code review and a lot of tests the error is related to the input data, so its my own issue.