GPU kernel hangs

I have written the kernel below, and I use a Tesla C870, on a Fedora 8, with the latest nvidia drivers, and toolkit 2.0. My computer has another Quadro FX1500 (which is not CUDA compatible by the way).

When I restart my computer and I execute this kernel, for the first two or three times it runs fine (in the sense that it executes in a reasonable time, and it is not stuck anywhere). After a few runs, though, it just hangs, and I have to terminate the execution of the program. In emulation mode it runs fine, though, and the few of the examples of the SDK that I have tried run fine.

Do you have any idea what might be the problem? I am almost sure that it is not a problem of the kernel (if it was the kernel it would not execute at all, not even the first few times after I restart), but it has to do with something else (maybe the drivers).

I used to have the same problems with the previous drivers and toolkit, and I thought the upgrade would help, but it hasn’t.

Another thing I have noticed (not only for this kernel, but for others as well) is that when I run the kernel for the fist time it is slower, than the sesbsequent times I run it. Has anybody else noticed this?

P.S.1: I tried to indent the code, but the indentation is moved when I post the code.

P.S.2: I know I can optimize the speed of this kernel by replacing the modulo and the divisions with more efficient logical and shifting operations, but I don’t care about speed at this point.

global void
BLAST_OperationsIntensive_kernel( int* d_Database2Dpadded, int* d_RepeatedQuery, int *d_RepeatedSubstitutionMatrix, int *d_Hits, int NumSequences, int MaxLength, int Query_length, int Query_actual_length, int SubstitutionMatrix_length){

int bx = blockIdx.x;
int tx = threadIdx.x;
int blockDimx = blockDim.x;

//load the Query to shared memory
extern shared int array;
int* Query_AA_shared = (int*)array;; // I have to declare Query_AA_shared as int to achieve bank conflict free shared memory accessing
int* SubstitutionMatrix_shared_int = (int*)&Query_AA_shared[Query_length>>2];

//Load Query from global to shared memory
int Query_Index = tx;
Query_length >>= 2; //The Query is accessed as integers to/from global and shared memory to achieve coalescing.
while( Query_Index < Query_length ){ //However, each integer contains 4 AA letters, thus Query_length is divided by 4.

Query_AA_shared[ Query_Index ] = d_RepeatedQuery[ bx * Query_length + Query_Index ];
    
Query_Index += blockDimx;

}

int SubstitutionMatrix_Index = tx;
while( SubstitutionMatrix_Index < (SubstitutionMatrix_length>>2) ){//SubstitutionMatrix_length is a multiple of 64 which includes some padded zeros
//so there is some waste of memory space. Can it be recovered?
SubstitutionMatrix_shared_int[ SubstitutionMatrix_Index ] = d_RepeatedSubstitutionMatrix[bx*(SubstitutionMatrix_length>>2) + SubstitutionMatrix_Index ];

SubstitutionMatrix_Index += blockDimx;

}

__syncthreads();

char* SubstitutionMatrix_shared_char = (char*)SubstitutionMatrix_shared_int;

int Sequence_Index = (bx * blockDimx + tx);
Query_length <<= 2; //Restore the value of the Query_length
char Query_AA[8] = {0,1,2,3,4,5,6,7};
int Query_AAI = 0x00010203, Query_AAII = 0x04050607;
char Sequence_AA[8] = {0,1,2,3,4,5,6,7};
int Sequence_AAI = 0x00010203, Sequence_AAII = 0x04050607;
char A[10] = {0,1,2,3,4,5,6,7,8,9};
while( Sequence_Index < NumSequences ){

    Sequence_AAI = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + 0];
    
    Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >>  0;
    Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >>  8;
    Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
    Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;

  for( int j = 0; j < MaxLength - 2; ++j ){

        if( (j%4 == 0) & ((j / 4) % 2 == 1) ){
            
            Sequence_AAI  = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + j/4 + 1 ];
            Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >>  0;
            Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >>  8;
            Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
            Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;
            
        }else if( (j%4 == 0) & ((j / 4) % 2 == 0) ){
            
            Sequence_AAII  = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + j/4 + 1 ];
            Sequence_AA[4] = (Sequence_AAII & 0x000000FF) >>  0;
            Sequence_AA[5] = (Sequence_AAII & 0x0000FF00) >>  8;
            Sequence_AA[6] = (Sequence_AAII & 0x00FF0000) >> 16;
            Sequence_AA[7] = (Sequence_AAII & 0xFF000000) >> 24;
            
        }

        Query_AAI = Query_AA_shared[ 0 ];
        Query_AA[0] = (Query_AAI & 0x000000FF) >>  0;
        Query_AA[1] = (Query_AAI & 0x0000FF00) >>  8;
        Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
        Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;

        for( int i = 0; i < Query_actual_length - 2; ++i){
            
            if( (i%4 == 0) & ((i / 4) % 2 == 1) ){
                
                Query_AAI = Query_AA_shared( i / 4 + 1);
                Query_AA[0] = (Query_AAI & 0x000000FF) >>  0;
                Query_AA[1] = (Query_AAI & 0x0000FF00) >>  8;
                Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
                Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;
                *d_Database2Dpadded = A[i];
                
            }else if( (i%4 == 0) & ((i / 4) % 2 == 0) ){
                
                Query_AAII = Query_AA_shared( i / 4 + 1);
                Query_AA[4] = (Query_AAII & 0x000000FF) >>  0;
                Query_AA[5] = (Query_AAII & 0x0000FF00) >>  8;
                Query_AA[6] = (Query_AAII & 0x00FF0000) >> 16;
                Query_AA[7] = (Query_AAII & 0xFF000000) >> 24;

            }

            int Score =  SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 0)%8] * 25 + Query_AA[ (i + 0)%8 ] ] +
                SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 1)%8] * 25 + Query_AA[ (i + 1)%8 ] ]+                    
                SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 2)%8] * 25 + Query_AA[ (i + 2)%8 ] ];

            if(Score > 10){

                
                
     }

        }
    
    }
    
    Sequence_Index += (gridDim.x * blockDimx);
}

}

Compile with -deviceemu, use valgrind, check for out of bounds memory accesses.

I think there might be some compatibility issue or something went wriong, You need to check thoroughly

I used valgrind and I managed to mitigate the problem I had. At least now the kernel doesn’t stuck completely, but it delays a lot to execute, and it happens in a weird way. I have before and after the call of the kernel a “start timer” and a
'stop timer" function to see how much times it takes to execute. The weird thing is that although the time reported is of the order of 0.1 ms, after the execution the program hangs for a few seconds (10 - 20 sec) and then proceeds further. And the hanging happens after the “printf” that reports the time of the kernel!!

Any ideas what might be the problem?

In a previous post tmurray told me to run my code with valgrind and check for out of bounds memory accesses.
Below is the output of valgrind with -v and --leak-check=full on emudebug. As you can see there are only 24 bytes lost (I think I know where they are lost from, but they don’t cause the problem).

Moreover, if I use valgrind on realese, emurelease, debug, and emudebug I get different results. Which ones should I trust more?

[pvouzis@alexander BLAST]$ valgrind -v --leak-check=full …/…/bin/linux/emudebug/BLAST -s100
==3599== Memcheck, a memory error detector.
==3599== Copyright © 2002-2008, and GNU GPL’d, by Julian Seward et al.
==3599== Using LibVEX rev 1878, a library for dynamic binary translation.
==3599== Copyright © 2004-2008, and GNU GPL’d, by OpenWorks LLP.
==3599== Using valgrind-3.4.0, a dynamic binary instrumentation framework.
==3599== Copyright © 2000-2008, and GNU GPL’d, by Julian Seward et al.
==3599==
–3599-- Command line
–3599-- …/…/bin/linux/emudebug/BLAST
–3599-- -s100
–3599-- Startup, with flags:
–3599-- -v
–3599-- --leak-check=full
–3599-- Contents of /proc/version:
–3599-- Linux version 2.6.26.8-57.fc8 (mockbuild@x86-3.fedora.phx.redhat.com) (gcc version 4.1.2 20070925 (Red Hat 4.1.2-33)) #1 SMP Thu Dec 18 19:19:45 EST 2008
–3599-- Arch and hwcaps: X86, x86-sse1-sse2
–3599-- Page sizes: currently 4096, max supported 4096
–3599-- Valgrind library directory: /usr/local/lib/valgrind
–3599-- Reading syms from /lib/ld-2.7.so (0x95e000)
–3599-- Reading syms from /home/pvouzis/Documents/CUDA/CUDA_SDK2.0/bin/linux/emudebug/BLAST (0x8048000)
–3599-- Reading syms from /usr/local/lib/valgrind/x86-linux/memcheck (0x38000000)
–3599-- object doesn’t have a dynamic symbol table
–3599-- Reading suppressions file: /usr/local/lib/valgrind/default.supp
–3599-- REDIR: 0x973bb0 (index) redirected to 0x3803a863 (vgPlain_x86_linux_REDIR_FOR_index)
–3599-- Reading syms from /usr/local/lib/valgrind/x86-linux/vgpreload_core.so (0x4001000)
–3599-- Reading syms from /usr/local/lib/valgrind/x86-linux/vgpreload_memcheck.so (0x4003000)
==3599== WARNING: new redirection conflicts with existing – ignoring it
–3599-- new: 0x00973bb0 (index ) R-> 0x04006840 index
–3599-- REDIR: 0x973d50 (strlen) redirected to 0x40069f0 (strlen)
–3599-- Reading syms from /usr/local/cuda/lib/libcudart.so.2.0 (0x400a000)
–3599-- object doesn’t have a symbol table
–3599-- Reading syms from /usr/lib/libstdc++.so.6.0.8 (0x4f34000)
–3599-- object doesn’t have a symbol table
–3599-- Reading syms from /lib/libm-2.7.so (0xad8000)
–3599-- Reading syms from /lib/libgcc_s-4.1.2-20070925.so.1 (0x8bc000)
–3599-- object doesn’t have a symbol table
–3599-- Reading syms from /lib/libc-2.7.so (0x97d000)
–3599-- Reading syms from /lib/libdl-2.7.so (0xb03000)
–3599-- Reading syms from /lib/libpthread-2.7.so (0xb0a000)
–3599-- Reading syms from /lib/librt-2.7.so (0xc05000)
–3599-- REDIR: 0x9ee3a0 (memset) redirected to 0x4006d50 (memset)
–3599-- REDIR: 0x9ee890 (memcpy) redirected to 0x4007c40 (memcpy)
–3599-- REDIR: 0x9ed4c0 (rindex) redirected to 0x4006720 (rindex)
–3599-- REDIR: 0x9ed120 (strlen) redirected to 0x40069d0 (strlen)
–3599-- REDIR: 0x9e8810 (calloc) redirected to 0x4004b80 (calloc)
–3599-- REDIR: 0x9ea250 (realloc) redirected to 0x4005b20 (realloc)
–3599-- REDIR: 0x9e8c80 (memalign) redirected to 0x4004a30 (memalign)
–3599-- REDIR: 0x9e8b00 (malloc) redirected to 0x4005a60 (malloc)
–3599-- REDIR: 0x4fe8b80 (operator new(unsigned int)) redirected to 0x4006240 (operator new(unsigned int))
–3599-- REDIR: 0x9ee330 (memmove) redirected to 0x4006db0 (memmove)

Reading Database
–3599-- REDIR: 0x9edea0 (memchr) redirected to 0x4006bc0 (memchr)
–3599-- REDIR: 0x9ec8a0 (strcat) redirected to 0x40071d0 (strcat)
–3599-- REDIR: 0x9ea060 (free) redirected to 0x4005600 (free)
…Done
–3599-- REDIR: 0x9ef220 (strchrnul) redirected to 0x4006e30 (strchrnul)
–3599-- REDIR: 0x9ee400 (mempcpy) redirected to 0x40074a0 (mempcpy)
Sequence_MaxLength = 64

Reading Query…DoneGlobal Memory Allocated bytes = 5054976
–3599-- Reading syms from /usr/lib/libcuda.so.177.73 (0x447a000)
–3599-- Reading syms from /lib/libz.so.1.2.3 (0xb25000)
–3599-- object doesn’t have a symbol table
–3599-- REDIR: 0x9ed410 (strncpy) redirected to 0x4007db0 (strncpy)
–3599-- REDIR: 0x9ef150 (rawmemchr) redirected to 0x4006e60 (rawmemchr)
–3599-- REDIR: 0x4fe76d0 (operator delete(void*)) redirected to 0x4005290 (operator delete(void*))
Allocating GPU memory…Done
Copying to GPU memory…Done
–3599-- REDIR: 0x4fe8cc0 (operator new(unsigned int)) redirected to 0x4005ec0 (operator new(unsigned int))
–3599-- REDIR: 0x4fe7730 (operator delete) redirected to 0x4004e70 (operator delete)
Kernel Execution…Done

Cut error in file ‘BLAST.cu’ in line 198.
Transferring data from GPU==3599==
==3599== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 28 from 1)
–3599–
–3599-- supp: 28 dl-hack3-cond-1
==3599== malloc/free: in use at exit: 5,069,385 bytes in 225 blocks.
==3599== malloc/free: 1,288 allocs, 1,063 frees, 10,303,301 bytes allocated.
==3599==
==3599== searching for pointers to 225 not-freed blocks.
==3599== checked 1,366,916 bytes.
==3599==
==3599== 24 bytes in 1 blocks are definitely lost in loss record 4 of 23
==3599== at 0x40062A4: operator new(unsigned int) (vg_replace_malloc.c:224)
==3599== by 0x805A0AE: StopWatch::create() (stopwatch.cpp:92)
==3599== by 0x804FC92: cutCreateTimer (cutil.cpp:1391)
==3599== by 0x804DB9F: BLAST_OperationsIntensive(int, char**) (BLAST.cu:87)
==3599== by 0x804EC99: main (BLAST.cu:65)
==3599==
==3599== LEAK SUMMARY:
==3599== definitely lost: 24 bytes in 1 blocks.
==3599== possibly lost: 0 bytes in 0 blocks.
==3599== still reachable: 5,069,361 bytes in 224 blocks.
==3599== suppressed: 0 bytes in 0 blocks.
==3599== Reachable blocks (those to which a pointer was found) are not shown.
==3599== To see them, rerun with: --leak-check=full --show-reachable=yes
–3599-- memcheck: sanity checks: 318 cheap, 12 expensive
–3599-- memcheck: auxmaps: 0 auxmap entries (0k, 0M) in use
–3599-- memcheck: auxmaps_L1: 0 searches, 0 cmps, ratio 0:10
–3599-- memcheck: auxmaps_L2: 0 searches, 0 nodes
–3599-- memcheck: SMs: n_issued = 371 (5936k, 5M)
–3599-- memcheck: SMs: n_deissued = 44 (704k, 0M)
–3599-- memcheck: SMs: max_noaccess = 65535 (1048560k, 1023M)
–3599-- memcheck: SMs: max_undefined = 120 (1920k, 1M)
–3599-- memcheck: SMs: max_defined = 835 (13360k, 13M)
–3599-- memcheck: SMs: max_non_DSM = 370 (5920k, 5M)
–3599-- memcheck: max sec V bit nodes: 0 (0k, 0M)
–3599-- memcheck: set_sec_vbits8 calls: 0 (new: 0, updates: 0)
–3599-- memcheck: max shadow mem size: 6224k, 6M
–3599-- translate: fast SP updates identified: 8,200 ( 88.3%)
–3599-- translate: generic_known SP updates identified: 667 ( 7.1%)
–3599-- translate: generic_unknown SP updates identified: 412 ( 4.4%)
–3599-- tt/tc: 13,766 tt lookups requiring 14,530 probes
–3599-- tt/tc: 13,766 fast-cache updates, 3 flushes
–3599-- transtab: new 6,419 (150,875 -> 2,128,160; ratio 141:10) [0 scs]
–3599-- transtab: dumped 0 (0 -> ??)
–3599-- transtab: discarded 8 (187 -> ??)
–3599-- scheduler: 31,971,004 jumps (bb entries).
–3599-- scheduler: 318/1,585,028 major/minor sched events.
–3599-- sanity: 319 cheap, 12 expensive checks.
–3599-- exectx: 1,543 lists, 1,167 contexts (avg 0 per list)
–3599-- exectx: 3,459 searches, 2,921 full compares (844 per 1000)
–3599-- exectx: 535 cmp2, 111 cmp4, 0 cmpAll
–3599-- errormgr: 36 supplist searches, 1,853 comparisons during search
–3599-- errormgr: 28 errlist searches, 111 comparisons during search