I have written the kernel below, and I use a Tesla C870, on a Fedora 8, with the latest nvidia drivers, and toolkit 2.0. My computer has another Quadro FX1500 (which is not CUDA compatible by the way).
When I restart my computer and I execute this kernel, for the first two or three times it runs fine (in the sense that it executes in a reasonable time, and it is not stuck anywhere). After a few runs, though, it just hangs, and I have to terminate the execution of the program. In emulation mode it runs fine, though, and the few of the examples of the SDK that I have tried run fine.
Do you have any idea what might be the problem? I am almost sure that it is not a problem of the kernel (if it was the kernel it would not execute at all, not even the first few times after I restart), but it has to do with something else (maybe the drivers).
I used to have the same problems with the previous drivers and toolkit, and I thought the upgrade would help, but it hasn’t.
Another thing I have noticed (not only for this kernel, but for others as well) is that when I run the kernel for the fist time it is slower, than the sesbsequent times I run it. Has anybody else noticed this?
P.S.1: I tried to indent the code, but the indentation is moved when I post the code.
P.S.2: I know I can optimize the speed of this kernel by replacing the modulo and the divisions with more efficient logical and shifting operations, but I don’t care about speed at this point.
global void
BLAST_OperationsIntensive_kernel( int* d_Database2Dpadded, int* d_RepeatedQuery, int *d_RepeatedSubstitutionMatrix, int *d_Hits, int NumSequences, int MaxLength, int Query_length, int Query_actual_length, int SubstitutionMatrix_length){
int bx = blockIdx.x;
int tx = threadIdx.x;
int blockDimx = blockDim.x;
//load the Query to shared memory
extern shared int array;
int* Query_AA_shared = (int*)array;; // I have to declare Query_AA_shared as int to achieve bank conflict free shared memory accessing
int* SubstitutionMatrix_shared_int = (int*)&Query_AA_shared[Query_length>>2];
//Load Query from global to shared memory
int Query_Index = tx;
Query_length >>= 2; //The Query is accessed as integers to/from global and shared memory to achieve coalescing.
while( Query_Index < Query_length ){ //However, each integer contains 4 AA letters, thus Query_length is divided by 4.
Query_AA_shared[ Query_Index ] = d_RepeatedQuery[ bx * Query_length + Query_Index ];
Query_Index += blockDimx;
}
int SubstitutionMatrix_Index = tx;
while( SubstitutionMatrix_Index < (SubstitutionMatrix_length>>2) ){//SubstitutionMatrix_length is a multiple of 64 which includes some padded zeros
//so there is some waste of memory space. Can it be recovered?
SubstitutionMatrix_shared_int[ SubstitutionMatrix_Index ] = d_RepeatedSubstitutionMatrix[bx*(SubstitutionMatrix_length>>2) + SubstitutionMatrix_Index ];
SubstitutionMatrix_Index += blockDimx;
}
__syncthreads();
char* SubstitutionMatrix_shared_char = (char*)SubstitutionMatrix_shared_int;
int Sequence_Index = (bx * blockDimx + tx);
Query_length <<= 2; //Restore the value of the Query_length
char Query_AA[8] = {0,1,2,3,4,5,6,7};
int Query_AAI = 0x00010203, Query_AAII = 0x04050607;
char Sequence_AA[8] = {0,1,2,3,4,5,6,7};
int Sequence_AAI = 0x00010203, Sequence_AAII = 0x04050607;
char A[10] = {0,1,2,3,4,5,6,7,8,9};
while( Sequence_Index < NumSequences ){
Sequence_AAI = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + 0];
Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >> 0;
Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >> 8;
Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;
for( int j = 0; j < MaxLength - 2; ++j ){
if( (j%4 == 0) & ((j / 4) % 2 == 1) ){
Sequence_AAI = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + j/4 + 1 ];
Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >> 0;
Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >> 8;
Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;
}else if( (j%4 == 0) & ((j / 4) % 2 == 0) ){
Sequence_AAII = d_Database2Dpadded[ Sequence_Index*(MaxLength>>2) + j/4 + 1 ];
Sequence_AA[4] = (Sequence_AAII & 0x000000FF) >> 0;
Sequence_AA[5] = (Sequence_AAII & 0x0000FF00) >> 8;
Sequence_AA[6] = (Sequence_AAII & 0x00FF0000) >> 16;
Sequence_AA[7] = (Sequence_AAII & 0xFF000000) >> 24;
}
Query_AAI = Query_AA_shared[ 0 ];
Query_AA[0] = (Query_AAI & 0x000000FF) >> 0;
Query_AA[1] = (Query_AAI & 0x0000FF00) >> 8;
Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;
for( int i = 0; i < Query_actual_length - 2; ++i){
if( (i%4 == 0) & ((i / 4) % 2 == 1) ){
Query_AAI = Query_AA_shared( i / 4 + 1);
Query_AA[0] = (Query_AAI & 0x000000FF) >> 0;
Query_AA[1] = (Query_AAI & 0x0000FF00) >> 8;
Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;
*d_Database2Dpadded = A[i];
}else if( (i%4 == 0) & ((i / 4) % 2 == 0) ){
Query_AAII = Query_AA_shared( i / 4 + 1);
Query_AA[4] = (Query_AAII & 0x000000FF) >> 0;
Query_AA[5] = (Query_AAII & 0x0000FF00) >> 8;
Query_AA[6] = (Query_AAII & 0x00FF0000) >> 16;
Query_AA[7] = (Query_AAII & 0xFF000000) >> 24;
}
int Score = SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 0)%8] * 25 + Query_AA[ (i + 0)%8 ] ] +
SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 1)%8] * 25 + Query_AA[ (i + 1)%8 ] ]+
SubstitutionMatrix_shared_char[ Sequence_AA[ (j + 2)%8] * 25 + Query_AA[ (i + 2)%8 ] ];
if(Score > 10){
}
}
}
Sequence_Index += (gridDim.x * blockDimx);
}
}