How can I avoid local memory?

I quote from the Programming Guide:

“However in some cases the compiler might choose to place it in local memory. This is often the case for large structures or arrays that would consume too much register space, and arrays for which the compiler cannot determine that they are indexed with constant quantities.”

This refers to the placement of variables. My problem is that I use an array that is not indexed by constant quantities, and the compiler places the array in local memory, which slows down the kernel. Is there a way to direct the compiler to place those arrays in registers?

I want to put them in registers and not in the shared memory.

I have tried using the keyword “register”, but this is merely a suggestion to the compiler and it doesn’t help in my case.

I paste the code here. The arrays I am talking about are the Query_AA[8] and Sequence_AA[8].

Thanks.

// Kernel fragment: for every database sequence handled by this thread, count
// the (sequence offset, query offset) pairs whose 3-residue window scores at
// least THRESHOLD, and write that count to d_Hits[Sequence_Index].
//
// Relies on names declared outside this fragment: Sequence_Index,
// NumSequences, Sequence_length, Sequence_actual_length, Query_length,
// Query_actual_length, d_Database2Dpadded, Query_AA_shared,
// SubstitutionMatrix_shared_char, ALPHABET_SIZE, THRESHOLD and d_Hits.
//
// Sequence_AA and Query_AA are used as 8-entry circular buffers (always indexed
// with "& 0x7"). Entries 0-3 and 4-7 are each refilled from one packed 32-bit
// word (4 residues per word), so a 3-residue window may straddle two words.
// Because those indices depend on the loop counters, they are not compile-time
// constants, which is what the surrounding question is about.
char Query_AA[8] = {0,1,2,3,4,5,6,7};
int Query_AAI = 0x00010203, Query_AAII = 0x04050607;
char Sequence_AA[8] = {0,1,2,3,4,5,6,7};
int Sequence_AAI = 0x00010203, Sequence_AAII = 0x04050607;

int Score = 0;

while( Sequence_Index < NumSequences ){

    int Hits = 0;

    // Preload word 0 of this sequence into the low half of the buffer.
    // Each sequence occupies (Sequence_length>>2) packed words.
    Sequence_AAI = d_Database2Dpadded[ Sequence_Index*(Sequence_length>>2) + 0];

    Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >>  0;
    Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >>  8;
    Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
    Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;

    Score = 0;

    for( int s_off = 0; s_off < Sequence_actual_length - 2; ++s_off ){

        // On every 4th offset, prefetch the NEXT packed word into the half of
        // the buffer that the window has just left.
        if( ((s_off&0x3) == 0) && (((s_off >> 2) & 0x1) == 1) && ((s_off>>2) < ((Sequence_length>>2) - 1)) ){

            // Odd word index: refill entries 0-3 (guarded against reading past
            // the last word of the sequence).
            Sequence_AAI = d_Database2Dpadded[ Sequence_Index*(Sequence_length>>2) + (s_off>>2) + 1 ];
            Sequence_AA[0] = (Sequence_AAI & 0x000000FF) >>  0;
            Sequence_AA[1] = (Sequence_AAI & 0x0000FF00) >>  8;
            Sequence_AA[2] = (Sequence_AAI & 0x00FF0000) >> 16;
            Sequence_AA[3] = (Sequence_AAI & 0xFF000000) >> 24;

        }else if( ((s_off&0x3) == 0) && ( ((s_off >> 2) & 0x1) == 0) ){

            // Even word index: refill entries 4-7.
            // NOTE(review): unlike the branch above, this refill has no
            // upper-bound check on (s_off>>2) + 1; presumably the padding of
            // d_Database2Dpadded covers the extra word - confirm.
            Sequence_AAII = d_Database2Dpadded[ Sequence_Index*(Sequence_length>>2) + (s_off>>2) + 1 ];
            Sequence_AA[4] = (Sequence_AAII & 0x000000FF) >>  0;
            Sequence_AA[5] = (Sequence_AAII & 0x0000FF00) >>  8;
            Sequence_AA[6] = (Sequence_AAII & 0x00FF0000) >> 16;
            Sequence_AA[7] = (Sequence_AAII & 0xFF000000) >> 24;

        }

        // Restart the query buffer from word 0 for each sequence offset.
        Query_AAI = Query_AA_shared[ 0 ];
        Query_AA[0] = (Query_AAI & 0x000000FF) >>  0;
        Query_AA[1] = (Query_AAI & 0x0000FF00) >>  8;
        Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
        Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;

        for( int q_off = 0; q_off < Query_actual_length - 2; ++q_off){

            // Same two-half refill scheme as for the sequence buffer.
            // NOTE(review): Query_AA_shared is accessed with [...] above but
            // with (...) in the two refills below; confirm whether it is an
            // array or a macro/function and make the accesses consistent.
            if( ((q_off&0x3) == 0) && ( ((q_off >> 2) & 0x1) == 1) && ((q_off>>2) < ((Query_length>>2) - 1 )) ){

                Query_AAI = Query_AA_shared( (q_off>>2) + 1);
                Query_AA[0] = (Query_AAI & 0x000000FF) >>  0;
                Query_AA[1] = (Query_AAI & 0x0000FF00) >>  8;
                Query_AA[2] = (Query_AAI & 0x00FF0000) >> 16;
                Query_AA[3] = (Query_AAI & 0xFF000000) >> 24;

            }else if( ((q_off&0x3) == 0) && ( ((q_off >> 2) & 0x1) == 0) ){

                Query_AAII = Query_AA_shared( (q_off>>2) + 1);
                Query_AA[4] = (Query_AAII & 0x000000FF) >>  0;
                Query_AA[5] = (Query_AAII & 0x0000FF00) >>  8;
                Query_AA[6] = (Query_AAII & 0x00FF0000) >> 16;
                Query_AA[7] = (Query_AAII & 0xFF000000) >> 24;

            }

            // Sum the substitution scores of the three aligned residue pairs;
            // the table is addressed as sequence_residue * ALPHABET_SIZE +
            // query_residue.
            Score =
                SubstitutionMatrix_shared_char( Sequence_AA[ (s_off + 0)&0x7] * ALPHABET_SIZE + Query_AA[ (q_off + 0)&0x7 ])+
                SubstitutionMatrix_shared_char( Sequence_AA[ (s_off + 1)&0x7] * ALPHABET_SIZE + Query_AA[ (q_off + 1)&0x7 ])+
                SubstitutionMatrix_shared_char( Sequence_AA[ (s_off + 2)&0x7] * ALPHABET_SIZE + Query_AA[ (q_off + 2)&0x7 ]);

            if( Score >= THRESHOLD ){
                ++Hits;
            }

        } //Query for loop
    } //Sequence for loop

    d_Hits[ Sequence_Index] = Hits;

    // Advance by gridDim.x * blockDim.x. The original text read "blockDimx",
    // which is not a CUDA built-in variable.
    Sequence_Index += (gridDim.x * blockDim.x);

} //while loop

The compiler cannot decide at compile time which element to take with your complex index calculation. So no, there is no way for the compiler to put this in registers.
So either simplify your index calculation at the end so that the compiler can resolve it at compile time, or use shared memory, which is otherwise your best option.