Speedup when switching from Debug to Release mode

I have two implementations of an algorithm, each consisting of three functions:

First implementation:

// Table-lookup variant: rewrites the 16 bytes A[base .. base+15], where base
// is the calling thread's global index, one byte at a time.
//
// For k = 0..15 the new A[base+k] is the entry of the 8-dimensional table TAB
// (declared outside this block) selected by seven bytes of A, read at offsets
// (k+31)%32, (k+26)%32, (k+20)%32, k+9, k+1, k and k+16 from base, followed by
// B[k] as the last subscript.
//
// The iterations must run in ascending k: the wrapping taps read bytes that
// earlier iterations have already overwritten (e.g. iteration 1 reads the
// A[base+0] produced by iteration 0).
//
// NOTE(review): base advances by 1 per thread while each thread touches a
// 32-byte window, so neighbouring threads read and write overlapping bytes
// with no synchronization visible here - confirm this is intended.
__global__ void CalcStars(u8* A, u8* B)
{
	const u32 base = blockIdx.x * blockDim.x + threadIdx.x;

	#pragma unroll
	for (u32 k = 0; k < 16; ++k)
	{
		// These three taps wrap around inside the 32-byte window.
		const u32 tapA = base + ((k + 31) % 32);
		const u32 tapB = base + ((k + 26) % 32);
		const u32 tapC = base + ((k + 20) % 32);

		// These four taps never leave the window for k < 16.
		const u32 tapD = base + k + 9;
		const u32 tapE = base + k + 1;
		const u32 tapF = base + k;
		const u32 tapG = base + k + 16;

		A[base + k] = TAB[ A[tapA] ][ A[tapB] ][ A[tapC] ][ A[tapD] ][ A[tapE] ][ A[tapF] ][ A[tapG] ][ B[k] ];
	}
}

Here the calculation is done with lookups in the eight-dimensional texture array TAB.

Second implementation:

// XOR variant: each thread reads from the 32-word region starting at
// A[tid * 32] and writes the 16 words starting at A[tid * 16], where tid is
// the thread's global index.
//
// For k = 0..15 the new A[k + outBase] is the XOR of seven words of A and
// B[k]. Four taps (k+9, k+1, k, k+16) always come from the input region.
// The other three (k+31, k+26, k+20) come from the input region while the
// offset is below 32; once it wraps, they read offset-32 from the OUTPUT
// region, i.e. a word written by an earlier iteration. Iterations therefore
// have to run in ascending k.
//
// NOTE(review): the output region of one thread overlaps the input region of
// another (e.g. thread 1 writes A[16..31], which thread 0 reads), and no
// synchronization is visible here - confirm this is intended.
__global__ void CalcStars(u32* A, u32* B)
{
	const u32 tid     = blockIdx.x * blockDim.x + threadIdx.x;
	const u32 outBase = tid * 16;
	const u32 inBase  = tid * 32;

	#pragma unroll
	for (u32 k = 0; k < 16; ++k)
	{
		const u32 off31 = k + 31;
		const u32 off26 = k + 26;
		const u32 off20 = k + 20;

		// Wrapping taps: input region until the offset reaches 32, then the
		// already-written output region.
		const u32 w31 = (off31 < 32) ? A[off31 + inBase] : A[(off31 - 32) + outBase];
		const u32 w26 = (off26 < 32) ? A[off26 + inBase] : A[(off26 - 32) + outBase];
		const u32 w20 = (off20 < 32) ? A[off20 + inBase] : A[(off20 - 32) + outBase];

		// Non-wrapping taps, always taken from the input region.
		const u32 w9  = A[(k +  9) + inBase];
		const u32 w1  = A[(k +  1) + inBase];
		const u32 w0  = A[ k       + inBase];
		const u32 w16 = A[(k + 16) + inBase];

		A[k + outBase] = w31 ^ w26 ^ w20 ^ w9 ^ w1 ^ w0 ^ w16 ^ B[k];
	}
}

In this case I do the calculation directly with simple XOR operations.

The other two functions look very similar.

Execution time of the first implementation in DEBUG mode (3 functions): 35 ms, 34 ms, 44 ms
Execution time of the first implementation in RELEASE mode (3 functions): 0.00352 ms, 0.00326 ms, 0.00355 ms

Execution time of the second implementation in DEBUG mode (3 functions): 3 ms, 5 ms, 12 ms
Execution time of the second implementation in RELEASE mode (3 functions): 1 ms, 2 ms, 10 ms

I want to understand why the speedup from Debug to Release differs so hugely between the two implementations.

Dump the SASS code with cuobjdump for both implementations. My guess is that implementation 2 generates a lot more instructions than implementation 1… Others may comment further or offer more specifics.

It’s pretty obvious, I think… Example 2 uses indexes that are further apart, which can lead to worse memory access patterns, and it also uses 32-bit elements instead of 8-bit elements, which worsens memory access, cache hit rates, etc. That would be my guess, at least.