I have two implementations of an algorithm, each consisting of three functions.
First implementation:
// Updates A in place. The thread whose global index is `base` rewrites the
// 16 elements A[base] .. A[base + 15] in ascending order. Each new value is
// an eight-index lookup in TAB (declared outside this block), keyed by seven
// elements taken from the window A[base] .. A[base + 31] and by B[k].
//
//   A - read/write buffer; this thread reads elements up to A[base + 31].
//   B - only B[0] .. B[15] are read.
//
// Every element is stored as soon as it is computed, so later steps observe
// the values written by earlier steps (e.g. step 1 reads the new A[base]).
//
// NOTE(review): `base` advances by 1 per thread, so the windows of
// neighbouring threads overlap, and this kernel contains neither a bounds
// check nor any synchronisation - confirm the launch configuration and the
// size of A make that safe.
__global__ void CalcStars(u8* A, u8* B)
{
    const u32 base = blockIdx.x * blockDim.x + threadIdx.x;

#pragma unroll
    for (u32 k = 0; k < 16; ++k)
    {
        // Offsets 31+k, 26+k and 20+k wrap around inside the 32-element
        // window; 9+k, 1+k, k and 16+k never reach 32 for k < 16.
        const u8 key0 = A[base + ((k + 31) & 31)];
        const u8 key1 = A[base + ((k + 26) & 31)];
        const u8 key2 = A[base + ((k + 20) & 31)];
        const u8 key3 = A[base + (k + 9)];
        const u8 key4 = A[base + (k + 1)];
        const u8 key5 = A[base + k];
        const u8 key6 = A[base + (k + 16)];

        // All seven inputs are read before the element is overwritten.
        A[base + k] = TAB[key0][key1][key2][key3][key4][key5][key6][B[k]];
    }
}
Here the calculation is done with lookups in TAB, an eight-dimensional texture array.
Second implementation:
// Updates A in place using XOR only. For global thread index `tid`, the
// thread writes the 16 elements A[dst] .. A[dst + 15] (dst = tid * 16) in
// ascending order. Each new value is the XOR of seven elements of A and B[k].
//
//   A - read/write buffer; inputs come from the window A[src] .. A[src + 31]
//       (src = tid * 32) and from elements this thread has already written.
//   B - only B[0] .. B[15] are read.
//
// The first three operands switch from the input window to the freshly
// written output range once the corresponding output exists:
//   operand 0: A[src + 31]      for k == 0,  otherwise A[dst + k - 1]
//   operand 1: A[src + 26 + k]  for k <  6,  otherwise A[dst + k - 6]
//   operand 2: A[src + 20 + k]  for k < 12,  otherwise A[dst + k - 12]
//
// NOTE(review): the output range (stride 16 per thread) overlaps the input
// windows (stride 32 per thread) of other threads - e.g. thread 1 writes
// A[16] .. A[31], which thread 0 reads - and the kernel has no bounds check
// or synchronisation. Confirm this is intended.
__global__ void CalcStars(u32* A, u32* B)
{
    const u32 tid = blockIdx.x * blockDim.x + threadIdx.x;
    const u32 dst = tid * 16;   // first element written by this thread
    const u32 src = tid * 32;   // first element of this thread's input window

#pragma unroll
    for (u32 k = 0; k < 16; ++k)
    {
        const u32 op0 = (k >= 1)  ? A[(k - 1)  + dst] : A[31 + src];
        const u32 op1 = (k >= 6)  ? A[(k - 6)  + dst] : A[(26 + k) + src];
        const u32 op2 = (k >= 12) ? A[(k - 12) + dst] : A[(20 + k) + src];

        // All operands are read before the element is overwritten.
        A[k + dst] = op0 ^ op1 ^ op2
                   ^ A[(9 + k) + src]
                   ^ A[(1 + k) + src]
                   ^ A[k + src]
                   ^ A[(16 + k) + src]
                   ^ B[k];
    }
}
In this case I use only simple calculations (XOR operations).
The other two functions of each implementation look very similar.
Execution time of the first implementation in DEBUG mode (three functions): 35 ms, 34 ms, 44 ms
Execution time of the first implementation in RELEASE mode (three functions): 0.00352 ms, 0.00326 ms, 0.00355 ms
Execution time of the second implementation in DEBUG mode (three functions): 3 ms, 5 ms, 12 ms
Execution time of the second implementation in RELEASE mode (three functions): 1 ms, 2 ms, 10 ms
I want to understand why the speed-up from DEBUG to RELEASE mode differs so hugely between the two implementations.