# define coeff 512
__device__
void device_function( unsigned int Array[8][8], signed short int a0, signed short int a1, signed short int a2, signed short int a3, signed short int a4,
signed short int a5, signed short int a6, signed short int a7, int* Out_array )
{
Out_array [0] = (__mul24(a0 , Array[0][0]) + __mul24(a1 , Array[0][1]) + __mul24(a2 , Array[0][2]) + __mul24(a3 , Array[0][3]) +
__mul24(a4 , Array[0][4]) + __mul24(a5 , Array[0][5]) + __mul24(a6 , Array[0][6]) + __mul24(a7 , Array[0][7])+ coeff);
Out_array [1] = (__mul24(a0 , Array[1][0]) + __mul24(a1 , Array[1][1]) + __mul24(a2 , Array[1][2] )+ __mul24(a3 , Array[1][3] )+
__mul24(a4 , Array[1][4]) + __mul24(a5 , Array[1][5]) + __mul24(a6 , Array[1][6] )+ __mul24(a7 , Array[1][7])+ coeff);
Out_array [2] = (__mul24(a0 , Array[2][0]) + __mul24(a1 , Array[2][1]) + __mul24(a2 , Array[2][2]) + __mul24(a3 , Array[2][3]) +
__mul24(a4 , Array[2][4]) + __mul24(a5 , Array[2][5]) + __mul24(a6 , Array[2][6]) + __mul24(a7 , Array[2][7])+ coeff);
Out_array 3] = (__mul24(a0 , Array[3][0]) + __mul24(a1 , Array[3][1]) + __mul24(a2 , Array[3][2]) + __mul24(a3 , Array[3][3]) +
__mul24(a4 , Array[3][4]) + __mul24(a5 , Array[3][5]) + __mul24(a6 , Array[3][6]) + __mul24(a7 , Array[3][7])+ coeff);
Out_array [4] = (__mul24(a0 , Array[4][0]) + __mul24(a1 , Array[4][1]) + __mul24(a2 , Array[4][2]) + __mul24(a3 , Array[4][3]) +
__mul24(a4 , Array[4][4]) + __mul24(a5 , Array[4][5]) + __mul24(a6 , Array[4][6]) + __mul24(a7 , Array[4][7]) + coeff);
Out_array [5] = (__mul24(a0 , Array[5][0]) + __mul24(a1 , Array[5][1]) + __mul24(a2 , Array[5][2]) + __mul24(a3 , Array[5][3]) +
__mul24(a4 , Array[5][4]) + __mul24(a5 , Array[5][5]) + __mul24(a6 , Array[5][6]) + __mul24(a7 , Array[5][7])+ coeff);
Out_array [6] = (__mul24(a0 , Array[6][0]) + __mul24(a1 , Array[6][1]) + __mul24(a2 , Array[6][2]) + __mul24(a3 , Array[6][3]) +
__mul24(a4 , Array[6][4]) + __mul24(a5 , Array[6][5]) + __mul24(a6 , Array[6][6]) + __mul24(a7 , Array[6][7])+ coeff);
Out_array [7] = (__mul24(a0 , Array[7][0]) + __mul24(a1 , Array[7][1]) + __mul24(a2 , Array[7][2]) + __mul24(a3 , Array[7][3]) +
__mul24(a4 , Array[7][4] )+ __mul24(a5 , Array[7][5]) + __mul24(a6 , Array[7][6] )+ __mul24(a7 , Array[7][7])+ coeff);
}
I am just bewildered at this code.
Why can’t you just put it in some kind of FOR loop with a neat expression???
Readability is the single most important and desired property of a programmer.
Also, If you have shortage of shared memory because of addition of smem of device functions, declare a global shared memory in your kernel and pass the pointer to it as an argument to your device function…
The compiler would (should) be smart enough to inline it corerctly (although there are some compiler quirks – I dont want confuse u now. You will know when u hit the advisory warning)