Can anyone help, please? Why does this kernel use 40 registers? Is it possible to optimize it?
// Test kernel: each thread walks down one column `x` of a Width x Height
// array of complex samples. On every row it stores the new sample into a
// 16-entry rolling buffer, runs FFT16 over the buffer in place, and writes
// element 0 of the result to `out` at the same (x, y) position.
//
// Launch layout: only blockIdx.x / blockDim.x / threadIdx.x are used, so a
// 1-D grid covering at least `Width` threads is assumed.
//
// signal : input samples, indexed as signal[y * Width + x]
// out    : output samples, same indexing as `signal`
// Width  : number of columns (row stride, in elements)
// Height : number of rows each thread iterates over
extern "C" __global__ void All(float2 *signal, float2 *out, int Width, int Height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid rarely divides Width evenly; surplus threads must not touch memory.
    if (x >= Width) return;

    // Zero the buffer so FFT16 never reads indeterminate values during the
    // first 15 iterations, before every slot has received a sample.
    float2 a[16];
    for (int i = 0; i < 16; i++)
    {
        a[i] = make_float2(0.0f, 0.0f);
    }

    int level = 0;
    for (int y = 0; y < Height; y++)
    {
        // Widen before multiplying so large images do not overflow int.
        size_t idx = (size_t)y * Width + x;

        a[level] = signal[idx];
        level++;
        if (level == 16) level = 0;  // wrap the rolling write position

        FFT16(a);
        out[idx] = a[0];
    }
}
FFT16 and other stuff:
// [codebox]
// Constants and twiddle factors used by the FFT codelets below.
// A twiddle named exp_k_n expands to the complex value exp(-2*pi*i*k/n),
// stored as float2(real, imag).

// <math.h> may already provide a double-precision M_SQRT1_2; replace it with
// a single-precision constant so the kernels stay in float arithmetic.
#ifdef M_SQRT1_2
#undef M_SQRT1_2
#endif
#define M_SQRT1_2 0.70710678118654752440f // 1/sqrt(2)
#define COS_PI_8 0.923879533f
#define SIN_PI_8 0.382683432f
#define exp_1_16 make_float2( COS_PI_8, -SIN_PI_8 )
#define exp_3_16 make_float2( SIN_PI_8, -COS_PI_8 )
#define exp_5_16 make_float2( -SIN_PI_8, -COS_PI_8 )
#define exp_7_16 make_float2( -COS_PI_8, -SIN_PI_8 )
#define exp_9_16 make_float2( -COS_PI_8, SIN_PI_8 )
#define exp_1_8 make_float2( 1.0f, -1.0f ) // requires post-multiply by 1/sqrt(2)
#define exp_1_4 make_float2( 0.0f, -1.0f )
#define exp_3_8 make_float2( -1.0f, -1.0f ) // requires post-multiply by 1/sqrt(2)
// Complex multiplication: (a.x + i*a.y) * (b.x + i*b.y).
// Marked __host__ __device__ so it is callable from the device FFT codelets.
inline __host__ __device__ float2 operator*( float2 a, float2 b ) { return make_float2( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x ); }
// Scales both components of the complex value `a` by the real scalar `b`.
// Marked __host__ __device__ so it is callable from the device FFT codelets.
inline __host__ __device__ float2 operator*( float2 a, float b ) { return make_float2( b*a.x, b*a.y ); }
// Component-wise (complex) addition.
// Marked __host__ __device__ so it is callable from the device FFT codelets.
inline __host__ __device__ float2 operator+( float2 a, float2 b ) { return make_float2( a.x + b.x, a.y + b.y ); }
// Component-wise (complex) subtraction.
// Marked __host__ __device__ so it is callable from the device FFT codelets.
inline __host__ __device__ float2 operator-( float2 a, float2 b ) { return make_float2( a.x - b.x, a.y - b.y ); }
// Radix-2 butterfly, in place: on return a0 holds (a0 + a1) and a1 holds
// (a0 - a1), both computed from the values passed in.
inline __device__ void FFT2( float2 &a0, float2 &a1 )
{
    float2 c0 = a0;  // keep the original a0; it is overwritten on the next line
    a0 = c0 + a1;
    a1 = c0 - a1;
}
// 4-point FFT, in place, built from two stages of radix-2 butterflies with a
// multiplication by exp_1_4 (i.e. -i) between them.
// For inputs x0..x3 passed as a0..a3, the outputs land in bit-reversed
// order: a0 = X0, a1 = X2, a2 = X1, a3 = X3.
inline __device__ void FFT4( float2 &a0, float2 &a1, float2 &a2, float2 &a3 )
{
    // Stage 1: butterflies across a stride of two.
    FFT2( a0, a2 );
    FFT2( a1, a3 );

    // Twiddle for the odd branch.
    a3 = a3 * exp_1_4;

    // Stage 2: butterflies on adjacent pairs.
    FFT2( a0, a1 );
    FFT2( a2, a3 );
}
// 8-point FFT over a[0..7], in place: one radix-2 stage across a stride of
// four, twiddle multiplications on the upper half, then a 4-point FFT on
// each half.
// NOTE(review): output ordering is presumably bit-reversed, as in FFT4 - confirm.
inline __device__ void FFT8( float2 *a )
{
    // Radix-2 stage pairing element k with element k + 4.
    FFT2( a[0], a[4] );
    FFT2( a[1], a[5] );
    FFT2( a[2], a[6] );
    FFT2( a[3], a[7] );

    // Twiddles for the upper half; exp_1_8 / exp_3_8 are unnormalised and
    // need the extra 1/sqrt(2) factor.
    a[5] = ( a[5] * exp_1_8 ) * M_SQRT1_2;
    a[6] = a[6] * exp_1_4;
    a[7] = ( a[7] * exp_3_8 ) * M_SQRT1_2;

    // Independent 4-point transforms on each half.
    FFT4( a[0], a[1], a[2], a[3] );
    FFT4( a[4], a[5], a[6], a[7] );
}
// 16-point FFT over a[0..15], in place, as radix-4 x radix-4:
// four 4-point FFTs across a stride of four, twiddle multiplications, then
// four 4-point FFTs on contiguous groups of four.
// NOTE(review): output ordering is presumably digit-reversed - confirm before
// relying on the position of any particular frequency bin.
inline __device__ void FFT16( float2 *a )
{
    // First radix-4 stage: elements k, k+4, k+8, k+12.
    FFT4( a[0], a[4], a[8], a[12] );
    FFT4( a[1], a[5], a[9], a[13] );
    FFT4( a[2], a[6], a[10], a[14] );
    FFT4( a[3], a[7], a[11], a[15] );

    // Twiddle factors between the stages. a[0..4], a[8] and a[12] are left
    // unchanged. exp_1_8 / exp_3_8 are unnormalised and need the extra
    // 1/sqrt(2) factor.
    a[5] = (a[5] * exp_1_8 ) * M_SQRT1_2;
    a[6] = a[6] * exp_1_4;
    a[7] = (a[7] * exp_3_8 ) * M_SQRT1_2;
    a[9] = a[9] * exp_1_16;
    a[10] = (a[10] * exp_1_8 ) * M_SQRT1_2;
    a[11] = a[11] * exp_3_16;
    a[13] = a[13] * exp_3_16;
    a[14] = (a[14] * exp_3_8 ) * M_SQRT1_2;
    a[15] = a[15] * exp_9_16;

    // Second radix-4 stage on contiguous groups.
    FFT4( a[0], a[1], a[2], a[3] );
    FFT4( a[4], a[5], a[6], a[7] );
    FFT4( a[8], a[9], a[10], a[11] );
    FFT4( a[12], a[13], a[14], a[15] );
}
If I try this FFT16 instead, it takes only 9 registers.
// Reduced FFT16 variant posted for comparison: it applies only the twiddle
// multiplications and performs no butterflies, so it is NOT a Fourier
// transform. It is an alternative definition and cannot coexist with the
// full FFT16 in the same translation unit.
// It never writes a[0], which is the only element the `All` kernel stores.
// NOTE(review): the compiler can therefore probably discard most of this
// work, which would explain the much lower register count - confirm by
// inspecting the generated PTX/SASS.
inline __device__ void FFT16( float2 *a )
{
    a[5] = (a[5] * exp_1_8 ) * M_SQRT1_2;
    a[6] = a[6] * exp_1_4;
    a[7] = (a[7] * exp_3_8 ) * M_SQRT1_2;
    a[9] = a[9] * exp_1_16;
    a[10] = (a[10] * exp_1_8 ) * M_SQRT1_2;
    a[11] = a[11] * exp_3_16;
    a[13] = a[13] * exp_3_16;
    a[14] = (a[14] * exp_3_8 ) * M_SQRT1_2;
    a[15] = a[15] * exp_9_16;
}
// [/codebox]