Why are so many registers used?

Can anyone help, please? Why does this kernel use 40 registers? Is it possible to optimize it?

extern “C” global void All(float2 *signal, float2 *out, int Width, int Height)

{

int x = blockIdx.x * blockDim.x + threadIdx.x;



float2 a[16];



int level = 0;

for (int y = 0; y < Height; y++)

{

	a[level] = signal[y * Width + x];

	level++;

	if (level == 16) level = 0;

	FFT16(a);

	out[y * Width + x] = a[0];

}

}

FFT16 and other stuff:

[codebox]#define M_SQRT1_2 0.70710678118654752440f // 1/sqrt(2); the earlier value 0.456743f was not 1/sqrt(2)

// cos(pi/8) and sin(pi/8), the building blocks of the 16-point twiddles.
#define COS_PI_8 0.923879533f

#define SIN_PI_8 0.382683432f

// Twiddle factors exp(-2*pi*i * k/N), named exp_k_N and stored as (re, im).
#define exp_1_16 make_float2( COS_PI_8, -SIN_PI_8 )

#define exp_3_16 make_float2( SIN_PI_8, -COS_PI_8 )

#define exp_5_16 make_float2( -SIN_PI_8, -COS_PI_8 )

#define exp_7_16 make_float2( -COS_PI_8, -SIN_PI_8 )

#define exp_9_16 make_float2( -COS_PI_8, SIN_PI_8 )

// The 1/8 and 3/8 factors are left unnormalised so the product is cheap;
// the caller multiplies the result by M_SQRT1_2 afterwards.
#define exp_1_8 make_float2( 1, -1 )//requires post-multiply by 1/sqrt(2)

#define exp_1_4 make_float2( 0, -1 )

#define exp_3_8 make_float2( -1, -1 )//requires post-multiply by 1/sqrt(2)

// Complex product, with .x as the real part and .y as the imaginary part:
// (a.x + i*a.y) * (b.x + i*b.y).
inline float2 operator*( float2 a, float2 b ) { return make_float2( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x ); }

// Scales both components of a by the scalar b.
inline float2 operator*( float2 a, float b ) { return make_float2( b*a.x, b*a.y ); }

// Component-wise sum of two float2 values.
inline float2 operator+( float2 a, float2 b )
{
    const float sum_x = a.x + b.x;
    const float sum_y = a.y + b.y;
    return make_float2( sum_x, sum_y );
}

// Component-wise difference (a minus b) of two float2 values.
inline float2 operator-( float2 a, float2 b )
{
    const float diff_x = a.x - b.x;
    const float diff_y = a.y - b.y;
    return make_float2( diff_x, diff_y );
}

inline device void FFT2( float2 &a0, float2 &a1 )

{

float2 c0 = a0;

a0 = c0 + a1; 

a1 = c0 - a1;

}

inline device void FFT4( float2 &a0, float2 &a1, float2 &a2, float2 &a3 )

{

FFT2( a0, a2 );

FFT2( a1, a3 );

a3 = a3 * exp_1_4;

FFT2( a0, a1 );

FFT2( a2, a3 );

}

inline device void FFT8( float2 *a )

{

FFT2( a[0], a[4] );

FFT2( a[1], a[5] );

FFT2( a[2], a[6] );

FFT2( a[3], a[7] );

a[5] = ( a[5] * exp_1_8 ) * M_SQRT1_2;

a[6] =   a[6] * exp_1_4;

a[7] = ( a[7] * exp_3_8 ) * M_SQRT1_2;

FFT4( a[0], a[1], a[2], a[3] );

FFT4( a[4], a[5], a[6], a[7] );

}

inline device void FFT16( float2 *a )

{

FFT4( a[0], a[4], a[8], a[12] );

FFT4( a[1], a[5], a[9], a[13] );

FFT4( a[2], a[6], a[10], a[14] );

FFT4( a[3], a[7], a[11], a[15] );

a[5] = (a[5] * exp_1_8 ) * M_SQRT1_2;

a[6]  =  a[6]  * exp_1_4;

a[7]  = (a[7]  * exp_3_8 ) * M_SQRT1_2;

a[9]  =  a[9]  * exp_1_16;

a[10] = (a[10] * exp_1_8 ) * M_SQRT1_2;

a[11] =  a[11] * exp_3_16;

a[13] =  a[13] * exp_3_16;

a[14] = (a[14] * exp_3_8 ) * M_SQRT1_2;

a[15] =  a[15] * exp_9_16;

FFT4( a[0], a[1], a[2], a[3] );

FFT4( a[4],  a[5],  a[6],  a[7] );

FFT4( a[8],  a[9],  a[10], a[11] );

FFT4( a[12], a[13], a[14], a[15] );

}

If I try this FFT16 instead, it takes only 9 registers.

inline device void FFT16( float2 *a )

{

a[5]  = (a[5]  * exp_1_8 ) * M_SQRT1_2;

a[6]  =  a[6]  * exp_1_4;

a[7]  = (a[7]  * exp_3_8 ) * M_SQRT1_2;

a[9]  =  a[9]  * exp_1_16;

a[10] = (a[10] * exp_1_8 ) * M_SQRT1_2;

a[11] =  a[11] * exp_3_16;

a[13] =  a[13] * exp_3_16;

a[14] = (a[14] * exp_3_8 ) * M_SQRT1_2;

a[15] =  a[15] * exp_9_16;

}[/codebox]

All those nested device function calls are expanded inline inside your kernel by the compiler when you call the FFT16 function. That is probably where all the registers are going.

As a tip, try using code boxes for posting code snippets. It is very hard to follow text and code intermingled like that.