auto unrolled loops and shared registers

Hi all,
Recently I've been experimenting with automatic loop unrolling, like this:

#define UNROLL 8
                float V=A[tid];
                for(j=0;j<C/UNROLL;j++){
			for(i=0;i<UNROLL;i++){
				V=V*0.1f+0.9f;
			}
		}

I see that the inner loop is unrolled as follows:

mov.f32         %f3, 0f3f666666;        // 0.9
        mov.f32         %f4, 0f3dcccccd;        // 0.1
        mov.f32         %f5, 0f3f666666;        // 0.9
        mov.f32         %f6, 0f3dcccccd;        // 0.1
        mov.f32         %f7, 0f3f666666;        // 0.9
        mov.f32         %f8, 0f3dcccccd;        // 0.1
        mov.f32         %f9, 0f3f666666;        // 0.9
        mov.f32         %f10, 0f3dcccccd;       // 0.1
        mov.f32         %f11, 0f3f666666;       // 0.9
        mov.f32         %f12, 0f3dcccccd;       // 0.1
        mov.f32         %f13, 0f3f666666;       // 0.9
        mov.f32         %f14, 0f3dcccccd;       // 0.1
        mov.f32         %f15, 0f3f666666;       // 0.9
        mov.f32         %f16, 0f3dcccccd;       // 0.1
        mov.f32         %f17, 0f3f666666;       // 0.9
        mov.f32         %f18, 0f3dcccccd;       // 0.1
        mad.f32         %f19, %f18, %f2, %f17;
        mad.f32         %f20, %f16, %f19, %f15;
        mad.f32         %f21, %f14, %f20, %f13;
        mad.f32         %f22, %f12, %f21, %f11;
        mad.f32         %f23, %f10, %f22, %f9;
        mad.f32         %f24, %f8, %f23, %f7;
        mad.f32         %f25, %f6, %f24, %f5;
        mad.f32         %f2, %f4, %f25, %f3;

and the run time was 0.15 arb. units. Then I decided to pass 0.1 via the argument dt in the kernel call:

#define UNROLL 8
                float V=A[tid];
                for(j=0;j<C/UNROLL;j++){
			for(i=0;i<UNROLL;i++){
				V=V*dt+0.9f;
			}
		}

The loop was unrolled, but in a different way:

mad.f32         %f5, %f3, %f2, %f4;
        mov.f32         %f6, 0f3f666666;        // 0.9
        mad.f32         %f7, %f3, %f5, %f6;
        mov.f32         %f8, 0f3f666666;        // 0.9
        mad.f32         %f9, %f3, %f7, %f8;
        mov.f32         %f10, 0f3f666666;       // 0.9
        mad.f32         %f11, %f3, %f9, %f10;
        mov.f32         %f12, 0f3f666666;       // 0.9
        mad.f32         %f13, %f3, %f11, %f12;
        mov.f32         %f14, 0f3f666666;       // 0.9
        mad.f32         %f15, %f3, %f13, %f14;
        mov.f32         %f16, 0f3f666666;       // 0.9
        mad.f32         %f17, %f3, %f15, %f16;
        mov.f32         %f18, 0f3f666666;       // 0.9
        mad.f32         %f2, %f3, %f17, %f18;

Performance dropped: the run time rose to 0.18 arb. units! The only difference is the common register %f3, which I suppose is the bottleneck for the CUDA core pipeline. So I decided to write something like:

#define UNROLL 8
		register FL_DBL a=dt;
		register FL_DBL aa[16];
		aa[0]=0.0f;
		for(i=0;i<UNROLL;i++) aa[i]+=a;
		for(j=0;j<C*D/UNROLL;j++){
			for(i=0;i<UNROLL;i++){
				V=V*aa[i]+0.9f;
			}
		}

It is different code, but the operations are the same. In the PTX output I see:

mov.f32         %f35, 0f3f666666;       // 0.9
        mad.f32         %f36, %f34, %f2, %f35;
        mov.f32         %f37, 0f3f666666;       // 0.9
        mad.f32         %f38, %f33, %f36, %f37;
        mov.f32         %f39, 0f3f666666;       // 0.9
        mad.f32         %f40, %f32, %f38, %f39;
        mov.f32         %f41, 0f3f666666;       // 0.9
        mad.f32         %f42, %f31, %f40, %f41;
        mov.f32         %f43, 0f3f666666;       // 0.9
        mad.f32         %f44, %f30, %f42, %f43;
        mov.f32         %f45, 0f3f666666;       // 0.9
        mad.f32         %f46, %f29, %f44, %f45;
        mov.f32         %f47, 0f3f666666;       // 0.9
        mad.f32         %f48, %f28, %f46, %f47;
        mov.f32         %f49, 0f3f666666;       // 0.9
        mad.f32         %f2, %f27, %f48, %f49;

No shared register! And performance improved again: the run time went back down to 0.16 arb. units!
With UNROLL = 32 the difference in performance is even greater! I think this is a problem in nvcc: for the second code it uses only one register, while for the first code it uses UNROLL registers for a single value. My question is: how can I make nvcc use UNROLL registers for the dt parameter?