shfl - transfer results to other threads (PTX performance)

Hi, all.
Can someone tell me how to optimize this code?

// ---- 64-bit registers -------------------------------------------------
// %rA<25> declares %rA0..%rA24; the shuffle loop below reads %rA0..%rA7
// and writes %rA12..%rA19.
// %rt<3> declares %rt0..%rt2; the loop leaves its selected pair in
// %rt0/%rt1.
.reg .b64   %rA<25>;
.reg .b64   %rB<1>;
.reg .b64   %rt<3>;
.reg .b64   t, nonce;

// ---- 32-bit registers -------------------------------------------------
// threadId32 : read by the loop to derive the lane group and select index
// counter    : loop iteration counter
// tC         : source lane index passed to shfl
// temp/temp2 : low/high halves of the 64-bit value being shuffled
// rS         : receives the low half of the shuffled %rA0
.reg .b32   %rd0, threadId32, round, counter, n_c, rS, dagpointer;
.reg .b32   tC, temp, temp2, temp3, temp4;
.reg .b32   mem, mem2, mem3, mem4;

// ---- predicate and 16-bit registers -----------------------------------
.reg .pred  p;
.reg .b16   threadIdx, threadDim, blockIdx, blockDim;

//---------------
//let's skip the unnecessary code..
//---------------

// The function below transfers %rA0-%rA7 between the threads (lanes) of a warp.

        // Warp-level redistribution loop (8 iterations, counter = 0..7).
        //
        // Each iteration, every lane fetches %rA0..%rA7 from source lane
        //     tC = ((threadId32 & 0x1f) >> 3) + counter * 4
        // into %rA12..%rA19, then keeps one 128-bit pair in %rt0/%rt1,
        // chosen by (threadId32 & 3):
        //     0 -> %rA12/%rA13   1 -> %rA14/%rA15
        //     2 -> %rA16/%rA17   3 -> %rA18/%rA19
        //
        // Clobbers: counter, tC, temp, temp2, rS, p, %rA12..%rA19, %rt0, %rt1.
        //
        // shfl.sync operand order is: d, a, b (source lane), c (clamp /
        // segment mask), membermask.  For .idx mode over the whole warp,
        // c must be 0x1f and membermask must name every participating lane.
        // NOTE(review): 0xffffffff assumes all 32 lanes of the warp reach
        // this loop converged -- confirm no lane has exited or diverged.
        //
        // NOTE(review): the lane-group part of tC and the (threadId32 & 3)
        // predicates are loop-invariant; they could be hoisted if spare
        // registers are available and the elided code does not clobber them.

        mov.u32 counter, 0;             // start at 0 without reading an uninitialised register
	$LLBtemp1:

	// tC = source lane for this iteration
	and.b32 tC,threadId32,0x1f;     // lane id within the warp
	shr.b32 tC,tC,3;                // lane group 0..3 (8 lanes per group)
	shl.b32 temp,counter,2;         // counter * 4
	add.u32 tC,tC,temp;             // 0..31

	// %rA12 <- %rA0 of lane tC (64-bit value moved as two 32-bit halves)
	mov.b64 {temp,temp2},%rA0;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA12,{temp,temp2};

	mov.b32 rS,temp;                // rS = low 32 bits of the shuffled %rA0

	// %rA13 <- %rA1 of lane tC
	mov.b64 {temp,temp2},%rA1;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA13,{temp,temp2};

	// %rA14 <- %rA2 of lane tC
	mov.b64 {temp,temp2},%rA2;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA14,{temp,temp2};

	// %rA15 <- %rA3 of lane tC
	mov.b64 {temp,temp2},%rA3;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA15,{temp,temp2};

	// %rA16 <- %rA4 of lane tC
	mov.b64 {temp,temp2},%rA4;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA16,{temp,temp2};

	// %rA17 <- %rA5 of lane tC
	mov.b64 {temp,temp2},%rA5;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA17,{temp,temp2};

	// %rA18 <- %rA6 of lane tC
	mov.b64 {temp,temp2},%rA6;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA18,{temp,temp2};

	// %rA19 <- %rA7 of lane tC
	mov.b64 {temp,temp2},%rA7;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA19,{temp,temp2};

//--------------------------------

	// Keep the pair selected by (threadId32 & 3); default is pair 0.
	and.b32 temp,threadId32,0x03;
	mov.b64 %rt0,%rA12;
	mov.b64 %rt1,%rA13;
	setp.eq.u32 p,temp,1;
	selp.u64 	%rt0,%rA14,%rt0,p;
	selp.u64 	%rt1,%rA15,%rt1,p;
	setp.eq.u32 p,temp,2;
	selp.u64 	%rt0,%rA16,%rt0,p;
	selp.u64 	%rt1,%rA17,%rt1,p;
	setp.eq.u32 p,temp,3;
	selp.u64 	%rt0,%rA18,%rt0,p;
	selp.u64 	%rt1,%rA19,%rt1,p;

//----------------------------------
// Some functions that use this result...
//----------------------------------

	// Next group of four source lanes; stop after 8 iterations.
	add.u32     counter,counter,1;
	setp.lo.u32 p,counter,8;        // unsigned counter < 8
	@p bra.uni $LLBtemp1;

So, this function needs to transfer each thread's result to other threads in the warp.
In summary:
when counter = 0
thread 0 will get %rA0,%rA1 from thread 0
thread 1 will get %rA2,%rA3 from thread 0
thread 2 will get %rA4,%rA5 from thread 0
thread 3 will get %rA6,%rA7 from thread 0

thread 4 will get %rA0,%rA1 from thread 0
thread 5 will get %rA2,%rA3 from thread 0
thread 6 will get %rA4,%rA5 from thread 0
thread 7 will get %rA6,%rA7 from thread 0

thread 8 will get %rA0,%rA1 from thread 1
thread 9 will get %rA2,%rA3 from thread 1
thread 10 will get %rA4,%rA5 from thread 1
thread 11 will get %rA6,%rA7 from thread 1

thread 12 will get %rA0,%rA1 from thread 1
thread 13 will get %rA2,%rA3 from thread 1
thread 14 will get %rA4,%rA5 from thread 1
thread 15 will get %rA6,%rA7 from thread 1

thread 16 will get %rA0,%rA1 from thread 2
thread 17 will get %rA2,%rA3 from thread 2
thread 18 will get %rA4,%rA5 from thread 2
thread 19 will get %rA6,%rA7 from thread 2

thread 20 will get %rA0,%rA1 from thread 2
thread 21 will get %rA2,%rA3 from thread 2
thread 22 will get %rA4,%rA5 from thread 2
thread 23 will get %rA6,%rA7 from thread 2

thread 24 will get %rA0,%rA1 from thread 3
thread 25 will get %rA2,%rA3 from thread 3
thread 26 will get %rA4,%rA5 from thread 3
thread 27 will get %rA6,%rA7 from thread 3

thread 28 will get %rA0,%rA1 from thread 3
thread 29 will get %rA2,%rA3 from thread 3
thread 30 will get %rA4,%rA5 from thread 3
thread 31 will get %rA6,%rA7 from thread 3

In the next counter iteration, the threads get results in the same way from threads 4, 5, 6, 7, and so on.
Any ideas on how to optimize this code? Thanks!