Hi, all.
Can someone tell me how to optimize this code?
// 64-bit registers. The visible fragment reads %rA0..%rA7, writes
// %rA12..%rA19 and %rt0/%rt1; the remaining names are not referenced here.
.reg .b64  t, %rA<25>, %rB<1>, %rt<3>;
.reg .b64  nonce;
// 32-bit registers. The visible fragment uses threadId32, counter, rS, tC,
// temp and temp2; the remaining names are not referenced here.
.reg .b32  %rd0, threadId32, round, counter, n_c, rS, dagpointer;
.reg .b32  tC, temp, temp2, temp3, temp4, mem, mem2, mem3, mem4;
// Predicate shared by the pair selection and the loop branch.
.reg .pred p;
// 16-bit registers; not referenced in the visible fragment.
.reg .b16  threadIdx, threadDim, blockIdx, blockDim;
//---------------
//let's skip the unnecessary code..
//---------------
// Redistribute %rA0..%rA7 from individual lanes to the rest of the warp.
//
// The loop runs 8 passes (counter = 0..7). In each pass every lane reads from
// source lane
//     tC = ((threadId32 & 0x1f) >> 3) + 4 * counter
// so a pass draws from four consecutive source lanes. The eight 64-bit values
// of the source lane land in %rA12..%rA19, and (threadId32 & 3) then picks one
// adjacent pair of them into %rt0/%rt1:
//     0 -> %rA12,%rA13   1 -> %rA14,%rA15   2 -> %rA16,%rA17   3 -> %rA18,%rA19
//
// shfl.sync operands are (dst, src, srcLane, clamp, membermask):
//   * clamp 0x1f        - no sub-warp segmentation, any lane 0..31 is valid.
//   * mask  0xffffffff  - every lane takes part. The executing lane must be in
//                         the membermask, so the lane index must not be passed
//                         there. Assumes all 32 lanes of the warp run this
//                         loop together - TODO confirm against the caller.
//
// Each source lane has to publish all sixteen 32-bit halves of %rA0..%rA7 in
// a pass, and one shfl publishes one 32-bit value per lane, so the sixteen
// shuffles per pass cannot be reduced without changing this per-pass layout.
    mov.u32         counter, 0;
$LLBtemp1:
    // tC = source lane for this pass.
    and.b32         tC, threadId32, 0x1f;
    shr.b32         tC, tC, 3;
    shl.b32         temp, counter, 2;
    add.u32         tC, tC, temp;

    // %rA12 = %rA0 of lane tC; shuffled as two 32-bit halves (temp = low).
    mov.b64         {temp, temp2}, %rA0;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA12, {temp, temp2};
    mov.b32         rS, temp;               // rS = low half of the received %rA0

    // %rA13 = %rA1 of lane tC
    mov.b64         {temp, temp2}, %rA1;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA13, {temp, temp2};

    // %rA14 = %rA2 of lane tC
    mov.b64         {temp, temp2}, %rA2;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA14, {temp, temp2};

    // %rA15 = %rA3 of lane tC
    mov.b64         {temp, temp2}, %rA3;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA15, {temp, temp2};

    // %rA16 = %rA4 of lane tC
    mov.b64         {temp, temp2}, %rA4;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA16, {temp, temp2};

    // %rA17 = %rA5 of lane tC
    mov.b64         {temp, temp2}, %rA5;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA17, {temp, temp2};

    // %rA18 = %rA6 of lane tC
    mov.b64         {temp, temp2}, %rA6;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA18, {temp, temp2};

    // %rA19 = %rA7 of lane tC
    mov.b64         {temp, temp2}, %rA7;
    shfl.sync.idx.b32 temp,  temp,  tC, 0x1f, 0xffffffff;
    shfl.sync.idx.b32 temp2, temp2, tC, 0x1f, 0xffffffff;
    mov.b64         %rA19, {temp, temp2};
//--------------------------------
    // Select this lane's pair: start with pair 0 and overwrite it when
    // (threadId32 & 3) is 1, 2 or 3. Branch-free, so the warp stays converged.
    and.b32         temp, threadId32, 0x03;
    mov.b64         %rt0, %rA12;
    mov.b64         %rt1, %rA13;
    setp.eq.u32     p, temp, 1;
    selp.u64        %rt0, %rA14, %rt0, p;
    selp.u64        %rt1, %rA15, %rt1, p;
    setp.eq.u32     p, temp, 2;
    selp.u64        %rt0, %rA16, %rt0, p;
    selp.u64        %rt1, %rA17, %rt1, p;
    setp.eq.u32     p, temp, 3;
    selp.u64        %rt0, %rA18, %rt0, p;
    selp.u64        %rt1, %rA19, %rt1, p;
//----------------------------------
// Some functions that use this result...
//----------------------------------
    // Next pass: advance to the following group of four source lanes.
    add.u32         counter, counter, 1;
    setp.lo.u32     p, counter, 8;          // unsigned counter < 8
@p  bra.uni         $LLBtemp1;
So, this function needs to transfer each thread's results to the other threads in the warp.
In total…
when counter = 0
thread 0 will get %rA0,%rA1 from thread 0
thread 1 will get %rA2,%rA3 from thread 0
thread 2 will get %rA4,%rA5 from thread 0
thread 3 will get %rA6,%rA7 from thread 0
thread 4 will get %rA0,%rA1 from thread 0
thread 5 will get %rA2,%rA3 from thread 0
thread 6 will get %rA4,%rA5 from thread 0
thread 7 will get %rA6,%rA7 from thread 0
thread 8 will get %rA0,%rA1 from thread 1
thread 9 will get %rA2,%rA3 from thread 1
thread 10 will get %rA4,%rA5 from thread 1
thread 11 will get %rA6,%rA7 from thread 1
thread 12 will get %rA0,%rA1 from thread 1
thread 13 will get %rA2,%rA3 from thread 1
thread 14 will get %rA4,%rA5 from thread 1
thread 15 will get %rA6,%rA7 from thread 1
thread 16 will get %rA0,%rA1 from thread 2
thread 17 will get %rA2,%rA3 from thread 2
thread 18 will get %rA4,%rA5 from thread 2
thread 19 will get %rA6,%rA7 from thread 2
thread 20 will get %rA0,%rA1 from thread 2
thread 21 will get %rA2,%rA3 from thread 2
thread 22 will get %rA4,%rA5 from thread 2
thread 23 will get %rA6,%rA7 from thread 2
thread 24 will get %rA0,%rA1 from thread 3
thread 25 will get %rA2,%rA3 from thread 3
thread 26 will get %rA4,%rA5 from thread 3
thread 27 will get %rA6,%rA7 from thread 3
thread 28 will get %rA0,%rA1 from thread 3
thread 29 will get %rA2,%rA3 from thread 3
thread 30 will get %rA4,%rA5 from thread 3
thread 31 will get %rA6,%rA7 from thread 3
In the next counter iteration, the threads will get results in the same way from the next source threads (4, 5, 6, 7), and so on.
Any ideas on how to optimize this code? Thanks!