shfl - transfer result to other threads (PTX performance)

Etayson · June 26, 2018, 12:06pm

Hi, all.
Can someone tell me how to optimize this code?

// Register declarations for the kernel body.
// A parameterised name such as %rA<25> declares %rA0 .. %rA24.

// 64-bit registers
.reg .b64  t, %rA<25>, %rB<1>, %rt<3>;
.reg .b64  nonce;

// 32-bit registers (the redistribution loop uses threadId32, counter, rS, tC, temp, temp2)
.reg .b32  %rd0, threadId32, round, counter, n_c, rS, dagpointer;
.reg .b32  tC, temp, temp2, temp3, temp4, mem, mem2, mem3, mem4;

// 16-bit registers
.reg .b16  threadIdx, threadDim, blockIdx, blockDim;

// Predicate register shared by the select chain and the loop branch
.reg .pred p;

//---------------
//let's skip the unnecessary code..
//---------------

// And the function that transfers %rA0-%rA7 from the source threads to the other threads in the warp.

        // Redistribution loop: runs 8 passes (counter = 0..7). In each pass every
        // thread reads the eight 64-bit values %rA0..%rA7 of one source lane into
        // %rA12..%rA19 and then keeps one pair of them in %rt0/%rt1.
        //
        // Registers written: counter, tC, temp, temp2, rS, %rA12..%rA19, %rt0, %rt1, p.
        // NOTE(review): assumes the low 5 bits of threadId32 are the lane id and
        // that all 32 lanes of the warp execute this loop together - confirm
        // against the elided code. If fewer lanes are active, the membermask
        // below must be the real set of participating lanes.
	mov.b32 counter,0;			// explicit zero instead of xor of an uninitialised register
	$LLBtemp1:

	// Source lane for this pass: ((threadId32 & 31) >> 3) + 4 * counter
	and.b32 tC,threadId32,0x1f;
	shr.b32 tC,tC,3;
	shl.b32 temp,counter,2;
	add.u32 tC,tC,temp;

	// shfl.sync.idx.b32 d, a, b, c, membermask:
	//   b          = source lane (tC)
	//   c          = 0x1f       -> clamp 31, segment mask 0 (whole-warp indexing)
	//   membermask = 0xffffffff -> all 32 lanes participate
	// The original passed tC for c and membermask as well; a lane index is not a
	// valid participation mask (the executing lane is generally not in it).
	// Each 64-bit value is split into two 32-bit halves, shuffled, and repacked.

	mov.b64 {temp,temp2},%rA0;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA12,{temp,temp2};

	mov.b32 rS,temp;			// keep the low 32 bits of the shuffled %rA0

	mov.b64 {temp,temp2},%rA1;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA13,{temp,temp2};

	mov.b64 {temp,temp2},%rA2;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA14,{temp,temp2};

	mov.b64 {temp,temp2},%rA3;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA15,{temp,temp2};

	mov.b64 {temp,temp2},%rA4;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA16,{temp,temp2};

	mov.b64 {temp,temp2},%rA5;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA17,{temp,temp2};

	mov.b64 {temp,temp2},%rA6;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA18,{temp,temp2};

	mov.b64 {temp,temp2},%rA7;
	shfl.sync.idx.b32  temp, temp, tC, 0x1f, 0xffffffff;
	shfl.sync.idx.b32  temp2, temp2, tC, 0x1f, 0xffffffff;
	mov.b64 %rA19,{temp,temp2};

//--------------------------------
	// Pick the pair this thread keeps, indexed by (threadId32 & 3):
	//   0 -> (%rA12,%rA13)   1 -> (%rA14,%rA15)
	//   2 -> (%rA16,%rA17)   3 -> (%rA18,%rA19)
	and.b32 temp,threadId32,0x03;
	mov.b64 %rt0,%rA12;
	mov.b64 %rt1,%rA13;
	setp.eq.u32 p,temp,1;
	selp.u64 	%rt0,%rA14,%rt0,p;
	selp.u64 	%rt1,%rA15,%rt1,p;
	setp.eq.u32 p,temp,2;
	selp.u64 	%rt0,%rA16,%rt0,p;
	selp.u64 	%rt1,%rA17,%rt1,p;
	setp.eq.u32 p,temp,3;
	selp.u64 	%rt0,%rA18,%rt0,p;
	selp.u64 	%rt1,%rA19,%rt1,p;

//----------------------------------
// Some functions that use this result...
//----------------------------------

	// Next pass; loop while counter < 8 (unsigned compare).
	add.u32     counter,counter,1;
	setp.lo.u32 p,counter,8;
	@p bra.uni $LLBtemp1;

So, this function needs to transfer each thread's result to other threads in the warp.
In total…
when counter = 0
thread 0 will get %rA0,%rA1 from thread 0
thread 1 will get %rA2,%rA3 from thread 0
thread 2 will get %rA4,%rA5 from thread 0
thread 3 will get %rA6,%rA7 from thread 0

thread 4 will get %rA0,%rA1 from thread 0
thread 5 will get %rA2,%rA3 from thread 0
thread 6 will get %rA4,%rA5 from thread 0
thread 7 will get %rA6,%rA7 from thread 0

thread 8 will get %rA0,%rA1 from thread 1
thread 9 will get %rA2,%rA3 from thread 1
thread 10 will get %rA4,%rA5 from thread 1
thread 11 will get %rA6,%rA7 from thread 1

thread 12 will get %rA0,%rA1 from thread 1
thread 13 will get %rA2,%rA3 from thread 1
thread 14 will get %rA4,%rA5 from thread 1
thread 15 will get %rA6,%rA7 from thread 1

thread 16 will get %rA0,%rA1 from thread 2
thread 17 will get %rA2,%rA3 from thread 2
thread 18 will get %rA4,%rA5 from thread 2
thread 19 will get %rA6,%rA7 from thread 2

thread 20 will get %rA0,%rA1 from thread 2
thread 21 will get %rA2,%rA3 from thread 2
thread 22 will get %rA4,%rA5 from thread 2
thread 23 will get %rA6,%rA7 from thread 2

thread 24 will get %rA0,%rA1 from thread 3
thread 25 will get %rA2,%rA3 from thread 3
thread 26 will get %rA4,%rA5 from thread 3
thread 27 will get %rA6,%rA7 from thread 3

thread 28 will get %rA0,%rA1 from thread 3
thread 29 will get %rA2,%rA3 from thread 3
thread 30 will get %rA4,%rA5 from thread 3
thread 31 will get %rA6,%rA7 from thread 3

In the next counter iteration, the threads will get results in the same way from threads 4, 5, 6, 7, and so on.
Any ideas how I can optimize this code? Thanks!

Topic		Replies	Views
Is it possible to read array values between threads over __shfl()? CUDA Programming and Performance	5	986	June 28, 2017
In-place register-based matrix transpose with shuffle not working CUDA Programming and Performance	7	1231	October 12, 2021
Do I need to write my own warp-wide broadcasting function or will __shfl handle it efficiently? CUDA Programming and Performance	3	1684	May 30, 2017
Changing from __shared__ to __shfl() CUDA Programming and Performance	5	2182	July 8, 2015
Transpose 2D matrix with warp shuffle and in-place array CUDA Programming and Performance	3	2094	October 12, 2021
How to tell nvcc that some `if` must diverge and stop trying to fuse previous statements into it? CUDA Programming and Performance	20	680	March 3, 2024
__shfl_down_sync weird behavior CUDA Programming and Performance cuda , ubuntu	5	1618	November 23, 2021
shfl function in kepler CUDA Programming and Performance	11	6830	November 26, 2013
Warp shuffle instruction not working as expected CUDA Programming and Performance	7	966	September 6, 2023
shuffling warp CUDA Programming and Performance	3	979	March 12, 2018

shfl - transfer result to other threads (PTX performance)

Related topics