One thing I wondered about: “is the non-local (register-only) version any better, performance-wise?”
According to the following test case, the register-only version is substantially better:
$ cat t1986.cu
#include <cstdio>
#include <cstdlib>
// the movement of one 2x2 block (top row: the thread with bit s clear;
// bottom row: its partner with bit s set); a host-side sanity check of
// this pattern appears after the compile output below
// start:   step 1:   step 2:   step 3:
//  A B      B A       C A       A C
//  C D      C D       B D       B D
template <typename T>
__device__ __forceinline__ void myswap(T &a, T &b){ T s = a; a = b; b = s;}

template <typename T, int s>
__device__ __forceinline__ void mymove(T (&u)[32]){
  const int s1 = 2*s;
  // step 1: threads with bit s clear pre-swap each element pair
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
  // step 2: exchange one element of each pair with the partner thread
  #pragma unroll 16
  for (int i = 0; i < 16; i++){
    int i1 = i%s;
    int i2 = i/s;
    int i3 = i2*s1;
    u[i3+i1] = __shfl_xor_sync(0xFFFFFFFF, u[i3+i1], s);}
  // step 3: threads with bit s clear swap back into final order
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
}

template <typename T>
__global__ void t(int do_print){
  T u[32];
  // initialize data: thread i holds row i of a 32x32 matrix
  for (int i = 0; i < 32; i++)
    u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data (pass a large threshold to suppress printing while profiling)
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
  // five butterfly stages transpose the full 32x32 tile
  mymove<T, 1>(u);
  mymove<T, 2>(u);
  mymove<T, 4>(u);
  mymove<T, 8>(u);
  mymove<T,16>(u);
  if (u[0] >= do_print)
    // print data (the conditional use of u[] also keeps the compiler
    // from optimizing the transpose away)
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<int><<<1,32>>>(do_print);
  t<int><<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -maxrregcount 82 -lineinfo -Xptxas=-v -arch=sm_70 -o t1986 t1986.cu
ptxas info : 6 bytes gmem
ptxas info : Compiling entry function '_Z1tIiEvi' for 'sm_70'
ptxas info : Function properties for _Z1tIiEvi
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 82 registers, 356 bytes cmem[0]
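As an aside, the index arithmetic in mymove is easier to trust after checking it on the host. Here is a quick standalone model (my own sketch, not part of the test case; the name mymove_host is made up) that replaces each lane's registers with a row of a 2-D array and models __shfl_xor_sync(..., s) as a pairwise element exchange between lane and lane^s:

#include <cstdio>
#include <utility>

static int m[32][32];  // m[lane][j] stands in for lane's register u[j]

template <int s>
void mymove_host(){
  const int s1 = 2*s;
  // steps 1 and 3: per-lane pair swaps in lanes with bit s clear
  auto swap_step = [&](){
    for (int lane = 0; lane < 32; lane++)
      if (!(lane&s))
        for (int i = 0; i < 16; i++){
          int i1 = i%s, i3 = (i/s)*s1;
          std::swap(m[lane][i3+i1], m[lane][i3+i1+s]);
        }
  };
  swap_step();
  // step 2: the XOR shuffle exchanges u[i3+i1] between lane and lane^s
  for (int lane = 0; lane < 32; lane++)
    if (!(lane&s))  // visit each lane pair once
      for (int i = 0; i < 16; i++){
        int i1 = i%s, i3 = (i/s)*s1;
        std::swap(m[lane][i3+i1], m[lane^s][i3+i1]);
      }
  swap_step();
}

int main(){
  for (int r = 0; r < 32; r++)
    for (int c = 0; c < 32; c++) m[r][c] = r*32+c;
  mymove_host<1>(); mymove_host<2>(); mymove_host<4>();
  mymove_host<8>(); mymove_host<16>();
  int errs = 0;
  for (int r = 0; r < 32; r++)
    for (int c = 0; c < 32; c++)
      if (m[r][c] != c*32+r) errs++;  // expect the transpose
  printf("%s\n", errs ? "FAIL" : "transpose OK");
}

Compiled with plain g++, this should print “transpose OK”. For comparison, the version that indexes the array dynamically: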
$ cat t1987.cu
#include <cstdio>
#include <cstdlib>
__global__ void t(int do_print){
  int u[32];
  // initialize data: thread i holds row i of a 32x32 matrix
  for (int i = 0; i < 32; i++) u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
  #pragma unroll 31
  for (int i = 1; i < 32; i++){
    // thread t trades element t^i with lane t^i; because idx is a
    // run-time value, u[] cannot stay in registers
    int idx = threadIdx.x^i;
    u[idx] = __shfl_sync(0xFFFFFFFF, u[idx], idx);}
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<<<1,32>>>(do_print);
  t<<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -lineinfo -Xptxas=-v -arch=sm_70 -o t1987 t1987.cu
ptxas info : 6 bytes gmem
ptxas info : Compiling entry function '_Z1ti' for 'sm_70'
ptxas info : Function properties for _Z1ti
144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 87 registers, 356 bytes cmem[0]
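The stack frames already hint at the performance gap: t1986 reports only 8 bytes (presumably just printf argument staging), while t1987 reports 144 bytes, comfortably covering the 128-byte u[] array. The reason: in t1986, once the #pragma unroll loops are expanded, every u[] index is a compile-time constant (i1, i2, i3 are functions of the loop counter and the template parameter s), so the array maps onto registers. In t1987 the index idx depends on threadIdx.x, and registers are not indexable at run time, so the compiler must place u[] in thread-local memory. A minimal sketch of the effect (my own illustration; the kernel names are made up, and exact register/stack numbers vary by compiler and architecture):

// compile with e.g.: nvcc -arch=sm_70 -Xptxas=-v -c idx.cu
__global__ void dynamic_index(const int *in, int *out, int sel){
  int a[32], v = 0;
  for (int i = 0; i < 32; i++){ v += in[i]; a[i] = v; }  // running prefix sums
  // 'sel' is unknown at compile time and registers are not indexable,
  // so a[] is expected to be placed in thread-local memory
  out[threadIdx.x] = a[sel & 31];
}

__global__ void static_index(const int *in, int *out){
  int a[32], v = 0;
  for (int i = 0; i < 32; i++){ v += in[i]; a[i] = v; }
  int sum = 0;
  #pragma unroll 32
  for (int i = 0; i < 32; i++) sum += a[i];  // all indices compile-time constants
  out[threadIdx.x] = sum;                    // a[] can stay entirely in registers
}

Compiling that with -Xptxas=-v as above should show a nontrivial stack frame (i.e., local memory) for dynamic_index but not for static_index.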
$ nvprof ./t1986 1024
==11142== NVPROF is profiling process 11142, command: ./t1986 1024
==11142== Profiling application: ./t1986 1024
==11142== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 152.52us 2 76.258us 7.8720us 144.64us void t<int>(int)
API calls: 97.80% 342.76ms 2 171.38ms 17.766us 342.74ms cudaLaunchKernel
1.34% 4.6906ms 4 1.1727ms 588.52us 2.9028ms cuDeviceTotalMem
0.70% 2.4534ms 404 6.0720us 303ns 277.77us cuDeviceGetAttribute
0.11% 380.59us 4 95.147us 59.160us 189.63us cuDeviceGetName
0.04% 155.54us 1 155.54us 155.54us 155.54us cudaDeviceSynchronize
0.01% 25.081us 4 6.2700us 2.8620us 14.372us cuDeviceGetPCIBusId
0.00% 9.6120us 8 1.2010us 395ns 4.8200us cuDeviceGet
0.00% 3.2680us 4 817ns 650ns 1.1840us cuDeviceGetUuid
0.00% 2.5920us 3 864ns 405ns 1.2770us cuDeviceGetCount
$ nvprof ./t1987 1024
==11156== NVPROF is profiling process 11156, command: ./t1987 1024
==11156== Profiling application: ./t1987 1024
==11156== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.4755ms 2 4.2377ms 6.8480us 8.4686ms t(int)
API calls: 94.59% 286.63ms 2 143.31ms 12.607us 286.61ms cudaLaunchKernel
2.80% 8.4926ms 1 8.4926ms 8.4926ms 8.4926ms cudaDeviceSynchronize
1.67% 5.0492ms 4 1.2623ms 597.28us 3.2418ms cuDeviceTotalMem
0.81% 2.4489ms 404 6.0610us 393ns 266.26us cuDeviceGetAttribute
0.12% 376.16us 4 94.040us 60.237us 180.88us cuDeviceGetName
0.01% 22.479us 4 5.6190us 2.7930us 11.912us cuDeviceGetPCIBusId
0.00% 12.697us 8 1.5870us 463ns 5.1350us cuDeviceGet
0.00% 3.3600us 4 840ns 697ns 1.1470us cuDeviceGetUuid
0.00% 2.9690us 3 989ns 442ns 1.4120us cuDeviceGetCount
$
For 80,000 32x32 transposes on a V100, the local-memory version runs in about 8.5ms, whereas the register-only version runs in about 0.14ms (8.4686ms vs. 144.64us for the 80,000-block launch, roughly a 58x speedup). So I guess the extra code complexity might be “worth it” in some cases.