Implement 2D matrix transpose using warp shuffle without local memory

One thing I wondered about: “is the non-local (register-only) version any better, performance-wise?”

According to the following test case, the register-only version is substantially better:

$ cat t1986.cu
#include <cstdio>
#include <cstdlib>
// the movement
// start:
//  A B
//  C D
// step 1:
//  B A
//  C D
// step 2:
//  C A
//  B D
// step 3:
//  A C
//  B D
template <typename T>
__device__ __forceinline__ void myswap(T &a, T &b){ T s = a;  a = b; b = s;}
template <typename T, int s>
__device__ __forceinline__ void mymove(T (&u)[32]){
  const int s1 = 2*s;
  // step 1:
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
  // step 2:
  #pragma unroll 16
  for (int i = 0; i < 16; i++){
    int i1 = i%s;
    int i2 = i/s;
    int i3 = i2*s1;
    u[i3+i1] = __shfl_xor_sync(0xFFFFFFFF, u[i3+i1], s);}
  // step 3:
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
}

template <typename T>
__global__ void t(int do_print){
  T u[32];
  // initialize data
  for (int i = 0; i < 32; i++)
    u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++)  printf("%d ", u[j]);
        printf("\n");}
  mymove<T, 1>(u);
  mymove<T, 2>(u);
  mymove<T, 4>(u);
  mymove<T, 8>(u);
  mymove<T,16>(u);
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++)  printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<int><<<1,32>>>(do_print);
  t<int><<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -maxrregcount 82  -lineinfo -Xptxas=-v -arch=sm_70 -o t1986 t1986.cu
ptxas info    : 6 bytes gmem
ptxas info    : Compiling entry function '_Z1tIiEvi' for 'sm_70'
ptxas info    : Function properties for _Z1tIiEvi
    8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 82 registers, 356 bytes cmem[0]
$ cat t1987.cu
#include <cstdio>
#include <cstdlib>
__global__ void t(int do_print){

  int u[32];
  for (int i = 0; i < 32; i++) u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++)  printf("%d ", u[j]);
        printf("\n");}
  #pragma unroll 31
  for (int i = 1; i < 32; i++){
    int idx = threadIdx.x^i;
    u[idx] = __shfl_sync(0xFFFFFFFF, u[idx], idx);}
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++)  printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<<<1,32>>>(do_print);
  t<<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -lineinfo -Xptxas=-v -arch=sm_70 -o t1987 t1987.cu
ptxas info    : 6 bytes gmem
ptxas info    : Compiling entry function '_Z1ti' for 'sm_70'
ptxas info    : Function properties for _Z1ti
    144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 87 registers, 356 bytes cmem[0]
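
The ptxas output already hints at the difference: in t1986 every index into u[] is a compile-time constant once the loops are fully unrolled, so the array can live entirely in registers (8-byte stack frame, no spills). In t1987 the index threadIdx.x^i is only known at run time, so the array has to be addressable and is placed in local memory (144-byte stack frame). A minimal sketch of that distinction, with made-up kernel names just for illustration (not part of the test cases above):

// registers: after unrolling, every index into a[] is a literal,
// so the compiler can map the array onto individual registers
__global__ void static_index(int *out){
  int a[4], s = 0;
  #pragma unroll
  for (int i = 0; i < 4; i++) a[i] = threadIdx.x + i;
  #pragma unroll
  for (int i = 0; i < 4; i++) s += a[i];
  out[threadIdx.x] = s;
}

// local memory: the index is only known at run time, so a[] must be
// addressable and is typically demoted to (stack-frame) local memory
__global__ void dynamic_index(int *out, int which){
  int a[4];
  #pragma unroll
  for (int i = 0; i < 4; i++) a[i] = threadIdx.x + i;
  out[threadIdx.x] = a[which & 3];   // run-time index
}
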
$ nvprof ./t1986 1024
==11142== NVPROF is profiling process 11142, command: ./t1986 1024
==11142== Profiling application: ./t1986 1024
==11142== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  152.52us         2  76.258us  7.8720us  144.64us  void t<int>(int)
      API calls:   97.80%  342.76ms         2  171.38ms  17.766us  342.74ms  cudaLaunchKernel
                    1.34%  4.6906ms         4  1.1727ms  588.52us  2.9028ms  cuDeviceTotalMem
                    0.70%  2.4534ms       404  6.0720us     303ns  277.77us  cuDeviceGetAttribute
                    0.11%  380.59us         4  95.147us  59.160us  189.63us  cuDeviceGetName
                    0.04%  155.54us         1  155.54us  155.54us  155.54us  cudaDeviceSynchronize
                    0.01%  25.081us         4  6.2700us  2.8620us  14.372us  cuDeviceGetPCIBusId
                    0.00%  9.6120us         8  1.2010us     395ns  4.8200us  cuDeviceGet
                    0.00%  3.2680us         4     817ns     650ns  1.1840us  cuDeviceGetUuid
                    0.00%  2.5920us         3     864ns     405ns  1.2770us  cuDeviceGetCount
$ nvprof ./t1987 1024
==11156== NVPROF is profiling process 11156, command: ./t1987 1024
==11156== Profiling application: ./t1987 1024
==11156== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  8.4755ms         2  4.2377ms  6.8480us  8.4686ms  t(int)
      API calls:   94.59%  286.63ms         2  143.31ms  12.607us  286.61ms  cudaLaunchKernel
                    2.80%  8.4926ms         1  8.4926ms  8.4926ms  8.4926ms  cudaDeviceSynchronize
                    1.67%  5.0492ms         4  1.2623ms  597.28us  3.2418ms  cuDeviceTotalMem
                    0.81%  2.4489ms       404  6.0610us     393ns  266.26us  cuDeviceGetAttribute
                    0.12%  376.16us         4  94.040us  60.237us  180.88us  cuDeviceGetName
                    0.01%  22.479us         4  5.6190us  2.7930us  11.912us  cuDeviceGetPCIBusId
                    0.00%  12.697us         8  1.5870us     463ns  5.1350us  cuDeviceGet
                    0.00%  3.3600us         4     840ns     697ns  1.1470us  cuDeviceGetUuid
                    0.00%  2.9690us         3     989ns     442ns  1.4120us  cuDeviceGetCount
$

For 80,000 32x32 transposes on a V100, the local-memory version (t1987) runs in about 8.5ms, whereas the register-only version (t1986) runs in about 0.15ms. So I guess the extra code complexity might be “worth it” in some cases.
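
For reference, here is roughly how the register-only routine could be wired up to global memory: each 32-thread block loads one 32x32 tile into its per-thread u[32] arrays, runs the five mymove steps, and writes the transposed tile back. This is only a sketch under the same assumptions as t1986 (one warp per tile, tile pitch of 32, mymove in scope as defined above); the kernel name and launch shape are illustrative, not from the test cases, and the row-per-thread global loads/stores here are not coalesced, so a real kernel would likely stage tiles through shared memory or reorganize the accesses.

template <typename T>
__global__ void transpose_tiles(const T * __restrict__ in, T * __restrict__ out, int num_tiles){
  // one 32-thread block (one warp) per 32x32 tile
  int tile = blockIdx.x;
  if (tile >= num_tiles) return;
  const T *src = in  + (size_t)tile*32*32;
  T       *dst = out + (size_t)tile*32*32;
  T u[32];
  // each thread holds one row of the tile in registers
  for (int i = 0; i < 32; i++) u[i] = src[threadIdx.x*32+i];
  // in-register transpose, exactly as in t1986
  mymove<T, 1>(u);
  mymove<T, 2>(u);
  mymove<T, 4>(u);
  mymove<T, 8>(u);
  mymove<T,16>(u);
  // each thread now holds one column of the original tile; write it back as a row
  for (int i = 0; i < 32; i++) dst[threadIdx.x*32+i] = u[i];
}

A launch would then look something like transpose_tiles<int><<<num_tiles, 32>>>(d_in, d_out, num_tiles);
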
