One thing I wondered about: “is the non-local (register-only) version any better, performance-wise?”
According to the following test case, the register-only version is substantially better:
$ cat t1986.cu
#include <cstdio>
#include <cstdlib>
// the movement of one 2x2 block (top row: the thread with bit s clear;
// bottom row: its partner with bit s set); a host-side sanity check of
// this pattern appears after the compile output below
// start:   step 1:   step 2:   step 3:
//  A B      B A       C A       A C
//  C D      C D       B D       B D
template <typename T>
__device__ __forceinline__ void myswap(T &a, T &b){ T s = a; a = b; b = s;}

template <typename T, int s>
__device__ __forceinline__ void mymove(T (&u)[32]){
  const int s1 = 2*s;
  // step 1: threads with bit s clear pre-swap each element pair
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
  // step 2: exchange one element of each pair with the partner thread
  #pragma unroll 16
  for (int i = 0; i < 16; i++){
    int i1 = i%s;
    int i2 = i/s;
    int i3 = i2*s1;
    u[i3+i1] = __shfl_xor_sync(0xFFFFFFFF, u[i3+i1], s);}
  // step 3: threads with bit s clear swap back into final order
  if (!(threadIdx.x&s)) {
    #pragma unroll 16
    for (int i = 0; i < 16; i++){
      int i1 = i%s;
      int i2 = i/s;
      int i3 = i2*s1;
      myswap(u[i3+i1], u[i3+i1+s]);}}
}

template <typename T>
__global__ void t(int do_print){
  T u[32];
  // initialize data: thread i holds row i of a 32x32 matrix
  for (int i = 0; i < 32; i++)
    u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data (pass a large threshold to suppress printing while profiling)
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
  // five butterfly stages transpose the full 32x32 tile
  mymove<T, 1>(u);
  mymove<T, 2>(u);
  mymove<T, 4>(u);
  mymove<T, 8>(u);
  mymove<T,16>(u);
  if (u[0] >= do_print)
    // print data (the conditional use of u[] also keeps the compiler
    // from optimizing the transpose away)
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<int><<<1,32>>>(do_print);
  t<int><<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -maxrregcount 82 -lineinfo -Xptxas=-v -arch=sm_70 -o t1986 t1986.cu
ptxas info : 6 bytes gmem
ptxas info : Compiling entry function '_Z1tIiEvi' for 'sm_70'
ptxas info : Function properties for _Z1tIiEvi
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 82 registers, 356 bytes cmem[0]
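As an aside, the index arithmetic in mymove is easier to trust after checking it on the host. Here is a quick standalone model (my own sketch, not part of the test case; the name mymove_host is made up) that replaces each lane's registers with a row of a 2-D array and models __shfl_xor_sync(..., s) as a pairwise element exchange between lane and lane^s:

#include <cstdio>
#include <utility>

static int m[32][32];  // m[lane][j] stands in for lane's register u[j]

template <int s>
void mymove_host(){
  const int s1 = 2*s;
  // steps 1 and 3: per-lane pair swaps in lanes with bit s clear
  auto swap_step = [&](){
    for (int lane = 0; lane < 32; lane++)
      if (!(lane&s))
        for (int i = 0; i < 16; i++){
          int i1 = i%s, i3 = (i/s)*s1;
          std::swap(m[lane][i3+i1], m[lane][i3+i1+s]);
        }
  };
  swap_step();
  // step 2: the XOR shuffle exchanges u[i3+i1] between lane and lane^s
  for (int lane = 0; lane < 32; lane++)
    if (!(lane&s))  // visit each lane pair once
      for (int i = 0; i < 16; i++){
        int i1 = i%s, i3 = (i/s)*s1;
        std::swap(m[lane][i3+i1], m[lane^s][i3+i1]);
      }
  swap_step();
}

int main(){
  for (int r = 0; r < 32; r++)
    for (int c = 0; c < 32; c++) m[r][c] = r*32+c;
  mymove_host<1>(); mymove_host<2>(); mymove_host<4>();
  mymove_host<8>(); mymove_host<16>();
  int errs = 0;
  for (int r = 0; r < 32; r++)
    for (int c = 0; c < 32; c++)
      if (m[r][c] != c*32+r) errs++;  // expect the transpose
  printf("%s\n", errs ? "FAIL" : "transpose OK");
}

Compiled with plain g++, this should print “transpose OK”. For comparison, the version that indexes the array dynamically: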
$ cat t1987.cu
#include <cstdio>
#include <cstdlib>
__global__ void t(int do_print){
  int u[32];
  // initialize data: thread i holds row i of a 32x32 matrix
  for (int i = 0; i < 32; i++) u[i] = threadIdx.x*32+i;
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
  #pragma unroll 31
  for (int i = 1; i < 32; i++){
    // thread t trades element t^i with lane t^i; because idx is a
    // run-time value, u[] cannot stay in registers
    int idx = threadIdx.x^i;
    u[idx] = __shfl_sync(0xFFFFFFFF, u[idx], idx);}
  if (u[0] >= do_print)
    // print data
    for (int i = 0; i < 32; i++)
      if (threadIdx.x == i){
        for (int j = 0; j < 32; j++) printf("%d ", u[j]);
        printf("\n");}
}

int main(int argc, char *argv[]){
  int do_print = 0;
  if (argc > 1) do_print = atoi(argv[1]);
  t<<<1,32>>>(do_print);
  t<<<80*1000,32>>>(do_print);
  cudaDeviceSynchronize();
}
$ nvcc -lineinfo -Xptxas=-v -arch=sm_70 -o t1987 t1987.cu
ptxas info : 6 bytes gmem
ptxas info : Compiling entry function '_Z1ti' for 'sm_70'
ptxas info : Function properties for _Z1ti
144 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 87 registers, 356 bytes cmem[0]
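The stack frames already hint at the performance gap: t1986 reports only 8 bytes (presumably just printf argument staging), while t1987 reports 144 bytes, comfortably covering the 128-byte u[] array. The reason: in t1986, once the #pragma unroll loops are expanded, every u[] index is a compile-time constant (i1, i2, i3 are functions of the loop counter and the template parameter s), so the array maps onto registers. In t1987 the index idx depends on threadIdx.x, and registers are not indexable at run time, so the compiler must place u[] in thread-local memory. A minimal sketch of the effect (my own illustration; the kernel names are made up, and exact register/stack numbers vary by compiler and architecture):

// compile with e.g.: nvcc -arch=sm_70 -Xptxas=-v -c idx.cu
__global__ void dynamic_index(const int *in, int *out, int sel){
  int a[32], v = 0;
  for (int i = 0; i < 32; i++){ v += in[i]; a[i] = v; }  // running prefix sums
  // 'sel' is unknown at compile time and registers are not indexable,
  // so a[] is expected to be placed in thread-local memory
  out[threadIdx.x] = a[sel & 31];
}

__global__ void static_index(const int *in, int *out){
  int a[32], v = 0;
  for (int i = 0; i < 32; i++){ v += in[i]; a[i] = v; }
  int sum = 0;
  #pragma unroll 32
  for (int i = 0; i < 32; i++) sum += a[i];  // all indices compile-time constants
  out[threadIdx.x] = sum;                    // a[] can stay entirely in registers
}

Compiling that with -Xptxas=-v as above should show a nontrivial stack frame (i.e., local memory) for dynamic_index but not for static_index.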
$ nvprof ./t1986 1024
==11142== NVPROF is profiling process 11142, command: ./t1986 1024
==11142== Profiling application: ./t1986 1024
==11142== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 152.52us 2 76.258us 7.8720us 144.64us void t<int>(int)
API calls: 97.80% 342.76ms 2 171.38ms 17.766us 342.74ms cudaLaunchKernel
1.34% 4.6906ms 4 1.1727ms 588.52us 2.9028ms cuDeviceTotalMem
0.70% 2.4534ms 404 6.0720us 303ns 277.77us cuDeviceGetAttribute
0.11% 380.59us 4 95.147us 59.160us 189.63us cuDeviceGetName
0.04% 155.54us 1 155.54us 155.54us 155.54us cudaDeviceSynchronize
0.01% 25.081us 4 6.2700us 2.8620us 14.372us cuDeviceGetPCIBusId
0.00% 9.6120us 8 1.2010us 395ns 4.8200us cuDeviceGet
0.00% 3.2680us 4 817ns 650ns 1.1840us cuDeviceGetUuid
0.00% 2.5920us 3 864ns 405ns 1.2770us cuDeviceGetCount
$ nvprof ./t1987 1024
==11156== NVPROF is profiling process 11156, command: ./t1987 1024
==11156== Profiling application: ./t1987 1024
==11156== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 8.4755ms 2 4.2377ms 6.8480us 8.4686ms t(int)
API calls: 94.59% 286.63ms 2 143.31ms 12.607us 286.61ms cudaLaunchKernel
2.80% 8.4926ms 1 8.4926ms 8.4926ms 8.4926ms cudaDeviceSynchronize
1.67% 5.0492ms 4 1.2623ms 597.28us 3.2418ms cuDeviceTotalMem
0.81% 2.4489ms 404 6.0610us 393ns 266.26us cuDeviceGetAttribute
0.12% 376.16us 4 94.040us 60.237us 180.88us cuDeviceGetName
0.01% 22.479us 4 5.6190us 2.7930us 11.912us cuDeviceGetPCIBusId
0.00% 12.697us 8 1.5870us 463ns 5.1350us cuDeviceGet
0.00% 3.3600us 4 840ns 697ns 1.1470us cuDeviceGetUuid
0.00% 2.9690us 3 989ns 442ns 1.4120us cuDeviceGetCount
$
For 80,000 32x32 transposes on a V100, the local-memory version runs in about 8.5ms, whereas the register-only version runs in about 0.14ms (8.4686ms vs. 144.64us for the 80,000-block launch, roughly a 58x speedup). So I guess the extra code complexity might be “worth it” in some cases.