Transpose 2D matrix with warp shuffle and in-place array

Problem:
each thread in a warp size 8 holds 8 uint32 variables. I want to transpose them like this:
Before:

thread 0: 00 01 02 03 04 05 06 07
thread 1: 08 09 0a 0b 0c 0d 0e 0f
thread 2: 10 11 12 13 14 15 16 17
thread 3: 18 19 1a 1b 1c 1d 1e 1f
thread 4: 20 21 22 23 24 25 26 27
thread 5: 28 29 2a 2b 2c 2d 2e 2f
thread 6: 30 31 32 33 34 35 36 37
thread 7: 38 39 3a 3b 3c 3d 3e 3f

After:

thread 0: 00 08 10 18 20 28 30 38
thread 1: 01 09 11 19 21 29 31 39
thread 2: 02 0a 12 1a 22 2a 32 3a
thread 3: 03 0b 13 1b 23 2b 33 3b
thread 4: 04 0c 14 1c 24 2c 34 3c
thread 5: 05 0d 15 1d 25 2d 35 3d
thread 6: 06 0e 16 1e 26 2e 36 3e
thread 7: 07 0f 17 1f 27 2f 37 3f

the problem can be solved with 1 tmp array like below:

__global__ void test()
{
    uint u[8];
    uint v[8];
    int t = threadIdx.x & 7;
    int warp_size = 8;
    int ncorrs = 8;
    for (int i = 0; i < 8; i++) u[i] = t*8 +i;
    for(int corr=0; corr < ncorrs; ++corr)
    {
        int src_lane = ((t + corr) % 8);
        int src_corr = ((8 - corr) + t) % 8;
        int dest = (t + corr) % 8;
        v[dest] = __shfl_sync(0xffffffff, u[src_corr], src_lane, warp_size);
    }
    // now array v holds the transposed values. Do other computations.
}

void run(){
    test<<<1,8>>> ();
}

I wonder if there is any method to do it in-place (without array v).
ps: and without sharedmem

Yes:

#include <cstdio>
__global__ void t(){

  int u[8];
  for (int i = 0; i < 8; i++) u[i] = threadIdx.x*8+i;
  for (int i = 0; i < 8; i++) printf("lane: %d, idx: %d, val: %d\n", threadIdx.x, i, u[i]);
  for (int i = 1; i < 8; i++){
    int idx = threadIdx.x^i;
    u[idx] = __shfl_sync(0x000000FF, u[idx], idx);}
  for (int i = 0; i < 8; i++) printf("lane: %d, idx: %d, tra: %d\n", threadIdx.x, i, u[i]);
}

int main(){

  t<<<1,8>>>();
  cudaDeviceSynchronize();
}

You could use __shfl_xor_sync() here also, but I’m not convinced it makes the code any simpler.

You’re swapping elements across the main diagonal, basically, and the xor pattern gives the right set of hopscotch coverage to hit every element that needs to be swapped. The main diagonal is untouched, which is why we only need 7 loop iterations instead of 8. I think it should be straightforward to extend up to 32x32 transpose across the warp, or any set of power-of-2 square dimensions up to 32.

I presume you have seen this (based on your code). The non-square case indexing is more involved, and offhand I don’t know if it can be easily done without temp arrays.

1 Like

Perfect thank you. Yes, I’ve seen that post.
a cleaner debug version for whoever needs it:

#include <stdint.h>
#include <stdio.h>
__global__ void test()
{
    int u[8];
    for (int i = 0; i < 8; i++) u[i] = threadIdx.x*8+i;
    __shared__ int all[64];
    for (int i = 0; i < 8; i++){
        all[threadIdx.x*8+i] = u[i];    
    }
    __syncwarp();
    if (threadIdx.x == 0){
        printf("Before:\n");
        for (int i = 0; i < 8; i++){
            for (int j = 0; j < 8; j++){
                printf("%02x ", all[i*8+j]);
            }
            printf("\n");
        }
    }
    for (int i = 1; i < 8; i++){
        int idx = threadIdx.x^i;
        u[idx] = __shfl_sync(0x000000FF, u[idx], idx);
    }
    for (int i = 0; i < 8; i++){
        all[threadIdx.x*8+i] = u[i];    
    }
    __syncwarp();
    if (threadIdx.x == 0){
        printf("After:\n");
        for (int i = 0; i < 8; i++){
            for (int j = 0; j < 8; j++){
                printf("%02x ", all[i*8+j]);
            }
            printf("\n");
        }
    }
}
int main(){
    test <<< 1, 8 >>> ();
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess){
        printf("%s\n", cudaGetErrorString(err));
    }
    return 0;
}