CUDA Matrix Transpose only with warp shuffle instructions not working

Live code: https://godbolt.org/z/ejdMPMn6q

#include <iostream>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void transpose_block(float* mat, size_t sx, size_t sy) 
{
    constexpr size_t size = 8;
    auto tile = cg::tiled_partition<size>(cg::this_thread_block());
    auto ix = tile.thread_rank();

    float col[size];
    for (size_t iy = 0; iy < size; ++iy)
        col[iy] = mat[ix + iy * sx];

    auto val = [&tile, &col](int ix, int iy) { return tile.shfl(col[iy], ix); }; 

    for (size_t iy = 0; iy < size; ++iy)
        mat[ix + iy * sx] = val(iy, ix);
}

void print_mat(float* mat, size_t sx, size_t sy)
{
    printf("{\n");
    for (size_t iy = 0; iy < sy; ++iy)
    {
        printf("\t{ ");
        for (size_t ix = 0; ix < sx; ++ix)
            printf("%6.1f, ", mat[ix + iy * sx]);
        printf("},\n");
    }
    printf("}\n");
}

int main()
{
    constexpr size_t sx = 8;
    constexpr size_t sy = 8;
    float* mat;
    cudaMallocManaged(&mat, sx * sy * sizeof(float));
    
    for (size_t iy = 0; iy < sy; ++iy)
    for (size_t ix = 0; ix < sx; ++ix)
        mat[ix + sx * iy] = ix + sx * iy;

    print_mat(mat, sx, sy);

    transpose_block<<<1, 8>>>(mat, sx, sy);
    cudaDeviceSynchronize();

    print_mat(mat, sx, sy);

    cudaFree(mat);
}

I’m trying to transpose an 8x8 block by saving the columns locally for each of the 8 threads and then writing a lambda that would essentially give me access to the entire block using warp shuffle:

// tile = the cooperative group that holds all 8 threads 
// col = local vector that holds all 8 column values of column[tile.thread_rank()]
auto val = [&tile, &col](int ix, int iy) { return tile.shfl(col[iy], ix); }; 

This doesn’t work and I just cannot find the reason why.

https://stackoverflow.com/questions/74975945/cuda-matrix-transpose-only-with-warp-shuffle-instructions-not-working/74979192#74979192