Live code: https://godbolt.org/z/ejdMPMn6q
#include <iostream>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void transpose_block(float* mat, size_t sx, size_t sy)
{
constexpr size_t size = 8;
auto tile = cg::tiled_partition<size>(cg::this_thread_block());
auto ix = tile.thread_rank();
float col[size];
for (size_t iy = 0; iy < size; ++iy)
col[iy] = mat[ix + iy * sx];
auto val = [&tile, &col](int ix, int iy) { return tile.shfl(col[iy], ix); };
for (size_t iy = 0; iy < size; ++iy)
mat[ix + iy * sx] = val(iy, ix);
}
void print_mat(float* mat, size_t sx, size_t sy)
{
printf("{\n");
for (size_t iy = 0; iy < sy; ++iy)
{
printf("\t{ ");
for (size_t ix = 0; ix < sx; ++ix)
printf("%6.1f, ", mat[ix + iy * sx]);
printf("},\n");
}
printf("}\n");
}
int main()
{
constexpr size_t sx = 8;
constexpr size_t sy = 8;
float* mat;
cudaMallocManaged(&mat, sx * sy * sizeof(float));
for (size_t iy = 0; iy < sy; ++iy)
for (size_t ix = 0; ix < sx; ++ix)
mat[ix + sx * iy] = ix + sx * iy;
print_mat(mat, sx, sy);
transpose_block<<<1, 8>>>(mat, sx, sy);
cudaDeviceSynchronize();
print_mat(mat, sx, sy);
cudaFree(mat);
}
I’m trying to transpose an 8x8 block by saving the columns locally for each of the 8 threads and then writing a lambda that would essentially give me access to the entire block using warp shuffle:
// tile = the cooperative group that holds all 8 threads
// col = local vector that holds all 8 column values of column[tile.thread_rank()]
auto val = [&tile, &col](int ix, int iy) { return tile.shfl(col[iy], ix); };
This doesn’t work and I just cannot find the reason why.