I ran the proposed solution in the link below and confirmed that the compiler allocates the register array to the local memory.
This seems to be because the index of the register array is determined to be threadIdx.x^i and the compiler cannot interpret it.
Is there any way to apply the 2D matrix transpose with warp shuffling without local memory allocation?
Transpose 2D matrix
Before:
thread 0: 00 01 02 03 04 05 06 07
thread 1: 08 09 0a 0b 0c 0d 0e 0f
thread 2: 10 11 12 13 14 15 16 17
thread 3: 18 19 1a 1b 1c 1d 1e 1f
thread 4: 20 21 22 23 24 25 26 27
thread 5: 28 29 2a 2b 2c 2d 2e 2f
thread 6: 30 31 32 33 34 35 36 37
thread 7: 38 39 3a 3b 3c 3d 3e 3f
After:
thread 0: 00 08 10 18 20 28 30 38
thread 1: 01 09 11 19 21 29 31 39
thread 2: 02 0a 12 1a 22 2a 32 3a
thread 3: 03 0b 13 1b 23 2b 33 3b
thread 4: 04 0c 14 1c 24 2c 34 3c
thread 5: 05 0d 15 1d 25 2d 35 3d
thread 6: 06 0e 16 1e 26 2e 36 3e
thread 7: 07 0f 17 1f 27 2f 37 3f
Suggested solution
#include <cstdio>
__global__ void t(){
int u[8];
for (int i = 0; i < 8; i++) u[i] = threadIdx.x*8+i;
for (int i = 0; i < 8; i++) printf("lane: %d, idx: %d, val: %d\n", threadIdx.x, i, u[i]);
for (int i = 1; i < 8; i++){
int idx = threadIdx.x^i;
u[idx] = __shfl_sync(0x000000FF, u[idx], idx);}
for (int i = 0; i < 8; i++) printf("lane: %d, idx: %d, tra: %d\n", threadIdx.x, i, u[i]);
}
int main(){
t<<<1,8>>>();
cudaDeviceSynchronize();
}
link