I’m trying to implement an in-place matrix transform. The code in this thread Transpose 2D matrix with warp shuffle and in-place array probably can solve my problem. I was trying to implement something different, though, and I cannot figure out what is wrong with this kernel.
The idea is that at each iteration we should merge two vectors, say from threads 0 and 8, and store this result in threads 0 and 1, and so on. This process is carried out log(N) times.
For some reason, after the first iteration here the vector in the odd threads ends up the same as the even threads.
Can anyone see a bug in this code (mytranspose
), or is there some limitation in shfl_sync or in private arrays that I’m not taking in account?
#include <stdio.h>
template<class T, int N>
__device__ void printvector(T z[N]) {
for (int th = 0; th < N; th++) {
if (threadIdx.x == th) {
for (int n = 0; n < N; n++) {
printf("% 5d", z[n]);
}
printf("\n");
}
}
if (threadIdx.x == 0)printf("\n");
}
template<int N>
__global__ void bobtranspose() {
int z[N];
for (int n = 0; n < N; n++) {
z[n] = n + threadIdx.x * 100;
}
printvector<int, N>(z);
int mask = __activemask();
for (int i = 1; i < N; i++) {
int idx = threadIdx.x ^i;
z[idx] = __shfl_sync(mask, z[idx], idx);
}
printvector<int, N>(z);
}
template<int N, int It>
__global__ void mytranspose() {
int z[N];
for (int n = 0; n < N; n++) {
z[n] = n + threadIdx.x * 100;
}
printvector<int, N>(z);
int mask = __activemask();
int w[N];
int C2 = N >> 1;
for (int iteration = 0; iteration < It; iteration++) {
int inc = (threadIdx.x & 0x1) ? C2 : 0;
int t1 = threadIdx.x / 2;
int t2 = t1 + C2;
for (int v = 0; v < C2; v++) {
w[v * 2 + 0] = __shfl_sync(mask, z[v+inc], t1);
w[v * 2 + 1] = __shfl_sync(mask, z[v+inc], t2);
}
for (int v = 0; v < N; v++) {
z[v] = w[v];
}
printvector<int, N>(z);
}
}
#define VEC_LEN 8
int main(int argc, char **argv) {
printf("bobtranspose\n");
bobtranspose<VEC_LEN><<<1, VEC_LEN>>>();
cudaDeviceSynchronize();
printf("\n\nmytranspose\n");
mytranspose<VEC_LEN, 1><<<1, VEC_LEN>>>();
cudaDeviceSynchronize();
}
program output
bobtranspose
0 1 2 3 4 5 6 7
100 101 102 103 104 105 106 107
200 201 202 203 204 205 206 207
300 301 302 303 304 305 306 307
400 401 402 403 404 405 406 407
500 501 502 503 504 505 506 507
600 601 602 603 604 605 606 607
700 701 702 703 704 705 706 707
0 100 200 300 400 500 600 700
1 101 201 301 401 501 601 701
2 102 202 302 402 502 602 702
3 103 203 303 403 503 603 703
4 104 204 304 404 504 604 704
5 105 205 305 405 505 605 705
6 106 206 306 406 506 606 706
7 107 207 307 407 507 607 707
mytranspose
0 1 2 3 4 5 6 7
100 101 102 103 104 105 106 107
200 201 202 203 204 205 206 207
300 301 302 303 304 305 306 307
400 401 402 403 404 405 406 407
500 501 502 503 504 505 506 507
600 601 602 603 604 605 606 607
700 701 702 703 704 705 706 707
0 400 1 401 2 402 3 403
0 400 1 401 2 402 3 403
100 500 100 500 100 500 100 500
100 500 100 500 100 500 100 500
200 600 201 601 202 602 203 603
200 600 201 601 202 602 203 603
300 700 300 700 300 700 300 700
300 700 300 700 300 700 300 700