I am expecting the mask to be 0xffffffff for the first iteration for all 32 lanes.
But I got 0x7fffffff for lane 0~30 and 0x80000000 for lane 31.
If I set LEN to 126, i got 0x3fffffff for lane 0~29 and 0xc0000000 for lane 30~31.[
But if I compile the code with flag -G. It behaves normal to me, got 0xffffffff for the first iteration for all 32 lanes.
I tested on Pascal, Turing and Ampere cards. They all behaved the same.
Can someone explain me why?
the code:
include <stdio.h>
__global__ void test_cpy(unsigned int *dst, unsigned int *src, int len) {
int lane = threadIdx.x & 0x1f;for (int i = lane; i < len; i += 32) { __syncwarp(0xffffffff); unsigned int mask = __activemask(); __syncwarp(0xffffffff); printf("debugging %x lane %d i %d len %d threadIdx.x %d\n", mask, lane, i, len, threadIdx.x); dst[i] = src[i]; } __syncwarp(0xffffffff); // __syncthreads();
}
define LEN 127
int main() {
unsigned int *d_dst;
unsigned int *d_src;unsigned int *h_src = (unsigned int *)malloc(LEN * 4); unsigned int *h_dst = (unsigned int *)malloc(LEN * 4); for (int i = 0; i < LEN; i++) { h_src[i] = i; } cudaMalloc(&d_src, LEN * 4); cudaMalloc(&d_dst, LEN * 4); cudaMemcpy(d_src, h_src, LEN * 4, cudaMemcpyHostToDevice); test_cpy<<<1, 32>>>(d_dst, d_src, LEN); cudaMemcpy(h_dst, d_dst, LEN * 4, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); return 0;
}
I compile the code by :
nvcc -arch=native -o copy copy.cu
result:
[root@serv1 src]# ./copy
debugging mask 80000000 lane 31 i 31 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 0 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 1 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 2 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 3 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 4 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 5 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 6 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 7 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 8 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 9 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 10 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 11 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 12 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 13 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 14 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 15 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 16 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 17 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 18 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 19 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 20 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 21 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 22 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 23 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 24 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 25 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 26 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 27 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 28 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 29 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 30 len 127 threadIdx.x 30
debugging mask 80000000 lane 31 i 63 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 32 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 33 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 34 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 35 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 36 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 37 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 38 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 39 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 40 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 41 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 42 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 43 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 44 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 45 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 46 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 47 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 48 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 49 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 50 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 51 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 52 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 53 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 54 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 55 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 56 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 57 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 58 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 59 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 60 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 61 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 62 len 127 threadIdx.x 30
debugging mask 80000000 lane 31 i 95 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 64 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 65 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 66 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 67 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 68 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 69 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 70 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 71 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 72 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 73 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 74 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 75 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 76 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 77 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 78 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 79 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 80 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 81 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 82 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 83 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 84 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 85 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 86 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 87 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 88 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 89 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 90 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 91 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 92 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 93 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 94 len 127 threadIdx.x 30
debugging mask 7fffffff lane 0 i 96 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 97 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 98 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 99 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 100 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 101 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 102 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 103 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 104 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 105 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 106 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 107 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 108 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 109 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 110 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 111 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 112 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 113 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 114 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 115 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 116 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 117 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 118 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 119 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 120 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 121 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 122 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 123 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 124 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 125 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 126 len 127 threadIdx.x 30