Weird behavior about __activemask()

I am expecting the mask to be 0xffffffff for the first iteration for all 32 lanes.
But I got 0x7fffffff for lane 0~30 and 0x80000000 for lane 31.

If I set LEN to 126, i got 0x3fffffff for lane 0~29 and 0xc0000000 for lane 30~31.[

But if I compile the code with flag -G. It behaves normal to me, got 0xffffffff for the first iteration for all 32 lanes.

I tested on Pascal, Turing and Ampere cards. They all behaved the same.

Can someone explain me why?

the code:

include <stdio.h>

__global__ void test_cpy(unsigned int *dst, unsigned int *src, int len) {
int lane = threadIdx.x & 0x1f;

for (int i = lane; i < len; i += 32) {
    __syncwarp(0xffffffff);
    unsigned int mask = __activemask();
    __syncwarp(0xffffffff);
    printf("debugging %x lane %d i %d len %d threadIdx.x %d\n", mask, lane, i, len, threadIdx.x);
    dst[i] = src[i];
}
__syncwarp(0xffffffff);

// __syncthreads();

}

define LEN 127

int main() {
unsigned int *d_dst;
unsigned int *d_src;

unsigned int *h_src = (unsigned int *)malloc(LEN * 4);
unsigned int *h_dst = (unsigned int *)malloc(LEN * 4);

for (int i = 0; i < LEN; i++) {
    h_src[i] = i;
}

cudaMalloc(&d_src, LEN * 4);
cudaMalloc(&d_dst, LEN * 4);

cudaMemcpy(d_src, h_src, LEN * 4, cudaMemcpyHostToDevice);

test_cpy<<<1, 32>>>(d_dst, d_src, LEN);

cudaMemcpy(h_dst, d_dst, LEN * 4, cudaMemcpyDeviceToHost);


cudaDeviceSynchronize();
return 0;

}

I compile the code by :

nvcc -arch=native -o copy copy.cu

result:

[root@serv1 src]# ./copy

debugging mask 80000000 lane 31 i 31 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 0 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 1 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 2 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 3 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 4 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 5 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 6 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 7 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 8 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 9 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 10 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 11 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 12 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 13 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 14 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 15 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 16 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 17 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 18 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 19 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 20 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 21 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 22 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 23 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 24 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 25 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 26 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 27 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 28 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 29 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 30 len 127 threadIdx.x 30
debugging mask 80000000 lane 31 i 63 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 32 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 33 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 34 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 35 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 36 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 37 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 38 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 39 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 40 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 41 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 42 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 43 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 44 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 45 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 46 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 47 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 48 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 49 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 50 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 51 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 52 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 53 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 54 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 55 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 56 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 57 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 58 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 59 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 60 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 61 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 62 len 127 threadIdx.x 30
debugging mask 80000000 lane 31 i 95 len 127 threadIdx.x 31
debugging mask 7fffffff lane 0 i 64 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 65 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 66 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 67 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 68 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 69 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 70 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 71 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 72 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 73 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 74 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 75 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 76 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 77 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 78 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 79 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 80 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 81 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 82 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 83 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 84 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 85 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 86 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 87 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 88 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 89 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 90 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 91 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 92 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 93 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 94 len 127 threadIdx.x 30
debugging mask 7fffffff lane 0 i 96 len 127 threadIdx.x 0
debugging mask 7fffffff lane 1 i 97 len 127 threadIdx.x 1
debugging mask 7fffffff lane 2 i 98 len 127 threadIdx.x 2
debugging mask 7fffffff lane 3 i 99 len 127 threadIdx.x 3
debugging mask 7fffffff lane 4 i 100 len 127 threadIdx.x 4
debugging mask 7fffffff lane 5 i 101 len 127 threadIdx.x 5
debugging mask 7fffffff lane 6 i 102 len 127 threadIdx.x 6
debugging mask 7fffffff lane 7 i 103 len 127 threadIdx.x 7
debugging mask 7fffffff lane 8 i 104 len 127 threadIdx.x 8
debugging mask 7fffffff lane 9 i 105 len 127 threadIdx.x 9
debugging mask 7fffffff lane 10 i 106 len 127 threadIdx.x 10
debugging mask 7fffffff lane 11 i 107 len 127 threadIdx.x 11
debugging mask 7fffffff lane 12 i 108 len 127 threadIdx.x 12
debugging mask 7fffffff lane 13 i 109 len 127 threadIdx.x 13
debugging mask 7fffffff lane 14 i 110 len 127 threadIdx.x 14
debugging mask 7fffffff lane 15 i 111 len 127 threadIdx.x 15
debugging mask 7fffffff lane 16 i 112 len 127 threadIdx.x 16
debugging mask 7fffffff lane 17 i 113 len 127 threadIdx.x 17
debugging mask 7fffffff lane 18 i 114 len 127 threadIdx.x 18
debugging mask 7fffffff lane 19 i 115 len 127 threadIdx.x 19
debugging mask 7fffffff lane 20 i 116 len 127 threadIdx.x 20
debugging mask 7fffffff lane 21 i 117 len 127 threadIdx.x 21
debugging mask 7fffffff lane 22 i 118 len 127 threadIdx.x 22
debugging mask 7fffffff lane 23 i 119 len 127 threadIdx.x 23
debugging mask 7fffffff lane 24 i 120 len 127 threadIdx.x 24
debugging mask 7fffffff lane 25 i 121 len 127 threadIdx.x 25
debugging mask 7fffffff lane 26 i 122 len 127 threadIdx.x 26
debugging mask 7fffffff lane 27 i 123 len 127 threadIdx.x 27
debugging mask 7fffffff lane 28 i 124 len 127 threadIdx.x 28
debugging mask 7fffffff lane 29 i 125 len 127 threadIdx.x 29
debugging mask 7fffffff lane 30 i 126 len 127 threadIdx.x 30

please format your code correctly. One possible approach:

  • click the pencil icon below your post to edit it
  • in the edit window, select your code
  • at the top of the edit window, click the </> button
  • save your edits

It has to do with loop unrolling by the compiler. The compiler has made a decision about loop unrolling, and coupled with that must test each thread at entry to the unrolled area, to determine whether that thread can complete a single pass of the unrolled area (which would be multiple of the original loop iterations). if it cannot, it is handled separately. I’m not going to work through all the SASS code for you. If you want some evidence of this, compile your code with this added line:

#pragma unroll 1

before the for-loop in the kernel.

Because of this decision at the entry to the unrolled region, the divergence is pushed “up” by a number of loop iterations, probably equal to something like the unrolled depth.

In the future, please follow my suggestions for code formatting. By preceding each line with >, you have not formatted your code properly. That is a markup indicating “excerpting”. As a result of using that, you have not created scroll boxes for code (making your post unnecessarily long) and have omitted other features such as proper code highlighting.

Thanks a lot for your reply.

#pragma unroll 1

I tried this line.
It does make difference.
I will try to figfure it out more details tomorrow since it is midnight in china.
I have tried SASS last Friday, but I failed to fully interpret them.

VOTE.ANY R7, PT, PT

I guess the above line implements __activemask(), right?

active mask is in the category of warp voting functions, so my guess is that it uses a VOTE (SASS) instruction, probably.


$ cat t2063.cu
__device__ unsigned t;

__global__ void k(){

  t = __activemask();
}
$ nvcc -c t2063.cu
$ cuobjdump -sass t2063.o

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z1kv
        .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                           /* 0x001fc400fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;  /* 0x4c98078000870001 */
        /*0010*/                   MOV32I R2, 0x0 ;        /* 0x010000000007f002 */
        /*0018*/                   MOV32I R3, 0x0 ;        /* 0x010000000007f003 */
                                                           /* 0x001fbc00fe2007f2 */
        /*0028*/                   VOTE.ANY R0, PT, PT ;   /* 0x50d9e38000070000 */
        /*0030*/                   STG.E [R2], R0 ;        /* 0xeedc200000070200 */
        /*0038*/                   NOP ;                   /* 0x50b0000000070f00 */
                                                           /* 0x001ffc00fd0007ef */
        /*0048*/                   NOP ;                   /* 0x50b0000000070f00 */
        /*0050*/                   NOP ;                   /* 0x50b0000000070f00 */
        /*0058*/                   EXIT ;                  /* 0xe30000000007000f */
                                                           /* 0x001f8000fc0007ff */
        /*0068*/                   BRA 0x60 ;              /* 0xe2400fffff07000f */
        /*0070*/                   NOP;                    /* 0x50b0000000070f00 */
        /*0078*/                   NOP;                    /* 0x50b0000000070f00 */
                ..........



Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$