How do mask work in warp primitive?

Here is my code:

#include<iostream>
#include<stdio.h>

#define FULL_MASK 0x0000000D
// #define FULL_MASK 0xC0000000

__global__ void test_shfl_sync(int* out) {
  int tid = threadIdx.x;

  unsigned int writemask = FULL_MASK;
  unsigned int total = __popc(writemask);
  // Find the lowest-numbered active lane
  int elected_lane = __ffs(writemask) - 1;
  int val = 9;
  val = __shfl_sync(writemask, 20, elected_lane);
  printf("writemask: %u, total: %u, elected_lane: %d, val: %d\n", writemask, total, elected_lane, val);
  out[tid] = val;
}


int main(){
  const int grid = 1;
  const int block = 32;
  int *o;
  cudaMalloc((void **)&o, sizeof(int)*block*grid);
  test_shfl_sync<<<grid, block>>>(o);
  int out_host[block*grid];
  cudaMemcpy(out_host, o, sizeof(int)*block*grid, cudaMemcpyDeviceToHost);
  for (int i=0; i<block*grid; i++) {
    printf("%4d", out_host[i]);
    if ((i + 1) % 8 == 0) printf("\n");
  }
  return 0;
}

I just want some threads which is set to 1 by mask param return 20. But all threads in the warp return 20 no matter what mask is set.
Any help would be appreciated.

Your code has undefined behavior. Each thread that calls shfl_sync must have its bit set in the mask.

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#id36

The new *_sync shfl intrinsics take in a mask indicating the threads participating in the call. A bit, representing the thread’s lane id, must be set for each participating thread to ensure they are properly converged before the intrinsic is executed by the hardware. Each calling thread must have its own bit set in the mask and all non-exited threads named in mask must execute the same intrinsic with the same mask, or the result is undefined.

Do you mean the mask is wrong?
Can you give me a sample of the correct mask which specify threadIdx == [0, 2, 3] ?
0x0000000D in binary is 0000 0000 0000 0000 0000 0000 0000 1101
I can’t find what’s wrong.

For example, you also call shfl_sync with thread 5, but its bit is not set in mask.
What is the goal you are trying to achieve? A simple if thread is 0,2,or 3, val = 20 else val = 9 ?

#include<iostream>
#include<stdio.h>

__global__ void test_shfl_sync(int* out) {
  int tid = threadIdx.x;

  int val = 9;
  if (threadIdx.x == 0 || threadIdx.x == 2 || threadIdx.x == 3) {
    val = __shfl_sync(0x0000000D, 20, 0);
  }
  out[tid] = val;
}


int main(){
  const int grid = 1;
  const int block = 32;
  int *o;
  cudaMalloc((void **)&o, sizeof(int)*block*grid);
  test_shfl_sync<<<grid, block>>>(o);
  int out_host[block*grid];
  cudaMemcpy(out_host, o, sizeof(int)*block*grid, cudaMemcpyDeviceToHost);
  for (int i=0; i<block*grid; i++) {
    printf("%4d", out_host[i]);
    if ((i + 1) % 8 == 0) printf("\n");
  }
  return 0;
}

@striker159 yes, I mean if thread is 0,2,or 3, val = 20 else val = 9
And I get what I want. Thank you.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.