#include<iostream>
#include<stdio.h>
#define FULL_MASK 0x0000000D
// #define FULL_MASK 0xC0000000
__global__ void test_shfl_sync(int* out) {
int tid = threadIdx.x;
unsigned int writemask = FULL_MASK;
unsigned int total = __popc(writemask);
// Find the lowest-numbered active lane
int elected_lane = __ffs(writemask) - 1;
int val = 9;
val = __shfl_sync(writemask, 20, elected_lane);
printf("writemask: %u, total: %u, elected_lane: %d, val: %d\n", writemask, total, elected_lane, val);
out[tid] = val;
}
int main(){
const int grid = 1;
const int block = 32;
int *o;
cudaMalloc((void **)&o, sizeof(int)*block*grid);
test_shfl_sync<<<grid, block>>>(o);
int out_host[block*grid];
cudaMemcpy(out_host, o, sizeof(int)*block*grid, cudaMemcpyDeviceToHost);
for (int i=0; i<block*grid; i++) {
printf("%4d", out_host[i]);
if ((i + 1) % 8 == 0) printf("\n");
}
return 0;
}
I just want some threads which is set to 1 by mask param return 20. But all threads in the warp return 20 no matter what mask is set.
Any help would be appreciated.
The new *_sync shfl intrinsics take in a mask indicating the threads participating in the call. A bit, representing the thread’s lane id, must be set for each participating thread to ensure they are properly converged before the intrinsic is executed by the hardware. Each calling thread must have its own bit set in the mask and all non-exited threads named in mask must execute the same intrinsic with the same mask, or the result is undefined.
Do you mean the mask is wrong?
Can you give me a sample of the correct mask which specify threadIdx == [0, 2, 3] ?
0x0000000D in binary is 0000 0000 0000 0000 0000 0000 0000 1101
I can’t find what’s wrong.
For example, you also call shfl_sync with thread 5, but its bit is not set in mask.
What is the goal you are trying to achieve? A simple if thread is 0,2,or 3, val = 20 else val = 9 ?