Use __shfl_down_sync for multiple variables?

Hello,

I’d like to do a sum reduction for 6 variables (var1, var2, …, var6) using Warp Shuffle, but apparently I only get correct results for var1.

__inline__ __device__ double warpSumReduce(double val)
{
	for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
	{
		val += __shfl_down_sync(FULL_MASK, val, offset);
	}
	return val;
}

// Each warp performs partial reduction
	var1 = warpSumReduce(var1);
	var2 = warpSumReduce(var2);
	var3 = warpSumReduce(var3);
	var4 = warpSumReduce(var4);
	var5 = warpSumReduce(var5);
	var6 = warpSumReduce(var6);
	// Write reduced value to shared memory
	if (LaneIdx == 0)
	{
		var1_Array[WidthIdx] = var1;
		var2_Array[WidthIdx] = var2;
		var3_Array[WidthIdx] = var3;
		var4_Array[WidthIdx] = var4;
		var5_Array[WidthIdx] = var5;
		var6_Array[WidthIdx] = var6;
	}
	// Wait for all partial reductions
	__syncthreads();
	//read from shared memory only if that warp existed
	var1 = (threadIdx.y < blockDim.y / WARPSIZE) ? var1_Array[LaneIdx] : 0.0;
	var2 = (threadIdx.y < blockDim.y / WARPSIZE) ? var2_Array[LaneIdx] : 0.0;
	var3 = (threadIdx.y < blockDim.y / WARPSIZE) ? var3_Array[LaneIdx] : 0.0;
	var4 = (threadIdx.y < blockDim.y / WARPSIZE) ? var4_Array[LaneIdx] : 0.0;
	var5 = (threadIdx.y < blockDim.y / WARPSIZE) ? var5_Array[LaneIdx] : 0.0;
	var6 = (threadIdx.y < blockDim.y / WARPSIZE) ? var6_Array[LaneIdx] : 0.0;
	if (WidthIdx == 0)
	{
		//Final reduce within first warp
		var1 = warpSumReduce(var1);
		var2 = warpSumReduce(var2);
		var3 = warpSumReduce(var3);
		var4 = warpSumReduce(var4);
		var5 = warpSumReduce(var5);
		var6 = warpSumReduce(var6);
	}

I don’t seem to have any trouble with my attempt at building a complete example around what you have shown. Yes, I made a few changes, e.g. using threadIdx.x instead of threadIdx.y. You can decide whether those are important or not; there is no way for me to tell, since you haven’t provided a complete example. The problem may lie in something you haven’t shown.

In the future, if you want help with something like this, my suggestion is to provide a complete example.

$ cat t2055.cu
#include <cstdio>
const int WARPSIZE = 32;
const unsigned FULL_MASK = 0xFFFFFFFFU;

__inline__ __device__ double warpSumReduce(double val)
{
        for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
        {
                val += __shfl_down_sync(FULL_MASK, val, offset);
        }
        return val;
}
typedef float mt;
__global__ void k(){
    __shared__ mt var1_Array[32];
    __shared__ mt var2_Array[32];
    __shared__ mt var3_Array[32];
    __shared__ mt var4_Array[32];
    __shared__ mt var5_Array[32];
    __shared__ mt var6_Array[32];

    mt var1 = 1;
    mt var2 = 2;
    mt var3 = 3;
    mt var4 = 4;
    mt var5 = 5;
    mt var6 = 6;
    int LaneIdx = threadIdx.x%32;
    int WidthIdx = threadIdx.x/32;

// Each warp performs partial reduction
        var1 = warpSumReduce(var1);
        var2 = warpSumReduce(var2);
        var3 = warpSumReduce(var3);
        var4 = warpSumReduce(var4);
        var5 = warpSumReduce(var5);
        var6 = warpSumReduce(var6);
        // Write reduced value to shared memory
        if (LaneIdx == 0)
        {
                var1_Array[WidthIdx] = var1;
                var2_Array[WidthIdx] = var2;
                var3_Array[WidthIdx] = var3;
                var4_Array[WidthIdx] = var4;
                var5_Array[WidthIdx] = var5;
                var6_Array[WidthIdx] = var6;
        }
        // Wait for all partial reductions
        __syncthreads();
        //read from shared memory only if that warp existed
        var1 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var1_Array[LaneIdx] : 0.0;
        var2 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var2_Array[LaneIdx] : 0.0;
        var3 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var3_Array[LaneIdx] : 0.0;
        var4 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var4_Array[LaneIdx] : 0.0;
        var5 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var5_Array[LaneIdx] : 0.0;
        var6 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var6_Array[LaneIdx] : 0.0;
        if (WidthIdx == 0)
        {
                //Final reduce within first warp
                var1 = warpSumReduce(var1);
                var2 = warpSumReduce(var2);
                var3 = warpSumReduce(var3);
                var4 = warpSumReduce(var4);
                var5 = warpSumReduce(var5);
                var6 = warpSumReduce(var6);
        }
        if ((WidthIdx == 0) && (LaneIdx == 0))
          printf("%f, %f, %f, %f, %f, %f\n", var1, var2, var3, var4, var5, var6);
}


int main(){

  k<<<1,1024>>>();
  cudaDeviceSynchronize();
}
$ nvcc -o t2055 t2055.cu
$ compute-sanitizer ./t2055
========= COMPUTE-SANITIZER
1024.000000, 2048.000000, 3072.000000, 4096.000000, 5120.000000, 6144.000000
========= ERROR SUMMARY: 0 errors
$

An implication of your code design (at least what you have shown) is that the data set sizes to be reduced must:

  • all be the same
  • all be a multiple of 32
  • be less than or equal to the number of threads in the block (one way to relax these constraints is sketched below)
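For what it’s worth, here is a minimal sketch of one common way to relax those constraints: keep the block size a multiple of 32, let each thread first accumulate its own partial sum over the data set with a block-stride loop, and then apply the same two-stage warp-shuffle reduction. This is only an illustration, not code from the thread above; the kernel name blockSum, the managed allocations, and N = 1000 are assumptions made for the example.

#include <cstdio>

const int WARPSIZE = 32;
const unsigned FULL_MASK = 0xFFFFFFFFU;

__inline__ __device__ double warpSumReduce(double val)
{
        for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
        {
                val += __shfl_down_sync(FULL_MASK, val, offset);
        }
        return val;
}

__global__ void blockSum(const double *data, int N, double *result)
{
        __shared__ double partial[WARPSIZE];   // one slot per warp (at most 32 warps per block)

        int LaneIdx  = threadIdx.x % WARPSIZE;
        int WidthIdx = threadIdx.x / WARPSIZE;

        // Per-thread accumulation: out-of-range indices contribute nothing,
        // so N does not have to be a multiple of 32 or match the block size.
        double val = 0.0;
        for (int i = threadIdx.x; i < N; i += blockDim.x)
                val += data[i];

        val = warpSumReduce(val);              // stage 1: per-warp partial sum
        if (LaneIdx == 0) partial[WidthIdx] = val;
        __syncthreads();

        if (WidthIdx == 0)                     // stage 2: first warp sums the per-warp partials
        {
                val = (LaneIdx < blockDim.x / WARPSIZE) ? partial[LaneIdx] : 0.0;
                val = warpSumReduce(val);
                if (LaneIdx == 0) *result = val;
        }
}

int main()
{
        const int N = 1000;                    // deliberately not a multiple of 32
        double *data, *result;
        cudaMallocManaged(&data, N * sizeof(double));
        cudaMallocManaged(&result, sizeof(double));
        for (int i = 0; i < N; i++) data[i] = 1.0;
        blockSum<<<1, 256>>>(data, N, result); // block size itself stays a multiple of 32
        cudaDeviceSynchronize();
        printf("%f\n", *result);               // expect 1000.000000
        cudaFree(data);
        cudaFree(result);
}

The key point is that every thread calling warpSumReduce belongs to a fully populated warp, so the FULL_MASK shuffles remain valid even though N is not a multiple of 32 and is unrelated to the block size. The same pattern would simply be repeated for each of the six variables.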

@Robert_Crovella Thank you for your response. Never mind! In fact, the unexpected results are coming from somewhere else (completely unrelated to the CUDA functions); I mistakenly thought that the Warp Shuffle was the culprit.

Have a great day!

Abdoulaye