Hello,
I’d like to do a sum reduction over 6 variables (var1, var2, …, var6) using Warp Shuffle, but I only seem to get correct results for var1.
__inline__ __device__ double warpSumReduce(double val)
{
    for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
    {
        val += __shfl_down_sync(FULL_MASK, val, offset);
    }
    return val;
}
// Each warp performs partial reduction
var1 = warpSumReduce(var1);
var2 = warpSumReduce(var2);
var3 = warpSumReduce(var3);
var4 = warpSumReduce(var4);
var5 = warpSumReduce(var5);
var6 = warpSumReduce(var6);
// Write reduced value to shared memory
if (LaneIdx == 0)
{
    var1_Array[WidthIdx] = var1;
    var2_Array[WidthIdx] = var2;
    var3_Array[WidthIdx] = var3;
    var4_Array[WidthIdx] = var4;
    var5_Array[WidthIdx] = var5;
    var6_Array[WidthIdx] = var6;
}
// Wait for all partial reductions
__syncthreads();
// Read from shared memory only if that warp existed
var1 = (threadIdx.y < blockDim.y / WARPSIZE) ? var1_Array[LaneIdx] : 0.0;
var2 = (threadIdx.y < blockDim.y / WARPSIZE) ? var2_Array[LaneIdx] : 0.0;
var3 = (threadIdx.y < blockDim.y / WARPSIZE) ? var3_Array[LaneIdx] : 0.0;
var4 = (threadIdx.y < blockDim.y / WARPSIZE) ? var4_Array[LaneIdx] : 0.0;
var5 = (threadIdx.y < blockDim.y / WARPSIZE) ? var5_Array[LaneIdx] : 0.0;
var6 = (threadIdx.y < blockDim.y / WARPSIZE) ? var6_Array[LaneIdx] : 0.0;
if (WidthIdx == 0)
{
    // Final reduce within first warp
    var1 = warpSumReduce(var1);
    var2 = warpSumReduce(var2);
    var3 = warpSumReduce(var3);
    var4 = warpSumReduce(var4);
    var5 = warpSumReduce(var5);
    var6 = warpSumReduce(var6);
}
I don’t have any trouble when I build a complete code around what you have shown. Yes, I made a few changes, e.g. using threadIdx.x instead of threadIdx.y; you can decide whether those are important or not. There is no way for me to tell, since you haven’t provided a complete example. The problem may lie in something you haven’t shown.
In the future, if you want help with something like this, my suggestion is to provide a complete example.
$ cat t2055.cu
#include <cstdio>
const int WARPSIZE = 32;
const unsigned FULL_MASK = 0xFFFFFFFFU;
__inline__ __device__ double warpSumReduce(double val)
{
    for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
    {
        val += __shfl_down_sync(FULL_MASK, val, offset);
    }
    return val;
}
typedef float mt;
__global__ void k(){
    __shared__ mt var1_Array[32];
    __shared__ mt var2_Array[32];
    __shared__ mt var3_Array[32];
    __shared__ mt var4_Array[32];
    __shared__ mt var5_Array[32];
    __shared__ mt var6_Array[32];
    mt var1 = 1;
    mt var2 = 2;
    mt var3 = 3;
    mt var4 = 4;
    mt var5 = 5;
    mt var6 = 6;
    int LaneIdx = threadIdx.x % 32;
    int WidthIdx = threadIdx.x / 32;
    // Each warp performs partial reduction
    var1 = warpSumReduce(var1);
    var2 = warpSumReduce(var2);
    var3 = warpSumReduce(var3);
    var4 = warpSumReduce(var4);
    var5 = warpSumReduce(var5);
    var6 = warpSumReduce(var6);
    // Write reduced value to shared memory
    if (LaneIdx == 0)
    {
        var1_Array[WidthIdx] = var1;
        var2_Array[WidthIdx] = var2;
        var3_Array[WidthIdx] = var3;
        var4_Array[WidthIdx] = var4;
        var5_Array[WidthIdx] = var5;
        var6_Array[WidthIdx] = var6;
    }
    // Wait for all partial reductions
    __syncthreads();
    // Read from shared memory only if that warp existed
    var1 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var1_Array[LaneIdx] : 0.0;
    var2 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var2_Array[LaneIdx] : 0.0;
    var3 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var3_Array[LaneIdx] : 0.0;
    var4 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var4_Array[LaneIdx] : 0.0;
    var5 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var5_Array[LaneIdx] : 0.0;
    var6 = (threadIdx.x < (blockDim.x / WARPSIZE)) ? var6_Array[LaneIdx] : 0.0;
    if (WidthIdx == 0)
    {
        // Final reduce within first warp
        var1 = warpSumReduce(var1);
        var2 = warpSumReduce(var2);
        var3 = warpSumReduce(var3);
        var4 = warpSumReduce(var4);
        var5 = warpSumReduce(var5);
        var6 = warpSumReduce(var6);
    }
    if ((WidthIdx == 0) && (LaneIdx == 0))
        printf("%f, %f, %f, %f, %f, %f\n", var1, var2, var3, var4, var5, var6);
}
int main(){
    k<<<1,1024>>>();
    cudaDeviceSynchronize();
}
$ nvcc -o t2055 t2055.cu
$ compute-sanitizer ./t2055
========= COMPUTE-SANITIZER
1024.000000, 2048.000000, 3072.000000, 4096.000000, 5120.000000, 6144.000000
========= ERROR SUMMARY: 0 errors
$
An implication of your code design (at least what you have shown) is that the data set sizes to be reduced must (see the sketch below this list for one way to relax these restrictions):
- all be the same
- all be a multiple of 32
- be less than or equal to the number of threads in the block
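If those restrictions don’t fit your use case, a common pattern is to have each thread first accumulate its share of the input with a grid-stride loop, so that the warp/block reduction afterwards always operates on exactly blockDim.x values regardless of the data size. Here is a minimal sketch of that idea; the kernel name sumArbitrary and the launch configuration are arbitrary choices for illustration, and atomicAdd on double assumes a GPU of compute capability 6.0 or higher.
#include <cstdio>
const int WARPSIZE = 32;
const unsigned FULL_MASK = 0xFFFFFFFFU;
__inline__ __device__ double warpSumReduce(double val)
{
    for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(FULL_MASK, val, offset);
    return val;
}
// Hypothetical kernel: sums n doubles, where n can be any size.
// The block size must still be a multiple of 32 so every warp is full.
__global__ void sumArbitrary(const double *in, double *out, size_t n)
{
    __shared__ double partial[32];  // one slot per warp (up to 1024 threads)
    double val = 0.0;
    // Grid-stride loop: each thread accumulates as many elements as needed,
    // so the reduction below always sees exactly blockDim.x values
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += (size_t)gridDim.x * blockDim.x)
        val += in[i];
    int LaneIdx = threadIdx.x % WARPSIZE;
    int WidthIdx = threadIdx.x / WARPSIZE;
    val = warpSumReduce(val);                   // per-warp partial sum
    if (LaneIdx == 0) partial[WidthIdx] = val;  // one partial per warp
    __syncthreads();
    // First warp reduces the per-warp partials; missing warps contribute 0
    val = (threadIdx.x < blockDim.x / WARPSIZE) ? partial[LaneIdx] : 0.0;
    if (WidthIdx == 0) val = warpSumReduce(val);
    // Combine across blocks (double atomicAdd requires cc 6.0+)
    if (threadIdx.x == 0) atomicAdd(out, val);
}
int main(){
    const size_t n = 1000;  // deliberately not a multiple of 32
    double *in, *out;
    cudaMallocManaged(&in, n * sizeof(double));
    cudaMallocManaged(&out, sizeof(double));
    for (size_t i = 0; i < n; i++) in[i] = 1.0;
    *out = 0.0;
    sumArbitrary<<<2, 256>>>(in, out, n);
    cudaDeviceSynchronize();
    printf("%f\n", *out);  // expect 1000.000000
}
The grid-stride loop is what removes the restrictions on the data size; the only remaining requirement is that the block size itself be a multiple of 32, so that every warp calling warpSumReduce with FULL_MASK is fully populated.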
@Robert_Crovella Thank you for your response. Never mind! In fact, the unexpected results were coming from somewhere else (completely unrelated to the CUDA warp shuffle functions). I mistakenly thought the Warp Shuffle was the culprit.
Have a great day!
Abdoulaye