Hi,

I have written a shuffle reduction function that works well for warp size 32, so I can quickly reduce arrays of size <=32. I have a need to reduce arrays of size 64. So I modified the function to call two shuffle functions. The first shuffle reduces elements 0-31 correctly, and the second shuffle is supposed to reduce elements 32-63. Instead, the second shuffle seems to add a single element to the sum 32 times. I.e if element 32 contains a 10, then the “sum” output for the second array will become 320. I have provided the full code, just call shuffleReduce() from a main function.

I am aware that shuffle functions are bound by the max warp size (32) so I made blockDim.x and blockDim.y to both be 32. I.e 32 threads in X directions is one warp, and 32 threads in Y direction is another warp. Maybe my logic is wrong, but this should allow me to do two shuffles within the same kernel at the same time, as long as the shuffles do not need to share memory.

So can anyone provide any insight as to why sumY does not appear to be calculating the sum of the row, even though sumX works perfectly? Please note nvcc provides a warning that sumY is set but never referenced…

Thanks for your time!

```
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
__global__ void shuffleReduction(float * d_frame, float * d_colVector)
{
const int SIZE = 1;
int laneIDx = threadIdx.x%32;
int laneIDy = threadIdx.y%32;
const int MAX_ROW_SIZE = 64;
float sumX[MAX_ROW_SIZE*(SIZE)];
float sumY[MAX_ROW_SIZE*(SIZE)];
float sum[MAX_ROW_SIZE*(SIZE)];
sumX[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] = d_frame[laneIDx+(64*blockIdx.z)+(64*MAX_ROW_SIZE*blockIdx.y)]; //set up lanes to point to elements 0-31
sumY[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] = d_frame[32+laneIDy+(64*blockIdx.z)+(64*MAX_ROW_SIZE*blockIdx.y)]; // set up lanes to point to elements 32-63
__syncthreads();
unsigned mask = 0xffffffff;
for (int i=16; i>=1; i/=2)
{
sumX[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] += __shfl_down_sync(mask, sumX[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)], i); // this shuffle outputs correct sum
sumY[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] += __shfl_down_sync(mask, sumY[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)], i); // this shuffle outputs incorrect sum
}
__syncthreads();
if (threadIdx.x ==0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.y == 0)
{
printf("sumX[0] = %f\t (correct, expected 528)\n",sumX[0]); //this sum is correct (should be 528)
printf("sumY[0] = %f\t (incorrect, expected 1552)\n",sumY[0]); //this sum is incorrect (should be 2080-528 = 1552)
}
//if (laneIDx==0 && laneIDy ==0)
sum[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] = sumX[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] + sumY[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)];
__syncthreads();
if (threadIdx.x==0 && threadIdx.y ==0)
{
d_colVector[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)] = sum[blockIdx.z+(MAX_ROW_SIZE*blockIdx.y)];
}
__syncthreads();
if (threadIdx.x ==0 && blockIdx.y == 0 && blockIdx.z == 0 && threadIdx.y == 0)
printf("d_colVector[0] = %f \t (incorrect, expected 2080)\n",d_colVector[0]); //this sum should be 528+1552 = 2080
}
void shuffleReduce()
{
const int SIZE = 1;
float * d_frame;
float * d_colVector;
float h_frame[64*64*SIZE] = {};
float h_colVector[64*SIZE] = {};
int counter = 1;
// fill each row with 1,2,3,4,....,64
for (int i = 0 ; i < 64*64*(SIZE) ; i++)
{
h_frame[i] = counter;
if (counter < 64)
counter++;
else
counter = 1;
}
cudaMallocManaged((void**) &d_frame, 64*(SIZE)*64*sizeof(float));
cudaMallocManaged((void**) &d_colVector, 64*(SIZE)*sizeof(float));
memcpy(d_frame, h_frame, 64*(SIZE)*64*sizeof(float));
//launch the kernel
shuffleReduction<<<dim3(1,SIZE,64), dim3(32,32,1)>>>(d_frame, d_colVector);
cudaDeviceSynchronize();
memcpy(h_colVector, d_colVector, 64*(SIZE)*sizeof(float));
cudaFree(d_frame);
cudaFree(d_colVector);
}
```