This would be a segmented reduction. In Thrust you would use reduce_by_key to perform all of the reductions in a single Thrust call.
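For fixed-size segments, the keys can be generated on the fly from a counting iterator plus a transform iterator, so no key array needs to be stored. Here is a minimal sketch of that idea (the names seg_key, K, M, d_in, and d_out are my own placeholders, sized to match the kernel example further down):

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <iostream>

// maps a flat element index to its segment number (segment size K)
struct seg_key{
  size_t K;
  seg_key(size_t K) : K(K) {}
  __host__ __device__ size_t operator()(size_t i) const { return i / K; }
};

int main(){
  const size_t K = 1048576, M = 200;              // segment size, number of segments
  thrust::device_vector<float> d_in(K*M, 1.0f);   // data to be reduced
  thrust::device_vector<float> d_out(M);          // one result per segment
  auto keys = thrust::make_transform_iterator(thrust::counting_iterator<size_t>(0), seg_key(K));
  thrust::reduce_by_key(keys, keys + K*M,
                        d_in.begin(),
                        thrust::make_discard_iterator(), // keys output not needed
                        d_out.begin());
  std::cout << d_out[0] << std::endl;              // expect 1048576
  return 0;
}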
In CUB, using a block reduce, each block would handle one segment (many elements). The canonical way to do this is to define a per-thread local array whose size, multiplied by the block size, is equal to or larger than the size of each segment you want to reduce. You do a CUB block load into that local array, then do the block reduce, specialized for the size of the local array. For large segments (segment size >> block size) this gets unwieldy, because the large local array is inconvenient. Alternatively, you could preface the CUB block reduce with a block-striding loop that condenses the segment to one partial value per thread, and then use the CUB example you linked more or less unmodified for the remainder of the kernel. The block-stride loop is shown by example in the code below.
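As a rough sketch of that second (block-stride) approach, something like the following could feed a cub::BlockReduce. This is my own illustration rather than code from the example you linked, and the kernel name, BLOCK_THREADS value, and launch configuration are placeholder choices:

#include <cub/cub.cuh>

// one block per segment of size K; each thread first accumulates a strided
// partial sum, then cub::BlockReduce combines the per-thread partials
template <typename T, int BLOCK_THREADS>
__global__ void cub_seg_red(const T * __restrict__ in, T * __restrict__ out, const size_t K){
  typedef cub::BlockReduce<T, BLOCK_THREADS> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  // block-stride loop: condense this block's segment to one partial sum per thread
  T thread_sum = 0;
  for (size_t i = threadIdx.x; i < K; i += BLOCK_THREADS)
    thread_sum += in[K*blockIdx.x + i];
  // block-wide reduction of the per-thread partial sums (result valid in thread 0)
  T block_sum = BlockReduceT(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) out[blockIdx.x] = block_sum;
}
// possible launch, one block per segment:  cub_seg_red<float, 256><<<M, 256>>>(d_in, d_out, K);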
Thrust should be pretty easy to implement with reduce_by_key. If it were me, and I wanted a CUB-like approach, I would probably just implement my own block-striding segmented reduction. I'm pretty sure I've posted examples of a block-stride segmented reduction here on this forum, but I can't locate one at the moment, so here is something:
$ cat t1530.cu
#include <iostream>
// designed to work only with threads per block = 1024
// K = size of segment
// warp shuffles may need adjustment for 64-bit T
template <typename T>
__global__ void seg_red(const T * __restrict__ in, T * __restrict__ out, const size_t K){
  __shared__ T sdata[32];
  int tid = threadIdx.x;
  unsigned mask = 0xFFFFFFFFU;
  int lane = threadIdx.x % warpSize;   // could be optimized for warpSize = 32
  int warpID = threadIdx.x / warpSize; // could be optimized for warpSize = 32
  size_t my_idx = threadIdx.x + K*blockIdx.x;
  T sum = 0;
  // block-stride loop: each thread accumulates its share of this block's segment
  while (my_idx < K*(blockIdx.x+1)){
    sum += in[my_idx];
    my_idx += blockDim.x;}
  // 1st warp-shuffle reduction
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sdata[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
    // reload val from shared mem
    sum = sdata[lane];
    // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
    if (tid == 0) out[blockIdx.x] = sum;
  }
}
typedef float myt;
int main(){
  size_t K = 1048576, M = 200; // segment size, number of segments (modifiable)
  myt *d_in, *d_out, *h_in, *h_out;
  h_in = new myt[K*M];
  h_out = new myt[M];
  cudaMalloc(&d_in, K*M*sizeof(myt));
  cudaMalloc(&d_out, M*sizeof(myt));
  for (size_t i = 0; i < K*M; i++) h_in[i] = 1;
  cudaMemcpy(d_in, h_in, K*M*sizeof(myt), cudaMemcpyHostToDevice);
  seg_red<<<M, 1024>>>(d_in, d_out, K); // one block per segment
  cudaMemcpy(h_out, d_out, M*sizeof(myt), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < M; i++)
    if (h_out[i] != K){
      std::cout << "mismatch at: " << i << " was: " << h_out[i] << " should be: " << K << std::endl;
      return 0;}
  std::cout << "success" << std::endl;
  return 0;
}
$ nvcc -o t1530 t1530.cu
$ cuda-memcheck ./t1530
========= CUDA-MEMCHECK
success
========= ERROR SUMMARY: 0 errors
$