I think I’ve managed to implement the mean reduction of an array of arbitrary size. I pad an odd-length array with zeros up to the next power-of-two size and compute the mean in the last block, taking this padding into account. I tested it with different array lengths, block counts, and thread counts. In prod, I’ll change the thread block tile size from 4 to 32.

I got the idea of how to detect the last block from the threadFenceReduction CUDA example. Incidentally, I’m uncertain whether I need to call the `__threadfence()` intrinsic; everything appears to work even without it.

```
#include "cuda_runtime.h"
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <device_atomic_functions.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
// This is for Visual Studio Intellisence to work
#ifndef __CUDACC__
#define __CUDACC__
#endif // !__CUDACC__
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
/// Compute the mean of the first `length` elements of `vec`, cooperatively
/// across all threads of `tile`. (The original comment claimed "standard
/// deviation", but this function computes a mean.)
///
/// Every tile thread strides over `vec` accumulating a partial sum; the
/// partials are then combined with a tile-wide reduction. All threads of the
/// tile must call this function, and every thread receives the same result.
///
/// \param tile   thread-block tile performing the reduction
/// \param vec    data to average (typically a shared-memory buffer)
/// \param length number of elements to average; must be > 0
__device__ float reduce_mean(const cg::thread_block_tile<4>& tile, const float* vec, int length) {
    float thread_sum = 0.0f;
    // Tile-stride loop: lane r handles elements r, r + tileSize, r + 2*tileSize, ...
    for (int i = tile.thread_rank(); i < length; i += tile.num_threads()) {
        thread_sum += vec[i];
    }
    // cg::plus<float> lets cg::reduce() use hardware-accelerated addition
    // where the architecture provides it.
    return cg::reduce(tile, thread_sum, cg::plus<float>()) / length;
}
// Device-global ticket counter of retired (finished) blocks, following the
// CUDA threadFenceReduction sample (where it serves reduceSinglePass).
// NOTE(review): with a cooperative launch and grid.sync(), a last-block
// ticket is redundant — the grid-wide barrier already guarantees that every
// block has published its partial result before the final reduction runs.
__device__ unsigned int retirementCount = 0;
/// Grid-wide mean of data[0..length) where the last `odd_len` entries are
/// zero padding (the input was padded up to a power-of-two length).
///
/// Launch requirements:
///  - must be launched with cudaLaunchCooperativeKernel (uses grid.sync())
///  - dynamic shared memory: blockDim.x * sizeof(float)
///  - blockDim.x must be a multiple of the tile size (4)
///  - `mean` must hold gridDim.x floats; the final result is left in mean[0]
///
/// \param data    device array of `length` floats, zero-padded at the tail
/// \param length  padded length of `data`
/// \param odd_len number of zero-padding elements at the end of `data`
/// \param mean    scratch/output: per-block partial means; final mean in mean[0]
__global__ void statisticsKernelArbitrarySize(const float* data, int length, int odd_len, float* mean)
{
    cg::grid_group grid = cg::this_grid();
    cg::thread_block block = cg::this_thread_block();
    extern __shared__ float sdata[];
    // Grid-stride accumulation of per-thread partial sums into shared memory.
    sdata[block.thread_rank()] = 0.0f;
    for (int i = grid.thread_rank(); i < length; i += grid.size())
    {
        sdata[block.thread_rank()] += data[i];
    }
    cg::sync(block);
    // Reduce this block's shared buffer; every tile computes the same value,
    // so thread 0 can publish it as the block's partial mean.
    cg::thread_block_tile<4> tile = cg::tiled_partition<4>(block);
    float tile_mean = reduce_mean(tile, sdata, block.size());
    if (block.thread_rank() == 0)
        mean[blockIdx.x] = tile_mean;
    // Grid-wide barrier: after this, every block's partial mean is visible.
    // This replaces the retirementCount/amLast "last block" ticket from the
    // threadFenceReduction sample: that pattern (with __threadfence()) is only
    // needed WITHOUT a cooperative launch. Worse, reading block 0's __shared__
    // amLast from grid thread 0 was nondeterministic — it depended on whether
    // block 0 happened to draw the last ticket, so the padding correction was
    // applied only sometimes.
    grid.sync();
    if (grid.thread_rank() == 0)
    {
        // Sum of per-block means == total_sum / blockDim.x.
        for (int b = 1; b < grid.num_blocks(); b++) {
            mean[0] += mean[b];
        }
        // Normalize so that mean[0] = total_sum / (length - odd_len):
        //   num_blocks * (length - odd_len) / grid.size()
        //     == (length - odd_len) / blockDim.x.
        // The `odd_len` trailing zeros are padding and must ALWAYS be
        // excluded from the divisor, not just when this block "was last".
        mean[0] /= grid.num_blocks() * (float)(length - odd_len) / grid.size();
    }
}
// Host driver: uploads a small padded array, runs the cooperative mean
// kernel, and compares the device result against a CPU reference.
int main()
{
    constexpr int length = 16;   // padded length (power of two)
    constexpr int odd_len = 2;   // number of trailing zero-padding elements
    // 14 real values, zero-padded up to a power-of-two length.
    float h_data[length]{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 0 };
    float* d_data = nullptr;
    cudaMalloc(&d_data, sizeof(float) * length);
    // Blocking copy: h_data is pageable stack memory, so cudaMemcpyAsync
    // would be serialized anyway; plain cudaMemcpy states the intent.
    cudaMemcpy(d_data, h_data, sizeof(float) * length, cudaMemcpyHostToDevice);
    constexpr int n_blocks = 2;
    constexpr int n_threads = 4;   // must be a multiple of the tile size (4)
    dim3 dimBlock(n_threads, 1, 1);
    dim3 dimGrid(n_blocks, 1, 1);
    float* d_mean = nullptr;
    cudaMalloc(&d_mean, n_blocks * sizeof(float));
    // One float of dynamic shared memory per thread in a block.
    int sharedMemorySize = n_threads * sizeof(float);
    void* kernelArgs[] = { (void*)&d_data, (void*)&length, (void*)&odd_len, (void*)&d_mean };
    // grid.sync() in the kernel requires a cooperative launch.
    cudaError_t error = cudaLaunchCooperativeKernel(
        (void*)statisticsKernelArbitrarySize,
        dimGrid, dimBlock,
        kernelArgs,
        sharedMemorySize);
    if (error != cudaSuccess) {
        std::cerr << "Launch failed: " << cudaGetErrorString(error) << '\n';
        return 1;
    }
    // Surface asynchronous execution errors instead of discarding them.
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        std::cerr << "Kernel failed: " << cudaGetErrorString(error) << '\n';
        return 1;
    }
    float h_mean = 0;
    // Blocking D2H copy: the async variant gave no guarantee that h_mean was
    // ready before it was read below (pageable destination).
    cudaMemcpy(&h_mean, d_mean, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_data);
    cudaFree(d_mean);
    // CPU reference: mean over the real (unpadded) elements. The padding is
    // zeros, so summing all `length` entries and dividing by the real count
    // is correct.
    float expected = 0;
    for (int i = 0; i < length; i++)
    {
        expected += h_data[i];
    }
    std::cout << "\nExpected Mean: " << expected / (length - odd_len)
              << " Actual Mean: " << h_mean << '\n';
    return 0;
}
```