I get it, and sorry for not sending the full code at first; I didn't want to complicate things. I guard the reduction with a preprocessor check on TileSize: when it is greater than 32, the compiled variant partitions the tile into 32-wide subtiles and performs the reduction through those. I have also refactored the code to strip its external dependencies, so it won't match the original post exactly. Running compute-sanitizer --tool racecheck on this code (exact commands at the bottom of this post) reports a hazard when TileSize > 32 and is clean otherwise. However, I don't think there is actually a race, since there is a tile.sync() between the writes and the reads.
Here is the full code:
#include <stdio.h>
#include <cuda_runtime.h>
#include <cub/cub.cuh> // for the cub::Max functor
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#define BlockSize 128
#define TileSize 64 // > 32 here, so the subtile path below is compiled
#define SubTileSize 32
#if TileSize > 32
template <typename Op>
__device__ float warpReduce(cg::thread_block_tile<SubTileSize> tile, float value, Op operation)
{
    // Shuffle-based reduction within the 32-wide subtile; the result ends up in lane 0
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
    {
        value = operation(value, tile.shfl_down(value, offset));
    }
    return value;
}
// Generalized tile-based reduction function
template <typename Op>
__device__ void tileReduce(cg::thread_block_tile<TileSize> tile, float *orig_val, Op operation, float &red_val)
{
    float value = 0.0f; // assumes 0.0f is an identity for `operation` (true for cub::Max over the non-negative inputs used here)
    cg::thread_block_tile<SubTileSize> subtile = cg::tiled_partition<SubTileSize>(tile);
    // Only the first subtile strides over the whole tile's values and reduces them
    if (subtile.meta_group_rank() == 0)
    {
        subtile.sync();
        for (uint i = subtile.thread_rank(); i < TileSize; i += SubTileSize)
            value = operation(value, orig_val[i]);
        subtile.sync();
        value = warpReduce(subtile, value, operation);
    }
    tile.sync();
    if (tile.thread_rank() == 0) // tile rank 0 is lane 0 of subtile 0, which holds the final value
        red_val = value;
}
#else
template <typename Op>
__device__ void tileReduce(cg::thread_block_tile<TileSize> tile, float *orig_val, Op operation, float &red_val)
{
    // Intra-tile shuffle reduction; the result ends up in lane 0
    float value = orig_val[tile.thread_rank()];
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
    {
        value = operation(value, tile.shfl_down(value, offset));
    }
    tile.sync();
    if (tile.thread_rank() == 0)
        red_val = value;
}
#endif
// Kernel to perform reduction using tiles
template <typename Op>
__global__ void reduceKernel(float *input, float *output, int numElements, Op operation)
{
    __shared__ float orig_val[TileSize];
    __shared__ float red_val;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const uint group_rank = tid / TileSize; // same as tile.meta_group_rank() for this single-block launch
    cg::thread_block cta = cg::this_thread_block();
    cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(cta);
    // Perform tile-based reduction; only the first tile does the work
    if (group_rank == 0)
    {
        // Each tile thread seeds its shared slot, then folds its strided slice of the
        // input into it (assumes numElements >= TileSize)
        orig_val[tile.thread_rank()] = input[tile.thread_rank()];
        for (uint i = tile.thread_rank() + TileSize; i < numElements; i += TileSize)
            orig_val[tile.thread_rank()] = operation(orig_val[tile.thread_rank()], input[i]);
        tile.sync();
        tileReduce(tile, orig_val, operation, red_val); // reduction helper that uses tile.shfl_down (via subtiles when TileSize > 32)
        tile.sync();
        if (tile.thread_rank() == 0)
        {
            *output = red_val;
        }
        tile.sync();
    }
    if (group_rank == 0 && tile.thread_rank() == 0)
    {
        printf("S is %f\n", red_val);
    }
}
// Host-side code
int main()
{
    const int numElements = 1024;
    // Allocate and initialize host data
    float h_input[numElements];
    for (int i = 0; i < numElements; ++i)
    {
        h_input[i] = 1.0f; // simple input where each element is 1.0
    }
    h_input[2] = 2.0f; // change a few elements so the max is distinguishable
    h_input[16] = 5.0f;
    h_input[60] = 6.3f;
    // Allocate device memory
    float *d_input = nullptr;
    float *d_output = nullptr;
    cudaMalloc((void **)&d_input, numElements * sizeof(float));
    cudaMalloc((void **)&d_output, sizeof(float));
    // Copy input data to device
    cudaMemcpy(d_input, h_input, numElements * sizeof(float), cudaMemcpyHostToDevice);
    // Initialize output to zero
    float h_output = 0.0f;
    cudaMemcpy(d_output, &h_output, sizeof(float), cudaMemcpyHostToDevice);
    // Launch the kernel (using the cub::Max functor)
    // execKernel((reduceKernel), 1, BlockSize, 0, true,
    // d_input, d_output, numElements, cub::Max());
    reduceKernel<<<1, BlockSize>>>(d_input, d_output, numElements, cub::Max());
    // Copy the result back to host
    cudaMemcpy(&h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost);
    // Print the result
    printf("Reduction result: %f\n", h_output); // expect 6.3 for the max reduction above
    // Clean up
    cudaFree(d_input);
    cudaFree(d_output);
    return 0;
}
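For completeness, this is roughly how I build and run the check. File and binary names here are just examples; -arch=native needs a newer toolkit, otherwise pass your GPU's sm_XX explicitly:
nvcc -arch=native -o reduce reduce.cu
compute-sanitizer --tool racecheck ./reduce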
Edit: deleted my general reply and reposted this as a threaded reply.