I'm prototyping some comparative reduction kernels using cooperative groups, following the principles I just learned from here. That is the developer blog post that showcases a simple 1-D add-reduction example. Using the CUDA documentation, I managed to extend those ideas into a simple 1-D comparison strategy with integers (since the documentation advises using 32-bit types). However, when I tried to extend the same ideas to a 2-D grid (I'm interested in 2-D pitched arrays), I got some funny behavior out of it.
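For context, the 1-D pattern from that blog post looks roughly like this (a minimal sketch written from memory, assuming the same includes and the cg namespace alias as in my full snippet below; the function names are mine, not necessarily the blog's):

// Sketch of the blog's 1-D add-reduction pattern (paraphrased; names are mine):
__device__ int thread_sum_1d(int *input, int n)
{
    int sum = 0;
    // Grid-stride loop: each thread accumulates a partial sum
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        sum += input[i];
    return sum;
}

__device__ int reduce_sum_1d(cg::thread_group g, int *temp, int val)
{
    int lane = g.thread_rank();
    // Shared-memory tree reduction: halve the number of active threads each step
    for (int i = g.size() / 2; i > 0; i /= 2) {
        temp[lane] = val;
        g.sync();                            // wait for all threads to store
        if (lane < i) val += temp[lane + i];
        g.sync();                            // wait for all threads to load
    }
    return val;                              // only thread 0 holds the full sum
}

__global__ void sum_kernel_1d(int *sum, int *input, int n)
{
    extern __shared__ int temp[];
    int my_sum = thread_sum_1d(input, n);
    auto g = cg::this_thread_block();
    int block_sum = reduce_sum_1d(g, temp, my_sum);
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
}

My 2-D version below follows the same structure: a per-thread grid-stride pass, a shared-memory block reduction, and a final atomicAdd per block.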
I'm sharing my snippet so that any CUDA expert can spot the mistake / conceptual misunderstanding in my use of cooperative-groups principles:
#include <iostream>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cooperative_groups.h>
namespace default_operators {
    template<class T>
    struct equal { __device__ constexpr int operator()(const T& a, const T& b) const { return (a == b ? 1 : 0); } };
    template<class T>
    struct lt { __device__ constexpr int operator()(const T& a, const T& b) const { return (a < b ? 1 : 0); } };
    template<class T>
    struct gt { __device__ constexpr int operator()(const T& a, const T& b) const { return (a > b ? 1 : 0); } };
}
namespace cg = cooperative_groups;
// Per-thread partial reduction: a 2-D grid-stride loop over the ni x nj region,
// multiplying together the 0/1 results of op() for every element this thread visits
template<class T, class OP>
__device__ int thread_group_2d(T *a, size_t lda, T *b, size_t ldb, size_t ni, size_t nj, OP op)
{
    int reduced = 1; // Assuming 'true' as initial condition
    for (size_t j = blockIdx.x * blockDim.x + threadIdx.x; j < nj; j += blockDim.x * gridDim.x) {
        for (size_t i = blockIdx.y * blockDim.y + threadIdx.y; i < ni; i += blockDim.y * gridDim.y) {
            reduced *= op(a[i * lda + j], b[i * ldb + j]);
        }
    }
    return reduced;
}
template<class T, class OP>
__device__ int reduce_group(cg::thread_group g, T *temp, T val, OP op)
{
    int lane = g.thread_rank();
    // Each iteration halves the number of active threads
    // Each thread combines its value with its partner's value at temp[lane + i]
    for (int i = g.size() / 2; i > 0; i /= 2)
    {
        temp[lane] = val;
        g.sync(); // wait for all threads to store
        if (lane < i) val *= op(val, temp[lane + i]);
        g.sync(); // wait for all threads to load
    }
    return val; // note: only thread 0 will return the full reduction
}
template<class T>
__global__ void reduceCompare(int* output, T* a, size_t lda, T* b, size_t ldb, size_t ni, size_t nj)
{
    auto op = default_operators::equal<T>();
    // auto op = default_operators::lt<T>();
    // auto op = default_operators::gt<T>();
    extern __shared__ int temp[];
    int my_reduce = thread_group_2d(a, lda, b, ldb, ni, nj, op);
    auto g = cg::this_thread_block();
    int block_reduce = reduce_group(g, temp, my_reduce, op);
    if (g.thread_rank() == 0) atomicAdd(output, block_reduce);
}
int main()
{
    // Padded array parameters
    int NI = 33;
    int NJ = 33;
    int LD = 92;
    std::cout << "array size : " << NI * NJ << "\n";
    using scalar = float;
    // Allocate and zero host arrays (padded to leading dimension LD)
    scalar* data1 = (scalar*) malloc(NI * LD * sizeof(scalar));
    scalar* data2 = (scalar*) malloc(NI * LD * sizeof(scalar));
    memset(data1, 0, NI * LD * sizeof(scalar));
    memset(data2, 0, NI * LD * sizeof(scalar));
    // Fill arrays
    for (size_t i = 0; i < NI; i++) {
        for (size_t j = 0; j < NJ; j++) {
            data1[i * LD + j] = 1./3.;
            data2[i * LD + j] = 1./3.;
        }
    }
    // but ...
    data1[10*LD+10] = 0.331; // in block (0,0)
    data1[32*LD+11] = 0.332; // in block (1,0)
    data1[32*LD+32] = 0.333; // in block (1,1)
    data1[15*LD+32] = 0.334; // in block (0,1)
    scalar* d_data1;
    scalar* d_data2;
    cudaMalloc(&d_data1, NI * LD * sizeof(scalar));
    cudaMalloc(&d_data2, NI * LD * sizeof(scalar));
    cudaMemcpy(d_data1, data1, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data2, data2, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);
    int* d_out;
    cudaMalloc(&d_out, sizeof(int));
    cudaMemset(d_out, 0, sizeof(int));
    dim3 blockSize(32, 32, 1);
    dim3 nBlocks;
    nBlocks.x = (NI + blockSize.x - 1) / blockSize.x;
    nBlocks.y = (NJ + blockSize.y - 1) / blockSize.y;
    nBlocks.z = 1;
    size_t sharedBytes = blockSize.x * blockSize.y * sizeof(scalar);
    reduceCompare<<<nBlocks, blockSize, sharedBytes>>>(d_out, d_data1, LD, d_data2, LD, NI, NJ);
    int out = 0;
    cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    // std::cout << "answer : " << out << "\n"; // this reports answer = 1, 2, 3 or 4 depending on the values of data1 and data2
    std::cout << "answer / nBlocks : " << out << " / " << nBlocks.x * nBlocks.y << "\n";
    // std::cout << "answer : " << ((out == nBlocks.x * nBlocks.y) ? 1 : 0) << "\n";
    cudaFree(d_data1);
    cudaFree(d_data2);
    cudaFree(d_out);
    free(data1);
    free(data2);
}
In the above snippet I'm basically doing an equality comparison between floating-point variables (not the greatest idea), but since I'm not operating on the data (it's just a simple copy from host to device), the idea holds well enough for illustrating the prototype.
Notice that, depending on the values in data1 and data2, my expectation is that the program returns a single integer, either 0 or 1. However, it returns one of the integers 0, 1, 2, 3, ..., N, where N is the number of blocks that my data spans.
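To make that expectation concrete, here is the host-side reference I have in mind (an illustrative loop reusing NI, NJ, LD, data1 and data2 from main above; it is not part of my actual code):

// Illustrative host-side reference (not in the snippet above):
// 'expected' ends up 1 only if every element in the NI x NJ region matches.
int expected = 1;
for (int i = 0; i < NI; i++)
    for (int j = 0; j < NJ; j++)
        expected *= (data1[i * LD + j] == data2[i * LD + j]) ? 1 : 0;
// I expect the kernel's 'out' to match this 0-or-1 value.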
In my 1-D scenario this didn’t happen!
Thus, it begs the question... Why? What am I missing?
Best regards,
-M