# Reduction of Padded array using Cooperative Groups

I’m prototyping some comparison reduction kernels with `cooperative groups`, following the principles I just learned from here (the developer blog that showcases a simple 1-D add-reduction example). Using the CUDA documentation I managed to extend those ideas into a simple 1-D comparison strategy on integers (the documentation advises using 32-bit types). However, when I tried to extend the same ideas to a 2-D grid (because I’m interested in 2-D pitched arrays), I got some funny behavior.

I’m sharing my snippet so that a CUDA expert can spot the mistake / conceptual misunderstanding in my use of the `cooperative groups` principles:

``````
#include<iostream>
#include<cmath>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<cooperative_groups.h>

namespace default_operators {
template<class T>
struct equal{__device__ constexpr int operator()(const T& a, const T& b) const { return (a==b?1:0); }};
template<class T>
struct lt{__device__ constexpr int operator()(const T& a, const T& b) const { return (a<b?1:0); }};
template<class T>
struct gt{__device__ constexpr int operator()(const T& a, const T& b) const { return (a>b?1:0); }};
}

namespace cg = cooperative_groups;

template<class T, class OP>
__device__ int thread_group_2d(T *a, size_t lda, T *b, size_t ldb, size_t ni, size_t nj, OP op)
{
int reduced = 1; // Assuming 'true' as initial condition

for(size_t j = blockIdx.x * blockDim.x + threadIdx.x; j < nj; j += blockDim.x * gridDim.x) {
for(size_t i = blockIdx.y * blockDim.y + threadIdx.y; i < ni; i += blockDim.y * gridDim.y) {
reduced *= op(a[i * lda + j], b[i * ldb + j]);
}
}
return reduced;
}

template<class T, class OP>
__device__ int reduce_group(cg::thread_group g, T *temp, T val, OP op)
{

int lane = g.thread_rank();

// Each iteration halves the number of active threads
// Each active thread folds the partner value temp[lane + i] into its own val
for (int i = g.size() / 2; i > 0; i /= 2)
{
temp[lane] = val;
g.sync(); // wait for all threads to store
if(lane<i) val *= op(val, temp[lane + i]);
g.sync(); // wait for all threads to load before the next store
}
return val; // note: only thread 0 holds the full block reduction
}

template<class T>
__global__ void reduceCompare(int* output, T* a, size_t  lda, T* b, size_t ldb, size_t  ni, size_t  nj)
{
auto op = default_operators::equal<T>();
// auto op = default_operators::lt<T>();
// auto op = default_operators::gt<T>();

extern __shared__ int temp[];

cg::thread_block g = cg::this_thread_block();

int my_reduce = thread_group_2d(a, lda, b, ldb, ni, nj, op);

int block_reduce = reduce_group(g, temp, my_reduce, op);

if (g.thread_rank() == 0) atomicAdd(output, block_reduce);
}

int main()
{
int NI = 33;
int NJ = 33;
int LD = 92;
std::cout << "array size :" << NI * NJ << "\n";

using scalar = float;

// Populate array
scalar* data1 = (scalar*) malloc(NI * LD * sizeof(scalar));
scalar* data2 = (scalar*) malloc(NI * LD * sizeof(scalar));
memset(data1, 0, NI * LD * sizeof(scalar));
memset(data2, 0, NI * LD * sizeof(scalar));

// Fill array
for (size_t i = 0; i < NI; i++) {
for (size_t j = 0; j < NJ; j++) {
data1[i * LD + j] = 1./3.;
data2[i * LD + j] = 1./3.;
}
}
// but ...
data1[10*LD+10] = 0.331; // in block (0,0)
data1[32*LD+11] = 0.332; // in block (1,0)
data1[32*LD+32] = 0.333; // in block (1,1)
data1[15*LD+32] = 0.334; // in block (0,1)

scalar* d_data1;
scalar* d_data2;
cudaMalloc(&d_data1, NI * LD * sizeof(scalar));
cudaMalloc(&d_data2, NI * LD * sizeof(scalar));
cudaMemcpy(d_data1, data1, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);
cudaMemcpy(d_data2, data2, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);

int* d_out;
cudaMalloc(&d_out, sizeof(int));
cudaMemset(d_out, 0, sizeof(int));

dim3 blockSize(32, 32, 1);
dim3 nBlocks;
nBlocks.x = (NI + blockSize.x - 1) / blockSize.x;
nBlocks.y = (NJ + blockSize.y - 1) / blockSize.y;
nBlocks.z = 1;
size_t sharedBytes = blockSize.x * blockSize.y * sizeof(scalar);

reduceCompare<<<nBlocks, blockSize, sharedBytes>>>(d_out, d_data1, LD, d_data2, LD, NI, NJ);

int out = 0;
cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
// std::cout << "answer : " << out << "\n"; // this reports answer = 1,2,3 or 4 depending on the values of data1 and data2
std::cout << "answer / nBlocks : " << out << " / " << nBlocks.x * nBlocks.y << "\n";
// std::cout << "answer : " << ((out == nBlocks.x * nBlocks.y) ? 1 : 0) << "\n";

cudaFree(d_data1);
cudaFree(d_data2);
cudaFree(d_out);
free(data1);
free(data2);
}
``````

In the above snippet I’m basically comparing float values for exact equality (not the greatest idea), but since I’m not operating on the data (just a simple copy from host to device), the prototype still holds for illustration.
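If comparing floats for exact equality ever becomes a problem, the same operator pattern could carry a tolerance instead. A minimal sketch, purely illustrative (the `approx_equal` name and `eps` member are not part of the code above):

``````
namespace default_operators {
// Hypothetical tolerance-based comparator, same shape as equal<T>:
// returns 1 when |a - b| <= eps, 0 otherwise.
template<class T>
struct approx_equal {
    T eps;
    __device__ constexpr int operator()(const T& a, const T& b) const {
        const T d = (a > b) ? (a - b) : (b - a);
        return (d <= eps ? 1 : 0);
    }
};
}

// usage in the kernel, in place of the exact comparison:
// auto op = default_operators::approx_equal<float>{1e-6f};
``````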

Notice that, depending on the values in data1 and data2, my expectation is to get back a single integer, either 0 or 1. However, it returns integers 0, 1, 2, 3 … N, where N is the number of blocks that my data spans.
In my 1-D scenario this didn’t happen!

So it begs the question … why? What am I missing?

Best regards,
-M

After running your code 10 times, I always get the following result:

``````
array size :1089
answer / nBlocks : 0 / 4
``````

Oh! I must have left in the exception probes that I was using for manual testing … my apologies! Please comment out the following lines in the above code:

``````
//  data1[10*LD+10] = 0.331; // in block (0,0)
//  data1[32*LD+11] = 0.332; // in block (1,0)
//  data1[32*LD+32] = 0.333; // in block (1,1)
//  data1[15*LD+32] = 0.334; // in block (0,1)
``````

… and now the output reads:

``````
array size :1089
answer / nBlocks : 4 / 4
``````

Your equal operator returns 1 if the two elements being compared are equal (and 0 otherwise). All of your array elements are equal, and you are accumulating a product result, so each block produces a value of 1. You have 4 blocks and the 4 blocks are doing an atomicAdd of that 1 value to the global result.

So the global result is 4.

What value are you expecting? If it is not 4, why do you expect that value?

If you are expecting a value of 0, I have no idea why.
If you are expecting a value of 1, then your usage of atomicAdd is not the right choice for the “final” reduction. You would need to use an atomic product, just like you are using a product to accumulate the individual reduction results (and initialize the global result to 1, not zero, just as you do when you start out on the initial reduction sequence.)

Of course genius, I’m not expecting `0` as my result!
I wish to accumulate a value of (int) `1`. As you pointed out, one problem is the use of an `atomicAdd` function to do this.

As everyone knows, the CUDA toolkit does not provide an atomicMult, so one is left to fill this gap. Following what is proposed in the programming guide, one can write an approximation of atomicMult that reads:

``````
__device__ int atomicMult(int* address, int val)
{
int old = *address, assumed;
do {
assumed = old;
old = atomicCAS(address, assumed, (val * assumed));
} while (assumed != old);
return old;
}
``````

Replace the `atomicAdd` by plugging the above function into my initial reduction snippet and check the result.

You’ll see that I still don’t get the accumulation that I’m looking for.

Perhaps my definition of atomicMult is improper, or I’m still doing something wrong in my reduction strategy.
(Right now I don’t see it, but I hope an outsider can shed light on the correct way to do this.)

For now, the best solution I’ve come up with is:

• write the result of every block to an output array of size `nBlocks.x * nBlocks.y` and finish the reduction over blocks on the host side (perhaps this is the only way to do it?) — see the sketch below.
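Something like this is what I have in mind (a sketch only, reusing the pieces above; `d_block_out` and the host loop are hypothetical additions, and `<vector>` would need to be included):

``````
// kernel epilogue (instead of the atomic): each block writes its own 0/1 flag
//   if (g.thread_rank() == 0)
//       output[blockIdx.y * gridDim.x + blockIdx.x] = block_reduce;

// host side: one int per block, then finish the reduction there
int nOut = nBlocks.x * nBlocks.y;
int* d_block_out = nullptr;
cudaMalloc(&d_block_out, nOut * sizeof(int));

reduceCompare<<<nBlocks, blockSize, sharedBytes>>>(d_block_out, d_data1, LD, d_data2, LD, NI, NJ);

std::vector<int> h_block_out(nOut);
cudaMemcpy(h_block_out.data(), d_block_out, nOut * sizeof(int), cudaMemcpyDeviceToHost);

int final_result = 1;
for (int v : h_block_out) final_result *= v; // product of 0/1 flags == logical AND
cudaFree(d_block_out);
``````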

One possible refactoring approach that could preserve `atomicAdd` usage would be to redefine the equal compare op to return 0 for equality and 1 otherwise. Then all of your work is done via sums, not products, and the final result of 0 would indicate equality. Any non-equal compare would be added to the chain which can only become positive (barring overflow) and in so doing your final result would be 0 for equality or some non-zero positive value otherwise.
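A minimal sketch of that refactoring (the `not_equal` name is just for illustration):

``````
namespace default_operators {
// returns 0 when the elements match, 1 otherwise
template<class T>
struct not_equal{__device__ constexpr int operator()(const T& a, const T& b) const { return (a==b?0:1); }};
}

// The per-thread and per-block reductions then start from 0 and accumulate with +=
// instead of *=, and the kernel epilogue keeps the plain atomicAdd:
//   if (g.thread_rank() == 0) atomicAdd(output, block_reduce);
// A final value of 0 means every element compared equal; any positive value flags a mismatch.
``````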

To preserve your current semantics, I get the expected result if I use the code you have shown and initialize the global value to 1, not zero.

``````
$ cat t2204.cu
#include<iostream>
#include<cmath>
#include<cuda_runtime.h>
#include<cuda_runtime_api.h>
#include<cooperative_groups.h>

__device__ int atomicMult(int* address, int val)
{
int old = *address, assumed;
do {
assumed = old;
old = atomicCAS(address, assumed, (val * assumed));
} while (assumed != old);
return old;
}

namespace default_operators {
template<class T>
struct equal{__device__ constexpr int operator()(const T& a, const T& b) const { return (a==b?1:0); }};
template<class T>
struct lt{__device__ constexpr int operator()(const T& a, const T& b) const { return (a<b?1:0); }};
template<class T>
struct gt{__device__ constexpr int operator()(const T& a, const T& b) const { return (a>b?1:0); }};
}

namespace cg = cooperative_groups;

template<class T, class OP>
__device__ int thread_group_2d(T *a, size_t lda, T *b, size_t ldb, size_t ni, size_t nj, OP op)
{
int reduced = 1; // Assuming 'true' as initial condition

for(size_t j = blockIdx.x * blockDim.x + threadIdx.x; j < nj; j += blockDim.x * gridDim.x) {
for(size_t i = blockIdx.y * blockDim.y + threadIdx.y; i < ni; i += blockDim.y * gridDim.y) {
reduced *= op(a[i * lda + j], b[i * ldb + j]);
}
}
return reduced;
}

template<class T, class OP>
__device__ int reduce_group(cg::thread_group g, T *temp, T val, OP op)
{

int lane = g.thread_rank();

// Each iteration halves the number of active threads
// Each active thread folds the partner value temp[lane + i] into its own val
for (int i = g.size() / 2; i > 0; i /= 2)
{
temp[lane] = val;
g.sync(); // wait for all threads to store
if(lane<i) val *= op(val, temp[lane + i]);
g.sync(); // wait for all threads to load before the next store
}
return val; // note: only thread 0 holds the full block reduction
}

template<class T>
__global__ void reduceCompare(int* output, T* a, size_t  lda, T* b, size_t ldb, size_t  ni, size_t  nj)
{
auto op = default_operators::equal<T>();
// auto op = default_operators::lt<T>();
// auto op = default_operators::gt<T>();

extern __shared__ int temp[];

cg::thread_block g = cg::this_thread_block();

int my_reduce = thread_group_2d(a, lda, b, ldb, ni, nj, op);

int block_reduce = reduce_group(g, temp, my_reduce, op);

if (g.thread_rank() == 0) atomicMult(output, block_reduce);
}

int main()
{
int NI = 33;
int NJ = 33;
int LD = 92;
std::cout << "array size :" << NI * NJ << "\n";

using scalar = float;

// Populate array
scalar* data1 = (scalar*) malloc(NI * LD * sizeof(scalar));
scalar* data2 = (scalar*) malloc(NI * LD * sizeof(scalar));
memset(data1, 0, NI * LD * sizeof(scalar));
memset(data2, 0, NI * LD * sizeof(scalar));

// Fill array
for (size_t i = 0; i < NI; i++) {
for (size_t j = 0; j < NJ; j++) {
data1[i * LD + j] = 1./3.;
data2[i * LD + j] = 1./3.;
}
}
// but ...
//  data1[10*LD+10] = 0.331; // in block (0,0)
//  data1[32*LD+11] = 0.332; // in block (1,0)
//  data1[32*LD+32] = 0.333; // in block (1,1)
//  data1[15*LD+32] = 0.334; // in block (0,1)

scalar* d_data1;
scalar* d_data2;
cudaMalloc(&d_data1, NI * LD * sizeof(scalar));
cudaMalloc(&d_data2, NI * LD * sizeof(scalar));
cudaMemcpy(d_data1, data1, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);
cudaMemcpy(d_data2, data2, NI * LD * sizeof(scalar), cudaMemcpyHostToDevice);

int* d_out;
cudaMalloc(&d_out, sizeof(int));
//cudaMemset(d_out, 0, sizeof(int));
int init_val = 1;
cudaMemcpy(d_out, &init_val, sizeof(int), cudaMemcpyHostToDevice);

dim3 blockSize(32, 32, 1);
dim3 nBlocks;
nBlocks.x = (NI + blockSize.x - 1) / blockSize.x;
nBlocks.y = (NJ + blockSize.y - 1) / blockSize.y;
nBlocks.z = 1;
size_t sharedBytes = blockSize.x * blockSize.y * sizeof(scalar);

reduceCompare<<<nBlocks, blockSize, sharedBytes>>>(d_out, d_data1, LD, d_data2, LD, NI, NJ);

int out = 0;
cudaMemcpy(&out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
// std::cout << "answer : " << out << "\n"; // this reports answer = 1,2,3 or 4 depending on the values of data1 and data2
std::cout << "answer / nBlocks : " << out << " / " << nBlocks.x * nBlocks.y << "\n";
// std::cout << "answer : " << ((out == nBlocks.x * nBlocks.y) ? 1 : 0) << "\n";

cudaFree(d_data1);
cudaFree(d_data2);
cudaFree(d_out);
free(data1);
free(data2);
}
$ nvcc -o t2204 t2204.cu
$ compute-sanitizer ./t2204
========= COMPUTE-SANITIZER
array size :1089
answer / nBlocks : 1 / 4
========= ERROR SUMMARY: 0 errors
$
``````

I get you now!
It’s the initial value of the accumulation output (I mistakenly thought it was the initial value of the block reductions).
Good catch ; )
