CUDA 11.4.19 : median on ~40 elements integer vector + if statement

You could possibly use something like this as a starting point:

#include <cub/cub.cuh>


using mt = int;
const int NS = 64;
const int NV = 1000;

template <typename T>
__global__ void k(const T * __restrict__ d, bool * __restrict__ r, const T f1) {
    // Specialize BlockRadixSort for a 1D block of NS threads owning 1 item each
    typedef cub::BlockRadixSort<T, NS, 1> BlockRadixSort;
    // Allocate shared memory for BlockRadixSort
    __shared__ typename BlockRadixSort::TempStorage temp_storage;
    __shared__ T median;
    if (!threadIdx.x) median = 0;
    // Obtain a segment of consecutive items that are blocked across threads
    T thread_keys[1];
    thread_keys[0] = d[blockIdx.x*NS+threadIdx.x];
    // Collectively sort the keys
    BlockRadixSort(temp_storage).Sort(thread_keys);
    if (NS & 1) { // set length is odd
      if (threadIdx.x == (NS/2)) median = 2*thread_keys[0];}
    else // set length is even
      if ((threadIdx.x == (NS/2)) || (threadIdx.x == ((NS/2)-1))) atomicAdd(&median, thread_keys[0]);
    __syncthreads();
    r[blockIdx.x*NS+threadIdx.x] = (thread_keys[0] > f1*(median/2));
}


int main(){

  mt *d;
  bool *r;
  mt fac = 2;
  cudaMallocManaged(&d, NS*NV*sizeof(d[0]));
  cudaMallocManaged(&r, NS*NV*sizeof(r[0]));
  // initialize data in d
  // ...
  k<<<NV, NS>>>(d, r, fac);
  cudaDeviceSynchronize();
}

If this is the only thing you intend to do on the GPU, I doubt it will be meaningfully faster than a reasonably good CPU implementation. For vector length less than 64 (e.g. 50) I would recommend padding each vector and adjust the median point accordingly. Note that padding could be done at the point of data load by each threadblock; you do not necessarily have to pad the data in global memory. This code provides results for the elements in sorted order. If you wanted results for elements in unsorted order, you could use cub sort pairs functionality, sorting each data set item with an index. Due to the median handling, this approach will have some limitations. For example, for integer types, it won’t work if the actual median is greater than half the available integer range. There are ways to address this with additional code complexity.

1 Like