Alternative for a linked list in CUDA

I have a problem with my CUDA code. I search for values in a 1D array. When I find, for example, the value 255 in an RGB image, I want to store the index of this value.

I know how to do it with standard C++ (like a linked list), but I have no clue how to do this in parallel with CUDA C.

Thank you in advance!

cross posting:

http://stackoverflow.com/questions/43117245/alternative-for-a-linked-list-in-cuda

The description of the required processing is a bit vague, but seems to be closely related to stream compaction (here: a variant where you would store the original index instead of the original data). If you do an internet search for “stream compaction” plus “CUDA” you will find much relevant material.

njuffa brings up a good point with regards to stream compaction as there may be more than one element in the array that contains the value you’re looking for.

thrust::device_vector<int> vals;
/* ... */

// One output slot per input element; unfilled slots keep the -1 sentinel.
// Parentheses, not braces: a brace list could select an initializer_list
// constructor and build a 2-element vector instead of a (count, value) fill.
thrust::device_vector<ptrdiff_t> indices(vals.size(), -1);

auto const target_val = int{255};

// Maps a value to its (supposed) global thread index on a match, -1 otherwise.
// WARNING: blockIdx/threadIdx inside a transform-iterator functor do NOT
// reliably correspond to the element's position — see the follow-up post
// for the zip-/counting-iterator fix.
auto const set_found_idx = [=] __device__ (int const v) -> ptrdiff_t
{
  if (v == target_val) {
    return blockIdx.x * blockDim.x + threadIdx.x;
  } else {
    return -1;
  }
};  // assigning a lambda is a statement — the semicolon is required

// remove_copy_if copies every element whose predicate is FALSE, so the
// predicate must flag the -1 sentinels for removal, keeping the matches.
auto const indices_end = thrust::remove_copy_if(
  thrust::make_transform_iterator(vals.begin(), set_found_idx),
  thrust::make_transform_iterator(vals.end(),   set_found_idx),
  indices.begin(),
  [] __device__ (ptrdiff_t const v) -> bool
  {
    return v == -1;
  }
);

// Thrust algorithms on the default stream block, but an explicit sync
// surfaces any asynchronous execution error here rather than later.
cudaDeviceSynchronize();

I think this’ll do what you want. The code wasn’t tested or compiled but basically we use the lambda to transform the values array to -1 or the current thread index and then we remove all the -1’s and store the result in “indices”.

Edit:

This is bad code. The only reliable way to get the thread id like that is to include something like a counting iterator which really complicates things. It might be easiest to just roll your own kernel :P

Or you can make a transform iterator out of a zip iterator with the vals and counting iterator.

Okay, I felt super bad about giving bad CUDA advice so I sat down and came up with the proper, completely over-engineered solution so hopefully I can get some feedback on the quality of it :P

// build with: nvcc -gencode=arch=compute_61,code=sm_61 -O3 --expt-extended-lambda -o ll linked_list.cu

#include <array>
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
#include <thrust/functional.h>

namespace T = thrust;

// unfortunately, we have to define a formal callable
// to be used in our thrust::make_transform_iterator call
// nvcc/cl.exe was giving incomplete type errors
// Functor mapping a (value, index) tuple to the index when the value
// equals `val`, or to the -1 sentinel otherwise.
//
// A named functor (rather than an extended device lambda) is used because
// nvcc/cl.exe reported incomplete-type errors when a lambda was passed to
// thrust::make_transform_iterator.
template <typename V>
struct lame_callable 
  : public T::unary_function<T::tuple<int, ptrdiff_t> const&, ptrdiff_t>
{
  V val;  // the value to search for

  lame_callable(V const& v) : val{v} {}

  // const-qualified: transform_iterator invokes its stored functor through
  // a const access path, so a non-const call operator may not compile.
  __host__ __device__
  auto operator()(T::tuple<int, ptrdiff_t> const& tup) const -> ptrdiff_t
  {
    if (T::get<0>(tup) == val) {
      return T::get<1>(tup);   // the element's original index
    } else {
      return -1;               // sentinel: no match
    }
  }
};

int main(void)
{
  // Value whose positions in the array we want to collect.
  auto const target_val = int{0};

  // array indices:      0  1  2  3  4  5  6  7  8  9
  std::array<int, 10> vs{0, 1, 0, 3, 4, 0, 6, 7, 0, 9};

  T::device_vector<int> vals{vs.begin(), vs.end()};

  // One slot per input element, pre-filled with the -1 sentinel.
  // Parentheses (not braces) so the (count, value) constructor is chosen
  // rather than an initializer_list constructor in newer Thrust versions.
  T::device_vector<ptrdiff_t> indices(vals.size(), -1);


  // Pair each value with its position: the counting iterator supplies
  // 0, 1, 2, ... alongside vals.  Its value type is ptrdiff_t so the
  // produced tuple matches lame_callable's argument type exactly.
  auto const zip_begin = T::make_zip_iterator(
    T::make_tuple(
      vals.begin(), 
      T::make_counting_iterator(ptrdiff_t{0})));

  // Transform each (value, index) pair into the index on a match,
  // or -1 otherwise.
  auto const begin = T::make_transform_iterator(
    zip_begin, lame_callable<int>{target_val});

  auto const end = begin + vals.size();

  // Stream compaction: copy only the real indices into `indices`,
  // dropping the -1 sentinels (remove_copy_if drops where pred is true).
  auto const indices_end = T::remove_copy_if(
    begin, end,
    indices.begin(),
    [] __device__ (ptrdiff_t const& pdiff) -> bool
    {
      return pdiff == -1;
    });

  // Thrust calls on the default stream are blocking, but an explicit sync
  // surfaces any asynchronous execution error before we copy back.
  cudaDeviceSynchronize();

  // Keep ptrdiff_t on the host too — copying into a host_vector<int>
  // would silently narrow the stored indices.
  T::host_vector<ptrdiff_t> h_indices{indices.begin(), indices_end};

  for (auto const& idx : h_indices) {
    std::cout << idx << "\n";
  }

  // prints:
  // 0
  // 2
  // 5
  // 8

  return 0;
}