thrust::exclusive_scan with thrust::zip_iterator?

If I have two integer arrays that are both size N and I create a zip iterator to the beginning of the data, can I use the thrust’s exclusive scan to perform a prefix sum on both arrays at the same time?

Yes.

$ cat t612.cu
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#include <iostream>
#define DSIZE 4

struct my_op {

template <typename T>
  __host__ __device__
    T operator()(T v1, T v2)  {
    thrust::get<0>(v1) = thrust::get<0>(v2) + thrust::get<0>(v1);
    thrust::get<1>(v1) = thrust::get<1>(v2) + thrust::get<1>(v1);
    return v1 ;
}

};

int main(){

  thrust::device_vector<float> i0(DSIZE);
  thrust::device_vector<float> i1(DSIZE);
  thrust::device_vector<float> o0(DSIZE);
  thrust::device_vector<float> o1(DSIZE);
  thrust::sequence(i0.begin(), i0.end());
  thrust::sequence(i1.begin(), i1.end());
  thrust::exclusive_scan(thrust::make_zip_iterator(thrust::make_tuple(i0.begin(), i1.begin())), thrust::make_zip_iterator(thrust::make_tuple(i0.end(), i1.end())), thrust::make_zip_iterator(thrust::make_tuple(o0.begin(), o1.begin())), thrust::tuple<float, float>(0.0f, 1.0f), my_op());
  thrust::copy(o0.begin(), o0.end(), std::ostream_iterator<float>(std::cout, ","));
  std::cout << std::endl;
  thrust::copy(o1.begin(), o1.end(), std::ostream_iterator<float>(std::cout, ","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -arch=sm_20 -o t612 t612.cu
$ ./t612
0,0,1,3,
1,1,2,4,
$

Your code is very nice. Thank you!

Okay, I’m sorry to necro this thread but it turns out, this random question I asked turned useful for my purposes.

I’m trying to do a modified prefix sum.

Let’s say I have the the two arrays :

int a[10] = { 1, 1, 1, 0, 1, 1, 1, 0, 0, 1 };
int b[10] = { 1, 1, 1, 3, 2, 2, 3, 1, 1, 3 };

And I want to do an exclusive scan on b but with instead of just b[i] + b[i + 1] or w/e, I want a[i] * b[i] + a[i + 1] * b[i + 1]. I also want a to be non-mutated by the summing procedure, if that’s possible.

So using the two arrays above, the output should be :

a       = { 1, 1, 1, 0, 1, 1, 1,  0,  0,  1 }
b       = { 1, 1, 1, 3, 2, 2, 3,  1,  1,  3 }
pfx_sum = { 0, 1, 2, 3, 3, 5, 7, 10, 10, 10 }

So far, I don’t know how to do this so I make a tmp array. Here’s what I have now :

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#include <iostream>

const int size = 10;

struct my_op {

    template <typename T>
    __host__ __device__
    T operator()(T v1, T v2)  
    {
        thrust::get<0>(v1) = thrust::get<1>(v2) * thrust::get<0>(v2) + thrust::get<0>(v1);
        return v1; 
    }
};

int main(void)
{
    thrust::device_vector<int> fs(size),
                               nominated_pa(size),
                               pfx_sum(size),
                               tmp(size);
    
    int h_fs[10]           = { 1, 1, 1, 3, 2, 2, 3, 1, 1, 3 };
    int h_nominated_pa[10] = { 1, 1, 1, 0, 1, 1, 1, 0, 0, 1 };

    cudaMemcpy(thrust::raw_pointer_cast(fs.data()),
               h_fs,
               size * sizeof(int),
               cudaMemcpyHostToDevice);

    cudaMemcpy(thrust::raw_pointer_cast(nominated_pa.data()),
               h_nominated_pa,
               size * sizeof(int),
               cudaMemcpyHostToDevice);

    thrust::exclusive_scan(
        thrust::make_zip_iterator(
            thrust::make_tuple(fs.begin(), 
                               nominated_pa.begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(fs.end(), 
                               nominated_pa.end())), 
        thrust::make_zip_iterator(
            thrust::make_tuple(pfx_sum.begin(), 
                               tmp.begin())), 
        thrust::tuple<int, int>(0, nominated_pa[0]), 
        my_op());

    cudaDeviceSynchronize();

    for (int i = 0; i < size; ++i)
        std::cout << pfx_sum[i] << ", " << nominated_pa[i] << std::endl;

    return 0;
}

Output:

0, 1
1, 1
2, 1
3, 0
3, 1
5, 1
7, 1
10, 0
10, 0
10, 1

You seem to be generating the data you want. Are you asking a question? Are you just asking how to do it without creating a temp array?

The last part, how to do it without needing a temp array.

Can I just change my template function to return an int instead of the template type?

If you study carefully the documentation for thrust::exclusive_scan:

https://thrust.github.io/doc/group__prefixsums.html#gaa3f981950f16c9dae693590b79a9ff90

you can observe a few things. First of all, the binary op must be associative, which has the implication that the input types must be convertible to each other (the operation could be presented as A+B or B+A, and must produce the same result). Secondly, extending from the previous point, the types associated with input and output of the binary op, as well as input iterators and output iterators of the excusive_scan operation, must all be convertible to each other. Convertible for this discussion means that they must be the same type, i.e. dereferencing them produces the same type data item (quantity, tuple or whatever).

Therefore, the answer to this question:

“Can I just change my template function to return an int instead of the template type?”

is no. If you did that, the return type (int) of the binary op passed to exclusive_scan would not match its input types (which are effectively tuples, in this case).

So this means that if I desire a single int vector as output, then I must somehow figure out how to pass a single int vector as input (to the exclusive_scan). However we’ve already observed that you don’t really want a scan on a single vector, but on some combination of two vectors, thus leading us to the zip_iterator approach. Based on what’s been covered so far, the zip_iterator approach by itself won’t get us there.

However we can make an observation about the nature of the operation you are trying to do, in that in all cases, your scan function is not operating on A or B independently ever, but always on the product of A[i]*B[i]. We can therefore use another thrust fancy iterator called a transform iterator (on top of a zip iterator) to combine the corresponding elements of A and B by taking their product, immediately before operating on the product with the exclusive scan operation. The transform iterator also allows us to avoid any extra data storage or memory loads and stores associated with the intermediate result (the product of A[i] and B[i]) and is therefore “efficient” in that respect.

We no longer need a special functor for the exclusive_scan operation in this realization, but we will need one for the transform_iterator.

Here’s a fully worked example:

$ cat t602.cu
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#include <iostream>

template <typename T>
struct my_op : public thrust::unary_function<thrust::tuple<T, T>, T> {

  __host__ __device__
    T operator()(thrust::tuple<T, T> v1)  {
    return thrust::get<0>(v1)*thrust::get<1>(v1);
}

};

int main(){
  int h_fs[]           = { 1, 1, 1, 3, 2, 2, 3, 1, 1, 3 };
  int h_nominated_pa[] = { 1, 1, 1, 0, 1, 1, 1, 0, 0, 1 };
  size_t dsize = sizeof(h_fs)/sizeof(h_fs[0]);
  thrust::device_vector<int> i0(h_fs, h_fs+dsize);
  thrust::device_vector<int> i1(h_nominated_pa, h_nominated_pa+dsize);
  thrust::device_vector<int> o0(dsize);
  thrust::exclusive_scan(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(i0.begin(), i1.begin())), my_op<int>()), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(i0.end(), i1.end())), my_op<int>()), o0.begin());
  thrust::copy(i0.begin(), i0.end(), std::ostream_iterator<int>(std::cout,","));
  std::cout << std::endl;
  thrust::copy(i1.begin(), i1.end(), std::ostream_iterator<int>(std::cout,","));
  std::cout << std::endl;
  thrust::copy(o0.begin(), o0.end(), std::ostream_iterator<int>(std::cout,","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -arch=sm_35 -o t602 t602.cu
$ ./t602
1,1,1,3,2,2,3,1,1,3,
1,1,1,0,1,1,1,0,0,1,
0,1,2,3,3,5,7,10,10,10,
$

If you’re not familiar with how a transform_iterator works, for example, you may want to start with the thrust quick start guide:

https://github.com/thrust/thrust/wiki/Quick-Start-Guide

You’re amazing, thank you!

How’d you get so knowledgeable about all this CUDA/thrust stuff?

it relates to my day job.

Well, I really appreciate the help. You’re really helping me get a hang of this thrust thing. At first, I thought it was a poor implementation of the STL from C++ but now I’m beginning to realize how different and truly powerful it is.