Thrust question

Hi,
Is there an easy way to do a segmented conditional copy in thrust?
Suppose I have the following:

int partitions[] = {0, 0, 0, 1, 2, 2, 2};
int values[] = {1, 2, 3, 4, 5, 6, 7};

I would like the output to be:

output = {2, 3, <Empty>, <Empty>, 6, 7, <Empty>}

So in that example, I copy the next value to the current position but with partitions in mind.
Something like inclusive_scan_by_key

I’m not seeing the logic in how you got output, but this link might provide some insight.

https://stackoverflow.com/questions/39240098/cuda-thrust-copy-transformed-result-only-if-it-satisfies-a-predicate

I’m trying to move the input one position left per partition…

Maybe a simpler example would be better:

int partitions[] = {0, 0, 0, 0, 1, 1, 1, 1};
int values[] = {1, 2, 3, 4, 5, 6, 7, 8};

output = {2, 3, 4, <NULL>, 6, 7, 8, <NULL>}

I don’t want to guarantee this is optimal (or exactly what you want), but does the following work?

#include <cstdio>
#include <vector>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>

// Binary functor that discards its first argument and hands back the second
// one unchanged. Callable from both host and device code.
struct Copy
{
    template< typename Value >
    __host__ __device__
    Value operator()( const Value & /* unused */, const Value &selected ) const
    {
        return selected;
    }
};

int main () {

    std::vector<int> h_part  = { 0, 0, 0, 0, 1, 1, 1, 1 };
    std::vector<int> h_data  = { 1, 2, 3, 4, 5, 6, 7, 8 };
    
    // Copy data to GPU
    thrust::device_vector<int> d_part ( h_part );
    thrust::device_vector<int> d_data ( h_data );
    thrust::device_vector<int> d_rle (2); // Assuming number of partition are known at compile time
    thrust::device_vector<int> d_len (2);
    
    // Compute RLE
    size_t num_runs = thrust::reduce_by_key( d_part.begin(), 
                                             d_part.end(),                      // input key sequence
                                             thrust::constant_iterator<int>(1), // input value sequence
                                             d_rle.begin(),                     // output key sequence
                                             d_len.begin()                      // output value sequence
                                            ).first - d_rle.begin();            // compute the output size
                           
    // Create stencil
    thrust::device_vector<int> d_stencil ( d_data.size(), 1 );
    
    for ( int i = 0; i < num_runs; i++)
        d_stencil[i * d_len[i]] = 0;
    
    
    // Output array 
    thrust::device_vector<int> d_output ( d_data.size(), -1 );
    thrust::identity<int> identity;

    // Perform transformation
    thrust::transform_if( d_part.begin(), 
                          d_part.end(), 
                          d_data.begin(), 
                          d_stencil.begin(), 
                          d_output.data(), 
                          Copy(), 
                          identity );
    
    thrust::host_vector<int> h_output (d_output);
    
    for (auto & i : h_output)
        std::printf("%d\n", i);
    
    // {-1, 2, 3, 4, -1, 6, 7, 8}
}

Thanks for the help, the stencil is a nice idea but the results are not what I wanted.
I need to copy the values at an offset (of 1…x).
So in your output the values are:

// {-1, 2, 3, 4, -1, 6, 7, 8}

whereas the output I need is this:

// {2, 3, 4, -1, 6, 7, 8, -1}

I guess this is a general thrust question:
how can I iterate over/manipulate/read values from an underlying device pointer at a different
offset than the one thrust is currently looking at?

I solved it like this, but I wanted to know if there’s a more elegant way:

// Per-index functor that writes a partition-aware "lag" of a column.
//
// For each index i it writes to out[i]:
//   - aggregated[i - offset] when i >= offset and partitions[i] equals
//     partitions[i - offset];
//   - 0 otherwise (the "null" marker used by this snippet).
// Indices at or beyond `rows` are ignored.
//
// Members:
//   m_rows        number of elements in each of the three arrays
//   m_offset      how many positions to look back
//   m_partitions  partition id of every element (read only)
//   m_aggregated  source values (read only)
//   m_out         destination values
template <typename T>
struct LagWithNulls
{
	size_t m_rows;
	unsigned int m_offset;
	unsigned int *m_partitions;
	T *m_aggregated;
	T *m_out;
	
	LagWithNulls(size_t rows, unsigned int offset, unsigned int *partitions, T *aggregated, T *out)
		: m_rows(rows), m_offset(offset), m_partitions(partitions), m_aggregated(aggregated),
		m_out(out) {}
	
	__device__ void operator()(unsigned int index) const
	{
		// Never touch memory past the end of the arrays.
		if (index >= m_rows)
		{
			return;
		}
		
		// Compare before subtracting: `index - m_offset` is unsigned arithmetic,
		// so a "negative" result would wrap around instead of going below zero.
		if (index < m_offset)
		{
			m_out[index] = 0;
			return;
		}
		
		const unsigned int lag_index = index - m_offset;
		if (m_partitions[index] == m_partitions[lag_index])
		{
			m_out[index] = m_aggregated[lag_index];
		}
		else
		{
			// The lagged element belongs to a different partition.
			m_out[index] = 0;
		}
	}
};


// Usage: run LagWithNulls once for every index in [0, N).
// The counting iterator supplies the indices 0, 1, 2, ... and the functor is
// given raw pointers obtained from the data() of d_partitions, d_aggregated and
// d_sum_out.
// N, offset, T and the three d_* containers are declared outside this snippet
// (presumably thrust::device_vector instances of length N - confirm at the call site).
// NOTE(review): the closing `);` of the thrust::for_each call is not visible in this excerpt.
thrust::counting_iterator<unsigned int> index(0);
thrust::for_each(index, index + N, 
		LagWithNulls<T>(N, offset,
		thrust::raw_pointer_cast(d_partitions.data()),
		thrust::raw_pointer_cast(d_aggregated.data()),
		thrust::raw_pointer_cast(d_sum_out.data()))