thrust::async::for_each() with zip_iterators

When I try to use thrust::async::for_each() with zip iterators, the code [1] does not compile ([2]).

It does work, however, if I use the “sync” version, thrust::for_each(), with zip iterators [3], or if I use two calls to thrust::async::for_each() with normal iterators [4].

I am using hpc_sdk/Linux_x86_64/22.3/ [5,6].

Thanks

====
[1]

//test.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <chrono>

// Number of elements in each of the two device vectors created in main().
const int ds = 10000000;

// Reproducer: applies a for_each over two device vectors using one of three
// variants selected at compile time, then prints the first five elements of
// each vector and two elapsed times.
//   -DZIPSYNC  : thrust::for_each over a zip_iterator of (d_A, d_B)
//   -DZIPASYNC : thrust::async::for_each over the same zip_iterator
//   (neither)  : two thrust::async::for_each calls, one per vector
int main(){
  // d_A is constructed with ds elements of value 1, d_B with ds elements of value 2.
  thrust::device_vector<int> d_A(ds,1);
  thrust::device_vector<int> d_B(ds,2);
    
  auto t1 = std::chrono::steady_clock::now();   // Start timing     

    ///////////////
    #ifdef ZIPSYNC // this works
    
    // Synchronous variant: a single pass over both vectors zipped together.
    auto e12 = thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(),d_B.begin())), 
    thrust::make_zip_iterator(thrust::make_tuple(d_A.end(),d_B.end())), 
    [=]__device__ (auto &tup){
        // Element 0 comes from d_A, element 1 from d_B; bump each if it still
        // holds its initial value.
        if (thrust::get<0>(tup)==1) thrust::get<0>(tup)++;
        if (thrust::get<1>(tup)==2) thrust::get<1>(tup)++;
    }    
    );
    
    #elif defined(ZIPASYNC)     // this does not compile
    
    // Asynchronous variant over the same zipped range. As written, with the
    // tuple parameter declared `auto &tup`, this is the case reported as
    // failing to compile. NOTE: per the replies in this thread, declaring the
    // parameter by value (`auto tup`) makes this variant compile and work.
    auto e12 = thrust::async::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(),d_B.begin())), 
    thrust::make_zip_iterator(thrust::make_tuple(d_A.end(),d_B.end())), 
    [=]__device__ (auto &tup){
        if (thrust::get<0>(tup)==1) thrust::get<0>(tup)++;
        if (thrust::get<1>(tup)==2) thrust::get<1>(tup)++;
    }    
    );
    
    #else      // this works
    // Default variant: one asynchronous for_each per vector over plain
    // (non-zipped) iterators; each element is taken by reference and
    // incremented in place if it still holds its initial value.
    auto e1 = thrust::async::for_each(d_A.begin(), d_A.end(),  
    [=]__device__(auto &t){
      if (t==1) t++;
    }
    );
    auto e2 = thrust::async::for_each(d_B.begin(), d_B.end(), 
        [=]__device__(auto &t){
          if (t==2) t++;
        }
    );
    #endif
  
    // t2 is sampled right after the for_each call(s) return and before the
    // explicit device synchronization; t3 is sampled after it.
    auto t2 = std::chrono::steady_clock::now();       
    cudaDeviceSynchronize();
    auto t3 = std::chrono::steady_clock::now();   
  
    // Print the first five elements of each vector, comma-separated, on one line.
    thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
    thrust::copy_n(d_B.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
    std::cout << std::endl;
  
    // Elapsed microseconds from t1 to t2 (before cudaDeviceSynchronize).
    std::cout
    << "before cudaDeviceSynchronize "
    << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()
    << std::endl;
  
    // Elapsed microseconds from t1 to t3 (after cudaDeviceSynchronize).
    std::cout
    << "after cudaDeviceSynchronize "
    << std::chrono::duration_cast<std::chrono::microseconds>(t3 - t1).count()
    << std::endl;
}

[2] nvcc --extended-lambda test.cu -DZIPASYNC
[3] nvcc --extended-lambda test.cu -DZIPSYNC
[4] nvcc --extended-lambda test.cu

[5]
which nvcc
/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/nvcc

[6]
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Thu_Feb_10_18:23:41_PST_2022
Cuda compilation tools, release 11.6, V11.6.112
Build cuda_11.6.r11.6/compiler.30978841_0

This seems to work. I removed the ampersand after auto, i.e. I changed the lambda parameter from (auto &tup) to (auto tup).

1 Like

Great. I confirm that removing the ampersand of the tuple works.

On one hand, I do not understand why the (auto &tup) seems to work for thrust::for_each() [1] but not for thrust::async::for_each(), the only difference being the async.

On the other hand, it is clear from the test that (auto &tup) is not only unnecessary but also error-prone in the async case. The ampersand is nevertheless necessary for normal iterators (i.e. those not packed into a tuple through zip_iterator); without it the code compiles, but the arrays are not modified, because the lambda then operates on a copy of each element.

So, is it in general incorrect to use the ampersand with zip_iterators?

[1]
I have also checked in the Thrust example that defines arbitrary_functor1 (arbitrary_transformation.cu)

that if I use (Tuple &t) instead of (Tuple t) in arbitrary_functor1, the code compiles and runs OK.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.