When I try to use thrust::async::for_each() with zip iterators, the code [1] does not compile ([2]).
It does work however if I use the “sync” version thrust::for_each() with zip iterators [3] or if I use two calls of thrust::async::for_each() with normal iterators [4].
I am using hpc_sdk/Linux_x86_64/22.3/ [5,6].
Thanks
====
[1]
//test.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <chrono>
const int ds = 10000000;
int main(){
thrust::device_vector<int> d_A(ds,1);
thrust::device_vector<int> d_B(ds,2);
auto t1 = std::chrono::steady_clock::now(); // Start timing
///////////////
#ifdef ZIPSYNC // this works
auto e12 = thrust::for_each(
thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(),d_B.begin())),
thrust::make_zip_iterator(thrust::make_tuple(d_A.end(),d_B.end())),
[=]__device__ (auto &tup){
if (thrust::get<0>(tup)==1) thrust::get<0>(tup)++;
if (thrust::get<1>(tup)==2) thrust::get<1>(tup)++;
}
);
#elif defined(ZIPASYNC) // this does not compile
auto e12 = thrust::async::for_each(
thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(),d_B.begin())),
thrust::make_zip_iterator(thrust::make_tuple(d_A.end(),d_B.end())),
[=]__device__ (auto &tup){
if (thrust::get<0>(tup)==1) thrust::get<0>(tup)++;
if (thrust::get<1>(tup)==2) thrust::get<1>(tup)++;
}
);
#else // this works
auto e1 = thrust::async::for_each(d_A.begin(), d_A.end(),
[=]__device__(auto &t){
if (t==1) t++;
}
);
auto e2 = thrust::async::for_each(d_B.begin(), d_B.end(),
[=]__device__(auto &t){
if (t==2) t++;
}
);
#endif
auto t2 = std::chrono::steady_clock::now();
cudaDeviceSynchronize();
auto t3 = std::chrono::steady_clock::now();
thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
thrust::copy_n(d_B.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
std::cout
<< "before cudaDeviceSynchronize "
<< std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()
<< std::endl;
std::cout
<< "after cudaDeviceSynchronize "
<< std::chrono::duration_cast<std::chrono::microseconds>(t3 - t1).count()
<< std::endl;
}
[2] nvcc --extended-lambda test.cu -DZIPASYNC
[3] nvcc --extended-lambda test.cu -DZIPSYNC
[4] nvcc --extended-lambda test.cu
[5]
which nvcc
/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/nvcc
[6]
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Thu_Feb_10_18:23:41_PST_2022
Cuda compilation tools, release 11.6, V11.6.112
Build cuda_11.6.r11.6/compiler.30978841_0