Hi all,
I am trying to do something in thrust akin to s=x*y+z where all vectors are of the same size and all operations are done element-wise; I understand how to program it via two separate thrust::transforms()
but I am looking to reduce it down to a single transform
call to try and use the fused multiplication addition feature of the GPU hardware.
I’ve scrounged around on the internet and it seems like it would need some combination of zip_iterator
s and transform_iterator
s to get this to work, but the specifics appear to be eluding me.
Any help would be greatly appreciated.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <chrono>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
using namespace thrust::placeholders;
using namespace thrust;
#include <iostream>
// Value type handed to the fused functor: one read-only (x, y, z) element
// from each of the three zipped input vectors.
using triplet = thrust::tuple<thrust::complex<float> const,
                              thrust::complex<float> const,
                              thrust::complex<float> const>;

// Number of elements in every benchmark vector. Each benchmark function
// allocates four device vectors of this length.
constexpr int repeat = 100000000;
int zip() {
thrust::device_vector<thrust::complex<float>> d_vec1(repeat, thrust::complex<float>(1.0f, 1.0f)),
d_vec2(repeat, thrust::complex<float>(1.0f, 1.0f)),
d_vec3(repeat, thrust::complex<float>(1.0f, 1.0f)),
d_vec4(repeat, thrust::complex<float>(1.0f, 1.0f));
std::chrono::time_point<std::chrono::high_resolution_clock> t0 = std::chrono::high_resolution_clock::now();
auto start_zip = thrust::make_zip_iterator(thrust::make_tuple(d_vec1.begin(), d_vec2.begin(), d_vec3.begin()));
auto end_zip = thrust::make_zip_iterator(thrust::make_tuple(d_vec1.end(), d_vec2.end(), d_vec3.end()));
// auto thisf = thrust::make_transform_iterator(start_zip, end_zip, func());
thrust::transform(start_zip, end_zip, d_vec4, [=] __device__ (triplet t) {
return thrust::get<0>(t) * thrust::get<1>(t) + thrust::get<2>(t);
} );
std::chrono::time_point<std::chrono::high_resolution_clock> t1 = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}
int two_calls(){
thrust::device_vector<thrust::complex<float>>
d_vec1(repeat,thrust::complex<float> (10.0, 2.0)),
d_vec2(repeat,thrust::complex<float> (10.0, 2.0)),
d_vec3(repeat,thrust::complex<float> (10.0, 2.0)),
d_vec4(repeat,thrust::complex<float> (10.0, 2.0));
std::chrono::time_point<std::chrono::high_resolution_clock> t0 = std::chrono::high_resolution_clock::now();
thrust::transform(d_vec1.begin(), d_vec1.begin(), d_vec2.begin(), d_vec2.begin(), thrust::multiplies<complex<float>>());
thrust::transform(d_vec2.begin(), d_vec2.begin(), d_vec3.begin(), d_vec4.begin(), thrust::plus<complex<float>>());
std::chrono::time_point<std::chrono::high_resolution_clock> t1 = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
}
int main(){
// For loop over zip() 5 times
for(int i = 0; i < 5; i++){
zip();
}
std::cout << "zip:" << zip() << "ms" << std::endl;
std::cout << "two_calls:" << two_calls() << "ms" << std::endl;
}