Thank you both for your comments. I am going to post here the code we use for testing
#include <cmath>
#include <omp.h>
#include <cusp/array1d.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <chrono>
/* simple timer utility based on std::chrono
Example of Usage is
tictoc<milliseconds> T();
T.tic();
{ Code to be timed}
std::cout << " Time elapsed since last call to tic in milliseconds is " << T.toc() << std::endl;
T.reset() // reset internal elapsed to 0.
*/
template <typename DurationUnits>
class tictoc
{
private:
double m_elapsed;
std::chrono::time_point<std::chrono::steady_clock> m_start;
public:
tictoc(double _a = (double)0) : m_elapsed(_a) {}
void tic()
{
m_start = std::chrono::steady_clock::now();
}
double toc()
{
auto stop = std::chrono::steady_clock::now();
m_elapsed += (double)std::chrono::duration_cast<std::chrono::duration<DurationUnits>>(stop - m_start).count();
return m_elapsed;
}
void reset()
{
m_elapsed = (double)0;
}
};
/*Functor that transforms a sequence of integers 0 <= idx < N into the position
indices of a subvolume of a volume. The volume is specified as [0, nx)x[0,ny)x[0,nz)x[0,nq),
while the subvolume is given in numpy notation as [lx:-hx, ly:-hy, lz:-hz, lq:-hq].
N=(nx-(lx+hx))*(ny-(ly+hy))*etc. It is used to make transform iterators that are fed to
permutation iterators to limit calculations to the subvolume.
*/
template <typename IndType>
struct subvolume_map
{
IndType nx, ny, nz, nq, lx, ly, lz, lq, dx, dy, dz, dq;
subvolume_map(std::array<IndType, 3> _n, std::array<IndType, 3> _l, std::array<IndType, 3> _h,
IndType _nq, IndType _lq, IndType _hq) : nx(_n[0]), ny(_n[1]), nz(_n[2]), nq(_nq),
lx(_l[0]), ly(_l[1]), lz(_l[2]), lq(_lq),
dx(_n[0] - (_l[0] + _h[0])),
dy(_n[1] - (_l[1] + _h[1])),
dz(_n[2] - (_l[2] + _h[2])) {}
__host__ __device__
IndType
operator()(IndType idx)
{
#if 0
auto rx = idx / dx;
auto ix = idx % dx + lx;
auto ry = rx / dy;
auto iy = rx % dy + ly;
auto rz = ry / dz;
auto iz = ry % dz + lz;
auto iq = rz + lq;
return (((iq)*nz + iz) * ny + iy) * nx + ix;
#else
auto rx = idx / dx;
auto ix = idx - (rx * dx);
auto ry = rx / dy;
auto iy = rx - (ry * dy);
auto rz = ry / dz;
auto iz = ry - (rz * dz);
auto iq = rz;
return (((iq + lq) * nz + iz + lz) * ny + iy + ly) * nx + ix + lx;
#endif
}
};
template <typename T, typename F>
decltype(tictoc<double>().toc()) testFunctor(T it, F &f, unsigned int Ni)
{
auto It = thrust::make_permutation_iterator(it,
thrust::make_transform_iterator(
thrust::counting_iterator<unsigned int>(0), f));
tictoc<double> Tind;
Tind.reset();
cudaDeviceSynchronize();
Tind.tic();
thrust::fill(It, It + Ni, 0.);
cudaDeviceSynchronize();
return Tind.toc();
}
template <typename T>
decltype(tictoc<double>().toc()) testFunctor(T it, unsigned int Ni)
{
auto It = thrust::make_permutation_iterator(it,
thrust::make_transform_iterator(
thrust::counting_iterator<unsigned int>(0), thrust::identity<unsigned int>()));
tictoc<double> Tind;
Tind.reset();
cudaDeviceSynchronize();
Tind.tic();
thrust::fill(It, It + Ni, 0.);
cudaDeviceSynchronize();
return Tind.toc();
}
int main()
{
int nt = 1;
unsigned int nx = 256, ny = 512, nz = 392;
//unsigned int nx = 2, ny = 2, nz = 2;
// for these values, the functor subvolume_map behaves like the identity.
std::array<unsigned int, 3> n = {nx, ny, nz};
std::array<unsigned int, 3> l = {0, 0, 0};
std::array<unsigned int, 3> h = {0, 0, 0};
unsigned int nq = 1;
unsigned int lq = 0;
unsigned int hq = 0;
unsigned int Ni = (nx - l[0] - h[0]) * (ny - l[1] - h[1]) * (nz - l[2] - h[2]) * (nq - lq - hq);
unsigned int Nt = (nx) * (ny) * (nz) * (nq);
thrust::host_vector<double> V(Nt);
thrust::device_vector<double> Vd(Nt);
subvolume_map<unsigned int> SV(n, l, h, nq, lq, hq);
while (nt < 13)
{
omp_set_num_threads(nt);
std::cout << " --------------------- " << std::endl;
std::cout << " Using thrust with " << omp_get_max_threads() << " threads on CPU " << std::endl;
auto T = testFunctor(V.begin(), SV, Ni);
std::cout << " BandWidth (MB/s) using subvolume functor on actual iterator " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
T = testFunctor(V.begin(), Ni);
std::cout << " BandWidth (MB/s) using identity functor on actual iterator " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
nt += 1;
}
std::cout << " --------------------- " << std::endl;
std::cout << " Using thrust on GPU " << std::endl;
auto T = testFunctor(Vd.begin(), SV, Ni);
std::cout << " BandWidth (MB/s) using subvolume functor " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
T = testFunctor(Vd.begin(), Ni);
std::cout << " BandWidth (MB/s) using identity functor " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
std::cout << " testing the functor " << std::endl;
tictoc<double> Tind;
Tind.reset();
Tind.tic();
unsigned int i = 0;
#pragma omp parallel
#pragma omp for
for (i=0; i < Ni; ++i)
{
V[SV(i)]=0.;
}
T = Tind.toc();
std::cout << V[0] << std::endl;
std::cout << " Bandwidth (MB/s) for subvolume functor in while loop " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
Tind.reset();
Tind.tic();
i = 0;
#pragma omp parallel
#pragma omp for
for (i = 0; i <Ni; ++i)
{
V[thrust::identity<unsigned int>()(i)]=0.;
}
T = Tind.toc();
std::cout << V[0] << std::endl; // to avoid the compiler optimizing out the loop
std::cout << " Bandwidth (MB/s) for identity functor in while loop " << int(Ni * sizeof(double) / T / 1.e6) << std::endl;
return 0;
}
The difference between using the remainder vs backmultiply is actually pretty small. This is what we get using backmultiply
Using thrust with 1 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 650
BandWidth (MB/s) using identity functor on actual iterator 8704
Using thrust with 2 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 1290
BandWidth (MB/s) using identity functor on actual iterator 13111
Using thrust with 3 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 1943
BandWidth (MB/s) using identity functor on actual iterator 13310
Using thrust with 4 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 2586
BandWidth (MB/s) using identity functor on actual iterator 13248
Using thrust with 5 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3214
BandWidth (MB/s) using identity functor on actual iterator 13177
Using thrust with 6 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3841
BandWidth (MB/s) using identity functor on actual iterator 13131
Using thrust with 7 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 2812
BandWidth (MB/s) using identity functor on actual iterator 12853
Using thrust with 8 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3170
BandWidth (MB/s) using identity functor on actual iterator 13035
Using thrust with 9 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3374
BandWidth (MB/s) using identity functor on actual iterator 12938
Using thrust with 10 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3695
BandWidth (MB/s) using identity functor on actual iterator 13014
Using thrust with 11 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 4040
BandWidth (MB/s) using identity functor on actual iterator 13011
Using thrust with 12 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 4310
BandWidth (MB/s) using identity functor on actual iterator 13000
Using thrust on GPU
BandWidth (MB/s) using subvolume functor 94887
BandWidth (MB/s) using identity functor 157701
testing the functor
0
Bandwidth (MB/s) for subvolume functor in while loop 4449
0
Bandwidth (MB/s) for identity functor in while loop 12907
Whereas this is what we get using the remainder
Using thrust with 1 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 696
BandWidth (MB/s) using identity functor on actual iterator 8632
Using thrust with 2 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 1378
BandWidth (MB/s) using identity functor on actual iterator 13153
Using thrust with 3 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 2077
BandWidth (MB/s) using identity functor on actual iterator 13142
Using thrust with 4 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 2754
BandWidth (MB/s) using identity functor on actual iterator 13256
Using thrust with 5 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3450
BandWidth (MB/s) using identity functor on actual iterator 13178
Using thrust with 6 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 4116
BandWidth (MB/s) using identity functor on actual iterator 13202
Using thrust with 7 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 2898
BandWidth (MB/s) using identity functor on actual iterator 12563
Using thrust with 8 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3008
BandWidth (MB/s) using identity functor on actual iterator 13008
Using thrust with 9 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3382
BandWidth (MB/s) using identity functor on actual iterator 12903
Using thrust with 10 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 3757
BandWidth (MB/s) using identity functor on actual iterator 13074
Using thrust with 11 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 4107
BandWidth (MB/s) using identity functor on actual iterator 13013
Using thrust with 12 threads on CPU
BandWidth (MB/s) using subvolume functor on actual iterator 4393
BandWidth (MB/s) using identity functor on actual iterator 13045
Using thrust on GPU
BandWidth (MB/s) using subvolume functor 105597
BandWidth (MB/s) using identity functor 156678
testing the functor
0
Bandwidth (MB/s) for subvolume functor in while loop 4478
0
Bandwidth (MB/s) for identity functor in while loop 13073
The difference is minimal, compared to the difference between functor and identity. These tests were performed on a (oldish) machine with a Intel(R) Core™ i7 CPU X 990 @ 3.47GHz (6 physical cores, 12 virtual cores). Memory is DDR2. The GPU is a vintage GeForce GTX 680, 2GB ot total memory. nvcc V9.0, thrust V1.9.0, gcc V7.3.1, and compiled with -O3 optimization.
If I read the data correctly, on CPU the process seems CPU bound using the functor, since bandwidth saturates when we hit 6 threads, whereas using the identity functor it saturates at 2 threads. I guess the next step would be to inspect the assembly for the functor…