Performance of memory coalescing in CUDA using Thrust: structure of arrays vs. array of structures

Following a plethora of discussions on the NVIDIA forums, and even some articles, I thought I would try a fair test of the performance hit caused by memory coalescing issues when using the Thrust library. I was in for a big surprise.

Let’s start with a simple example code supplied with the Thrust library. I’ve modified it slightly in order to illustrate that the performance improvement claimed in the code’s comments is not due to memory coalescing, but rather to zip iterator manipulation and tuple unpacking.

Here is what I’ve added to this example code:

struct LessThan {
  __host__ __device__
  bool operator()(const thrust::tuple<int, float>& lhs, const thrust::tuple<int, float>& rhs) const {
    return thrust::get<0>(lhs) < thrust::get<0>(rhs);
  }
};
    
...

// In the main() function
// Sort Key-Value pairs using Structure of Arrays (SoA) storage and zip iterator
{
  thrust::device_vector<int>   keys(N);
  thrust::device_vector<float> values(N);
  
  LessThan lt;
    
  initialize_keys(keys);
    
  timer t;
    
  thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), values.begin())),
               thrust::make_zip_iterator(thrust::make_tuple(keys.end(), values.end())), lt);
    
  std::cout << "SoA sort using zip_iterator took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
    
  assert(thrust::is_sorted(keys.begin(), keys.end()));
}

On a Quadro K620M using nvcc 9.2, I got the following results:

AoS sort took 39.5851 milliseconds
SoA sort took 22.7603 milliseconds
SoA sort using zip_iterator took 44.8267 milliseconds

Increasing N by a factor of 8, in order to reduce the relative overhead of kernel launches, zip iterator creation, and so on, yields a very similar performance ratio. Removing the LessThan functor does not change the outcome either.
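For reference, the AoS and plain SoA timings above come from the unmodified parts of that Thrust example (I believe the file is examples/sorting_aos_vs_soa.cu). Paraphrased from memory, so the details may differ slightly, those two blocks look roughly like this:

// AoS: sort a device_vector of structs directly; MyStruct bundles an int key
// and a float value, and its operator< compares the keys
{
  thrust::device_vector<MyStruct> structures(N);
  initialize_keys(structures);

  timer t;
  thrust::sort(structures.begin(), structures.end());
  std::cout << "AoS sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
}

// SoA: keep keys and values in separate vectors and let sort_by_key permute both
{
  thrust::device_vector<int>   keys(N);
  thrust::device_vector<float> values(N);
  initialize_keys(keys);

  timer t;
  thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
  std::cout << "SoA sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
}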

At this point, I suspected that the comparison might be unfair because of how the sort function works internally and because of its memory-to-computation ratio. So I decided to try Nathan’s example from the article I mentioned at the beginning. It is a simple transform, so memory coalescing effects should show up. Here is the piece of code that I tried:

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/generate.h>
#include <thrust/transform.h>
#include <iostream>
#include "include/timer.h"
using namespace thrust;
using namespace thrust::random;

struct float_3 {
    float x;
    float y;
    float z;
};

struct rotate_float_3 {
    __host__ __device__
    float_3 operator() (const float_3& t) const {
        float x = t.x;
        float y = t.y;
        float z = t.z;
        float_3 result;
        result.x = 0.36f * x + 0.48f * y - 0.80f * z;
        result.y =-0.80f * x + 0.60f * y + 0.00f * z;
        result.z = 0.48f * x + 0.64f * y + 0.60f * z;
        return result;
    }
};

struct rotate_tuple {
    __host__ __device__
    tuple<float, float, float> operator()(const tuple<float,float,float>& t) const {
        float x = get<0>(t);
        float y = get<1>(t);
        float z = get<2>(t);
        float rx = 0.36f * x + 0.48f * y - 0.80f * z;
        float ry =-0.80f * x + 0.60f * y + 0.00f * z;
        float rz = 0.48f * x + 0.64f * y + 0.60f * z;
        return make_tuple(rx, ry, rz);
    }
};

struct random_fill {
    random_fill() : rng(), dist(0.0, 1.0) { }
    __host__ __device__
    float operator() () {
        return dist(rng);
    }
    default_random_engine rng;
    uniform_real_distribution<float> dist;
};

struct random_fill_3 {
    random_fill_3() : rng(), dist(0.0, 1.0) { }
    __host__ __device__
    float_3 operator() () {
        float_3 result;
        result.x = dist(rng); result.y = dist(rng); result.z = dist(rng);
        return result;
    }
    default_random_engine rng;
    uniform_real_distribution<float> dist;
};

int main(void)
{
    size_t N = 512 * 1024 * 1024;
    {
        device_vector<float_3> xyz(N);
        generate(xyz.begin(), xyz.end(), random_fill_3());
        timer t;
        transform(xyz.begin(), xyz.end(), xyz.begin(), rotate_float_3());
        std::cout << "AOS took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
    }
    {
        device_vector<float> x(N), y(N), z(N);
        generate(x.begin(), x.end(), random_fill());
        generate(y.begin(), y.end(), random_fill());
        generate(z.begin(), z.end(), random_fill());
        timer t;

        transform(make_zip_iterator(make_tuple(x.begin(), y.begin(), z.begin())),
                  make_zip_iterator(make_tuple(x.end(), y.end(), z.end())),
                  make_zip_iterator(make_tuple(x.begin(), y.begin(), z.begin())),
                  rotate_tuple());
        std::cout << "SOA took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
    }
}

I’ve tried this code on a GTX 1070 using the nvcc 9.2 compiler. I’m getting very different results from what Nathan reports in his article:

AOS took 67.4427 milliseconds
SOA took 64.5253 milliseconds

As far as I can tell, the results indicate that the tuple unpacking and zip iterator overhead in the Thrust library overshadows the potential benefit of memory coalescing. However, I may be doing something wrong.
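For context, here is a quick back-of-the-envelope sanity check of the transform numbers above (my own arithmetic, so please double-check it): each element, whether stored as a float_3 or as three separate floats, is 12 bytes and is read once and written once by the transform.

#include <cstdio>

int main()
{
    // Effective bandwidth implied by the timings above, assuming exactly one
    // 12-byte read and one 12-byte write per element.
    const double N = 512.0 * 1024.0 * 1024.0;
    const double bytes_moved = 2.0 * N * 12.0;                         // ~12.9 GB per pass
    std::printf("AOS: %.0f GB/s\n", bytes_moved / 67.4427e-3 / 1e9);   // ~191 GB/s
    std::printf("SOA: %.0f GB/s\n", bytes_moved / 64.5253e-3 / 1e9);   // ~200 GB/s
    return 0;
}

If that arithmetic is right, both versions already run at a large fraction of the GTX 1070’s theoretical ~256 GB/s, which makes it even harder for me to see where the coalescing penalty is supposed to show up.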

The question is: can anybody provide a fair example in which the performance hit from poor memory coalescing is clearly visible specifically when using the Thrust library? Please note that I’ve seen a couple of examples on SO using user-defined kernels, but I have never seen one using Thrust.

Thank you!