thrust::inner_product is very slow on thrust::device_vector for small vectors

When I use thrust::inner_product with thrust::device_vector, it runs very slowly for small vectors.

See the test:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/execution_policy.h>
#include <iostream>

int main(int argc, char** argv)
{
    const int vec_size = 1000000;
    const int iterations = 10000;

    float milliseconds = 0;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::host_vector<float> host_x(vec_size);
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector<float> host_y(vec_size);
    thrust::generate(host_y.begin(), host_y.end(), rand);

    printf("vector size = %lu floats\n", vec_size);

    cudaEventRecord(start);

    thrust::device_vector<float> device_x = host_x;
    thrust::device_vector<float> device_y = host_y;
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("copy (device)\t\t%f ms\n", milliseconds);

    float gpu_sum = 0.0f;

    cudaEventRecord(start);

    for (int i = 0; i < iterations; ++i)
    {
        gpu_sum += thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), 0.0f);
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (device)\t%f ms\n", milliseconds / iterations);

    float cpu_sum = 0.0f;

    cudaEventRecord(start);

    for (int i = 0; i < iterations; ++i)
    {
        cpu_sum += thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), 0.0f);
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (host)\t%f ms\n", milliseconds / iterations);

    std::cout << "GPU sum = " << gpu_sum << std::endl;
    std::cout << "CPU sum = " << cpu_sum << std::endl;

    return 0;
}

I get the following results on an AMD Ryzen 9 3950X + NVIDIA RTX 2080 Super (Release build, device: compute_75,sm_75):

inner.exe 3100000 1000
vector size = 24800000 bytes
copy (device)           6.553728 ms
inner_product (device)  0.503536 ms
inner_product (host)    0.000006 ms

The cost is tied to the number of thrust::inner_product calls or, more specifically, to the size of the batch processed per call.

Here are the results for several vector sizes with the code above:

vec_size (floats)    Device (ms)    Host (ms)
100                  0.030          0.000062
1,000                0.031          0.000730
10,000               0.068          0.007
100,000              0.081          0.070
1,000,000            0.37           0.71

So, based on this, every thrust::inner_product call executed on the device costs about 0.03 ms of fixed overhead, which exceeds the actual processing time until the batch size reaches roughly 100,000 elements.

Since my vectors were about 20 elements long, the slowdown was about 3000x, which I did not expect for data already resident on the device (GPU). I was sure the call itself would be significantly cheaper. Only seeing a benefit from about 100,000 elements upward, on a high-end GPU against a single-threaded CPU version, makes the profit of this approach questionable.

If I have got the wrong end of the stick, please let me know. I was sure Thrust was faster than that.

Is this “as expected”, or am I doing something wrong and could Thrust be faster here?

Can you change the code to calculate several of the small vectors at the same time?
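
One option (a minimal sketch, not tested against your exact setup) is to store the small vectors back to back in one buffer and reduce the whole batch with a single thrust::reduce_by_key call, so that one kernel launch covers all of the dot products. The names batched_dot, mul_pair and segment_of below are hypothetical:

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/tuple.h>

// Multiplies one (x, y) pair coming out of a zip_iterator.
struct mul_pair
{
    __host__ __device__ float operator()(const thrust::tuple<float, float>& t) const
    {
        return thrust::get<0>(t) * thrust::get<1>(t);
    }
};

// Maps a flat element index to the index of the small vector it belongs to.
struct segment_of
{
    int len;
    __host__ __device__ int operator()(int i) const { return i / len; }
};

// Computes num_vecs dot products of length vec_len in one Thrust call.
// x and y hold the vectors back to back; results receives one float each.
void batched_dot(const thrust::device_vector<float>& x,
                 const thrust::device_vector<float>& y,
                 thrust::device_vector<float>& results,
                 int num_vecs, int vec_len)
{
    auto products = thrust::make_transform_iterator(
        thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
        mul_pair());
    auto keys = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0), segment_of{vec_len});

    thrust::device_vector<int> out_keys(num_vecs);  // segment ids, discarded
    thrust::reduce_by_key(keys, keys + num_vecs * vec_len, products,
                          out_keys.begin(), results.begin());
}

With 20-element vectors, batching 10,000 of them turns 10,000 tiny launches into one reduction over 200,000 elements, which is already in the range where the device column of your table starts to win.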

Invoking a kernel takes a few microseconds, so the duration of the Thrust call can perhaps be lowered a bit, but it will be difficult to get far below your host number for vec_size 10,000, which is already 7 µs.
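
Part of the per-call cost is not the launch itself but the implicit synchronization: thrust::inner_product has to copy the scalar result back to the host on every call. A hedged sketch of one way around that, keeping the result in device memory with CUB (which ships with the CUDA Toolkit); dot_on_device is a made-up name, and mul_pair is the functor from the sketch above:

#include <cub/device/device_reduce.cuh>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// Enqueues one dot product on `stream`; the sum lands in *d_result on the
// device, so the host does not block here.
void dot_on_device(const float* d_x, const float* d_y, float* d_result,
                   int n, cudaStream_t stream)
{
    auto products = thrust::make_transform_iterator(
        thrust::make_zip_iterator(thrust::make_tuple(d_x, d_y)), mul_pair());

    // CUB's two-phase API: the first call only computes the scratch size.
    void*  d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, products, d_result, n, stream);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, products, d_result, n, stream);
    cudaFree(d_temp);  // in real code, cache this allocation across calls
}

The host then pays for a copy (and a synchronization) once, when it finally reads *d_result back, instead of once per inner product.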

Can you move more steps of your algorithm to the GPU?
With only 20 elements, the work would probably stay within one block/one SM on the GPU, or even within one warp or a single thread.
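
For ~20 elements, a hand-written kernel that uses a single warp is about as lean as the device side can get; a sketch (warp_dot is a made-up name) using warp shuffles:

// One warp computes the dot product of two vectors of up to 32 elements.
__global__ void warp_dot(const float* x, const float* y, float* result, int n)
{
    int lane = threadIdx.x;                     // launched with 32 threads
    float v = (lane < n) ? x[lane] * y[lane] : 0.0f;

    // Reduce within the warp with shuffles; no shared memory needed.
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, offset);

    if (lane == 0)
        *result = v;                            // lane 0 holds the full sum
}

// Usage: warp_dot<<<1, 32>>>(d_x, d_y, d_result, 20);

Even then, every call still pays the few-microsecond launch cost, so batching many small vectors into one launch, as sketched above, is usually the bigger win.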