When I try to use `thrust::inner_product` with `thrust::device_vector`, it runs very slowly for small batches. See the test:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/execution_policy.h>
#include <iostream>
int main(int argc, char** argv)
{
const int vec_size = 1000000;
const int iterations = 10000;
float milliseconds = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
thrust::host_vector<float> host_x(vec_size);
thrust::generate(host_x.begin(), host_x.end(), rand);
thrust::host_vector<float> host_y(vec_size);
thrust::generate(host_y.begin(), host_y.end(), rand);
printf("vector size = %lu floats\n", vec_size);
cudaEventRecord(start);
thrust::device_vector<float> device_x = host_x;
thrust::device_vector<float> device_y = host_y;
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("copy (device)\t\t%f ms\n", milliseconds);
float gpu_sum = 0.0f;
cudaEventRecord(start);
for (int i = 0; i < iterations; ++i)
{
gpu_sum += thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), 0.0f);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("inner_product (device)\t%f ms\n", milliseconds / iterations);
float cpu_sum = 0.0f;
cudaEventRecord(start);
for (int i = 0; i < iterations; ++i)
{
cpu_sum += thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(),0.0f);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
printf("inner_product (host)\t%f ms\n", milliseconds / iterations);
std::cout << "GPU sum = " << gpu_sum << std::endl;
std::cout << "CPU sum = " << cpu_sum << std::endl;
return 0;
}
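One caveat about the test itself: the host loop is also bracketed by CUDA events. Events are timestamped on the GPU, and (on Windows at least) the driver may batch and submit both records together, so the host numbers below may be unreliable. A variant of the host timing section using `std::chrono` (a sketch, not what produced the numbers below) would be:

```cpp
// Drop-in replacement for the host timing section above (sketch only;
// needs #include <chrono>). std::chrono times the CPU work directly.
float cpu_sum = 0.0f;
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < iterations; ++i)
{
    cpu_sum += thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), 0.0f);
}
auto t1 = std::chrono::steady_clock::now();
double host_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
printf("inner_product (host)\t%f ms\n", host_ms / iterations);
```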
I get the following results on an AMD Ryzen 9 3950X + NVIDIA RTX 2080 Super (Release build, device code compiled for compute_75,sm_75):
```
inner.exe 3100000 1000
vector size = 24800000 bytes
copy (device)           6.553728 ms
inner_product (device)  0.503536 ms
inner_product (host)    0.000006 ms
```
The slowdown is tied to the number of calls to `thrust::inner_product`, or, more precisely, to the size of the batch each call processes.
Here are the per-call timings for several vector sizes with the code above:
| vec_size (floats) | Device (per call) | Host (per call) |
|---|---|---|
| 100 | 0.030 ms | 0.000062 ms |
| 1,000 | 0.031 ms | 0.000730 ms |
| 10,000 | 0.068 ms | 0.007 ms |
| 100,000 | 0.081 ms | 0.070 ms |
| 1,000,000 | 0.37 ms | 0.71 ms |
So, based on this, every call to `thrust::inner_product` executed on the device costs about 0.03 ms, which dwarfs the actual processing time until the batch size reaches roughly 100,000 elements.
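To double-check that this is a fixed per-call cost, the call can be timed on a deliberately tiny device vector, where the arithmetic is negligible and essentially everything measured is launch plus synchronization overhead. A standalone sketch (the sizes are arbitrary, not from the test above):

```cpp
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <chrono>
#include <cstdio>

int main()
{
    // Tiny inputs: the arithmetic is negligible, so what remains is per-call overhead.
    thrust::device_vector<float> x(16, 1.0f);
    thrust::device_vector<float> y(16, 2.0f);
    const int iterations = 10000;
    float sum = 0.0f;

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i)
    {
        // Returns the result to the host, so each call synchronizes with the GPU.
        sum += thrust::inner_product(x.begin(), x.end(), y.begin(), 0.0f);
    }
    auto t1 = std::chrono::steady_clock::now();

    double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    printf("per-call cost ~ %f ms (sum = %f)\n", ms / iterations, sum);
    return 0;
}
```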
Since my vectors hold about 20 elements each, the slowdown in my case was roughly 3000×, which I did not expect for data already allocated on the device (GPU). I was sure the call itself would be significantly cheaper. Indeed, if a high-end GPU only starts to pay off at around 100,000 elements against a single-threaded CPU version, the profit of this approach is arguable.
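For my actual use case (many ~20-element vectors), the only workaround I can think of is to pack all the small vectors contiguously and fuse every dot product into a single launch with `thrust::reduce_by_key`. A sketch under that packing assumption; `segment_key`, `mul_pair`, and the sizes below are mine, not from the test above:

```cpp
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/tuple.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <cstdio>

// Maps flat element index i to the index of the small vector it belongs to.
struct segment_key
{
    int len;
    __host__ __device__ int operator()(int i) const { return i / len; }
};

// Multiplies the paired elements of x and y.
struct mul_pair
{
    __host__ __device__ float operator()(const thrust::tuple<float, float>& t) const
    {
        return thrust::get<0>(t) * thrust::get<1>(t);
    }
};

int main()
{
    const int len = 20;        // elements per small vector
    const int batch = 10000;   // number of dot products
    const int n = len * batch;

    // All small vectors packed back to back in two big device arrays.
    thrust::device_vector<float> x(n, 1.0f);
    thrust::device_vector<float> y(n, 2.0f);
    thrust::device_vector<float> dots(batch);  // one dot product per small vector

    auto keys = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0), segment_key{len});
    auto products = thrust::make_transform_iterator(
        thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())), mul_pair{});

    // One launch computes all `batch` dot products at once.
    thrust::reduce_by_key(keys, keys + n, products,
                          thrust::make_discard_iterator(), dots.begin());

    printf("dots[0] = %f (expected %f)\n", (float)dots[0], 2.0f * len);
    return 0;
}
```

Whether this amortizes the overhead enough for my case, I have not verified.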
If I have got the wrong end of the stick, please let me know; I was sure Thrust was faster than that.
Is this expected, or am I doing something wrong and Thrust could be faster here?