Hi, all. I have done some testing by calling copy_if function on both gpu and cpu data.
I have found when using gpu data, thrust::copy_if is really slow. Here is the result:
using GPU looping: 900 us
using CPU looping: 10 us
On Ubuntu 14.04, cuda-8.0, driver 384.130, device is Tesla P4.
/usr/local/cuda/bin/nvcc -arch=sm_61 -std=c++11 -O2 copy_if.cu -o output
Below is the code of copy_if.cu:
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
struct dummy {
__host__ __device__
bool operator()(const int index) { return true; }
};
int main() {
cudaStream_t stream;
cudaStreamCreate(&stream);
int num = 56670;
int *origin_gpu;
cudaMalloc(&origin_gpu, num * sizeof(int));
int *idx_gpu;
cudaMalloc(&idx_gpu, num * sizeof(int));
int *origin_cpu = (int *)malloc(num * sizeof(int));
int *idx_cpu = (int *)malloc(num * sizeof(int));
auto start = std::chrono::steady_clock::now();
thrust::device_ptr<int> origin_ptr(origin_gpu);
thrust::device_ptr<int> idx_ptr(idx_gpu);
auto end = std::chrono::steady_clock::now();
std::cout << "device_ptr: "
<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
<< " us"
<< std::endl;
{
for (int i=0; i<10; i++) {
start = std::chrono::steady_clock::now();
auto idx_end = thrust::copy_if(
origin_ptr,
origin_ptr + num,
idx_ptr,
dummy());
end = std::chrono::steady_clock::now();
std::cout << "copy_if gpu: "
<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
<< " us"
<< std::endl;
}
}
{
for (int i=0; i<10; i++) {
start = std::chrono::steady_clock::now();
auto idx_end = thrust::copy_if(
origin_cpu,
origin_cpu + num,
idx_cpu,
dummy());
end = std::chrono::steady_clock::now();
std::cout << "copy_if cpu: "
<< std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
<< " us"
<< std::endl;
}
}
cudaFree(origin_gpu);
cudaFree(idx_gpu);
free(origin_cpu);
free(idx_cpu);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
std::cerr << cudaGetErrorString(error) << std::endl;
exit(EXIT_FAILURE);
}
return 0;
}