thrust copy_if function is slow on gpu data

Hi, all. I have done some testing by calling copy_if function on both gpu and cpu data.

I have found when using gpu data, thrust::copy_if is really slow. Here is the result:

using GPU looping: 900 us
using CPU looping: 10 us

On Ubuntu 14.04, cuda-8.0, driver 384.130, device is Tesla P4.

/usr/local/cuda/bin/nvcc -arch=sm_61 -std=c++11 -O2 copy_if.cu -o output

Below is the code of copy_if.cu:

#include <chrono>
#include <cstddef>
#include <cstdlib> 
#include <iostream>
#include <string>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

struct dummy {
  __host__ __device__
  bool operator()(const int index) { return true; }
};

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int num = 56670;
  int *origin_gpu;
  cudaMalloc(&origin_gpu, num * sizeof(int));
  int *idx_gpu;
  cudaMalloc(&idx_gpu, num * sizeof(int));

  int *origin_cpu = (int *)malloc(num * sizeof(int));
  int *idx_cpu = (int *)malloc(num * sizeof(int));

  auto start = std::chrono::steady_clock::now();
  thrust::device_ptr<int> origin_ptr(origin_gpu);
  thrust::device_ptr<int> idx_ptr(idx_gpu);
  auto end = std::chrono::steady_clock::now();
  std::cout << "device_ptr: "
            << std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
            << " us"
            << std::endl;
  {
    for (int i=0; i<10; i++) {
    start = std::chrono::steady_clock::now();
    auto idx_end = thrust::copy_if(
        origin_ptr,
        origin_ptr + num,
        idx_ptr,
        dummy());
    end = std::chrono::steady_clock::now();
    std::cout << "copy_if gpu: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
              << " us"
              << std::endl;
    }
  }

  {
    for (int i=0; i<10; i++) {
    start = std::chrono::steady_clock::now();
    auto idx_end = thrust::copy_if(
        origin_cpu,
        origin_cpu + num,
        idx_cpu,
        dummy());
    end = std::chrono::steady_clock::now();
    std::cout << "copy_if cpu: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()
              << " us"
              << std::endl;
    }
  }

  cudaFree(origin_gpu);
  cudaFree(idx_gpu);

  free(origin_cpu);
  free(idx_cpu);

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
     std::cerr << cudaGetErrorString(error) << std::endl;
     exit(EXIT_FAILURE);
  }
  return 0;
}

thrust::copy_if is doing a cudaMalloc operation under the hood. You can discover this with careful use of a profiler. This is slowing down the device operation.

You can take control of thrust temporary allocations to avoid this effect if you wish.

[url]algorithm - Stream compaction and transform based on the index in CUDA - Stack Overflow

Thanks a lot.