How to use thrust::async::for_each with CUDA streams?

Hello,

I would like to use CUDA streams with Thrust to asynchronously run thrust::for_each on the GPU while the CPU thread continues and executes other code.

thrust::async::for_each(thrust::cuda::par.on(stream), data.begin(), data.end(), functor);
// .... more CPU code ...
cudaStreamSynchronize(stream);

But I get a static assertion failure at compile time:
error: static assertion failed with “this algorithm is not implemented for the specified system”

I’m using CUDA 11.0.2 and GCC 8.3.1. I believe I read in the changelogs that thrust::async::for_each was introduced around CUDA 10.

Any help is appreciated.

I believe thrust::for_each should already be asynchronous. There shouldn’t be any need to use ::async:: for what you have described so far.

The system referred to in the error message is the one specified in your execution policy: thrust::cuda::par.on(stream). So it seems evident that thrust doesn’t like the combination of that execution policy with ::async::, but I believe that combination should work fine with ordinary thrust::for_each.
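
For example (a minimal sketch of that combination, assuming data is a thrust::device_vector and functor is callable on the device):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>

struct functor
{
  __host__ __device__
  void operator()(int &x){ x++; }
};

int main(){
  thrust::device_vector<int> data(1000000, 1);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // ordinary thrust::for_each takes a stream through the execution policy
  thrust::for_each(thrust::cuda::par.on(stream), data.begin(), data.end(), functor());
  cudaStreamSynchronize(stream);
}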

Thanks for your reply.

According to the changelog:

Thrust 1.9.4 (CUDA Toolkit 10.1)
Synchronous Thrust algorithms now block until all of their operations have completed. Use the new asynchronous Thrust algorithms for non-blocking behavior.

And later in the same version’s changelog:

  • So, now thrust::for_each , thrust::transform , thrust::sort , etc are truly synchronous. In some cases this may be a performance regression; if you need asynchrony, use the new asynchronous algorithms.

In performance testing, thrust::for_each takes ~0.27 seconds to return. A kernel launch that does nothing takes ~0.004 seconds, and a cudaDeviceSynchronize placed right after the thrust::for_each takes ~0.0001 seconds, so the time is being spent in the for_each call itself, i.e. it is blocking. Timing pseudo-code below:

timer.start();
thrust::for_each(...);
timer.stop();
std::cout << timer.time_elapsed(); // ~0.27 seconds

timer.start();
cudaDeviceSynchronize();
timer.stop();
std::cout << timer.time_elapsed(); // ~0.0001 seconds
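
For reference, a self-contained version of that measurement (my timer class isn’t shown; std::chrono gives the same picture, and the data size here is just illustrative):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <chrono>
#include <iostream>

struct f
{
  __host__ __device__
  void operator()(int &t){ t++; }
};

int main(){
  thrust::device_vector<int> d(10000000, 1);
  auto t0 = std::chrono::high_resolution_clock::now();
  thrust::for_each(d.begin(), d.end(), f());  // returns only after the work is done
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaDeviceSynchronize();                    // almost nothing left to wait for
  auto t2 = std::chrono::high_resolution_clock::now();
  std::cout << "for_each: " << std::chrono::duration<double>(t1-t0).count() << " s\n";
  std::cout << "sync:     " << std::chrono::duration<double>(t2-t1).count() << " s\n";
}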

Sorry, you are correct. It is now synchronous. Unfortunately the async algorithms are not well documented.

Alright, no problem.

Also, I have tried

thrust::async::for_each(thrust::device, data.begin(), data.end(), functor)

with the same static assertion error.

This compiles and seems to produce the expected output:

$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>


__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}
struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ t++;}
};
const int ds = 1000000;
const int nTPB = 256;
int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  // capture the returned event; as I understand it, if the returned object is
  // immediately discarded, its destructor may block until the work completes
  auto x = thrust::async::for_each(thrust::device.on(s1), d_A.begin(), d_A.end(), f());
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
$

I haven’t verified asynchronicity, however. Also, I am using CUDA 11.3 with gcc 8.3.1.

It seems to be asynchronous:

$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}


__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}
struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ t++;}
};
const int ds = 10000000;
const int nTPB = 256;
int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  unsigned long long dt0 = dtime_usec(0);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  unsigned long long dt1 = dtime_usec(dt0);
  auto x = thrust::async::for_each(thrust::device.on(s1), d_A.begin(), d_A.end(), f());
  unsigned long long dt2 = dtime_usec(dt0);
  cudaDeviceSynchronize();
  unsigned long long dt3 = dtime_usec(dt0);
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "Timing: " << dt1 << ", " << dt2 << ", " << dt3 << std::endl;
}
$ nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
Timing: 9, 24, 1899
$
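
So both the raw kernel launch (returns after 9 us) and the thrust::async::for_each call (another ~15 us) come back essentially immediately, and the ~1.9 ms of actual work is absorbed by the cudaDeviceSynchronize.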

Interesting, maybe it’s my CUDA version. I’ll try your code on our machine and see if it works there. Thanks!

I didn’t have any trouble compiling my code on CUDA 11.1. I don’t happen to have CUDA 11.0 handy. Actually, this also works, so I’m not sure what kind of problem you are having; maybe it is specific to CUDA 11.0:

$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <iterator>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}


__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}

struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ if (t==2) t++;}
};

const int ds = 10000000;
const int nTPB = 256;

int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  unsigned long long dt0 = dtime_usec(0);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  unsigned long long dt1 = dtime_usec(dt0);
  auto x = thrust::async::for_each(thrust::cuda::par.on(s1), d_A.begin(), d_A.end(), f());
  unsigned long long dt2 = dtime_usec(dt0);
  cudaDeviceSynchronize();
  unsigned long long dt3 = dtime_usec(dt0);
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "Timing: " << dt1 << ", " << dt2 << ", " << dt3 << std::endl;
}
$ /usr/local/cuda-11.1/bin/nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
Timing: 9, 24, 1900
$

To proceed further I would need to see a complete example from your side.

Later: I found a system with CUDA 11.0 on it. The above code compiles correctly there as well. So I’m not sure what issue you are running into.

oh.

Is data a thrust::device_vector? If it is a std::vector or a host_vector, you might be seeing that static assert.

It’s a std::vector with a custom allocator that allocates via cudaMallocManaged.
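
Roughly like this (a simplified sketch, not our exact code):

#include <vector>
#include <cuda_runtime.h>

// simplified stand-in for our real allocator
template <typename T>
struct managed_allocator {
  using value_type = T;
  managed_allocator() = default;
  template <typename U>
  managed_allocator(const managed_allocator<U> &) {}
  T *allocate(std::size_t n){
    T *p = nullptr;
    cudaMallocManaged(&p, n*sizeof(T));  // no error checking in this sketch
    return p;
  }
  void deallocate(T *p, std::size_t){ cudaFree(p); }
};
template <typename T, typename U>
bool operator==(const managed_allocator<T> &, const managed_allocator<U> &){ return true; }
template <typename T, typename U>
bool operator!=(const managed_allocator<T> &, const managed_allocator<U> &){ return false; }

// and data is declared as:
// std::vector<float, managed_allocator<float>> data;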

My guess would be that this is the source of the problem. Thrust now has a universal_vector that may be interesting.
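
Something like this might be a drop-in replacement (a sketch, untested; universal_vector lives in <thrust/universal_vector.h>, allocates managed memory, and its iterators should carry a system tag that the async algorithms accept):

#include <thrust/universal_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/execution_policy.h>
#include <iostream>

struct f
{
  __host__ __device__
  void operator()(float &x){ x *= 2.0f; }
};

int main(){
  thrust::universal_vector<float> data(1000000, 1.0f);  // managed (unified) memory
  cudaStream_t s;
  cudaStreamCreate(&s);
  auto e = thrust::async::for_each(thrust::cuda::par.on(s), data.begin(), data.end(), f());  // hold the returned event
  // ... more CPU code ...
  cudaStreamSynchronize(s);
  std::cout << data[0] << std::endl;  // host access is fine after synchronization
}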

Ah, ok, thanks for your help.

It looks like universal_vector is effectively a std::vector that uses managed memory? Or are begin(), end(), and operator[] usable in device code?

AFAIK, it shares its interface with host_vector, so its methods are not callable in device code.