Hello,
I would like to use CUDA streams with Thrust to asynchronously run thrust::for_each on the GPU while the CPU thread continues and executes other code.
thrust::async::for_each(thrust::cuda::par.on(stream), data.begin(), data.end(), functor);
// .... more CPU code ...
cudaStreamSynchronize(stream);
But I get a static assertion failure at compile time:
error: static assertion failed with “this algorithm is not implemented for the specified system”
I’m using CUDA 11.0.2 and GCC 8.3.1. I believe I read in the changelogs that thrust::async::for_each was implemented in CUDA 10.
Any help is appreciated.
I believe thrust::for_each should already be asynchronous. There shouldn’t be any need to use ::async:: for what you have described so far.
The system is the one being specified in your execution policy: thrust::cuda::par.on(stream). So it seems evident that thrust doesn’t like the combination of that execution policy with ::async::, but I believe that combination should work fine with ordinary thrust::for_each.
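For example (a minimal sketch, assuming data is a thrust::device_vector and stream has already been created):

thrust::for_each(thrust::cuda::par.on(stream), data.begin(), data.end(), functor);
// work is issued to stream; in my understanding, no ::async:: is needed for this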
Thanks for your reply.
According to the changelog:
Thrust 1.9.4 (CUDA Toolkit 10.1)
Synchronous Thrust algorithms now block until all of their operations have completed. Use the new asynchronous Thrust algorithms for non-blocking behavior.
And later in the same version’s changelog:
- So, now thrust::for_each, thrust::transform, thrust::sort, etc. are truly synchronous. In some cases this may be a performance regression; if you need asynchrony, use the new asynchronous algorithms.
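As I read it, the distinction is roughly this (sketch, with v a thrust::device_vector and f a device-callable functor):

// blocks the calling thread until the device work is done (Thrust >= 1.9.4)
thrust::for_each(thrust::device, v.begin(), v.end(), f());

// returns immediately; the returned event can be waited on later
auto e = thrust::async::for_each(thrust::device, v.begin(), v.end(), f());
e.wait();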
In performance testing, my thrust::for_each call takes ~0.27 seconds to return. A kernel launch that does nothing takes ~0.004 seconds. A cudaDeviceSynchronize after the thrust::for_each takes ~0.0001 seconds, which suggests the for_each call itself is blocking. Timing pseudo-code below:
timer.start();
thrust::for_each(...);
timer.stop();
std::cout << timer.time_elapsed(); // ~0.27 seconds
timer.start();
cudaDeviceSynchronize();
timer.stop();
std::cout << timer.time_elapsed(); // ~0.0001 seconds
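For reference, a self-contained version of that measurement using std::chrono (a sketch; the data size and functor are stand-ins, not my actual code):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <chrono>
#include <iostream>

struct incr { __host__ __device__ void operator()(int &t){ t++; } };

int main(){
  thrust::device_vector<int> d(1<<24, 1);
  auto t0 = std::chrono::high_resolution_clock::now();
  thrust::for_each(d.begin(), d.end(), incr());
  auto t1 = std::chrono::high_resolution_clock::now();
  cudaDeviceSynchronize();   // near-instant if for_each already blocked
  auto t2 = std::chrono::high_resolution_clock::now();
  std::cout << "for_each: " << std::chrono::duration<double>(t1 - t0).count()
            << " s, sync: " << std::chrono::duration<double>(t2 - t1).count() << " s\n";
}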
Sorry, you are correct. It is now synchronous. Unfortunately the async algorithms are not well documented.
Alright, no problem.
Also, I have tried thrust::async::for_each(thrust::device, data.begin(), data.end(), functor) with the same static error.
This will compile and seems to produce the expected output:
$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>

// plain CUDA kernel: increments each of the n elements of d
__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}

// functor for for_each: increments the referenced element
struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ t++;}
};

const int ds = 1000000;
const int nTPB = 256;

int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  auto x = thrust::async::for_each(thrust::device.on(s1), d_A.begin(), d_A.end(), f());
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
$
I haven’t verified asynchronicity, however. Also, I am using CUDA 11.3 and GCC 8.3.1.
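Note that the async algorithms return an event; as far as I know you can wait on it directly instead of synchronizing the stream (sketch):

auto x = thrust::async::for_each(thrust::device.on(s1), d_A.begin(), d_A.end(), f());
// ... other host work ...
x.wait();   // blocks until the for_each submitted to s1 has completed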
It seems to be asynchronous:
$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// host-side wall-clock timer in microseconds
unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}

struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ t++;}
};

const int ds = 10000000;
const int nTPB = 256;

int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  unsigned long long dt0 = dtime_usec(0);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  unsigned long long dt1 = dtime_usec(dt0);  // time to launch the kernel
  auto x = thrust::async::for_each(thrust::device.on(s1), d_A.begin(), d_A.end(), f());
  unsigned long long dt2 = dtime_usec(dt0);  // time after the async launch returns
  cudaDeviceSynchronize();
  unsigned long long dt3 = dtime_usec(dt0);  // time after all device work completes
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "Timing: " << dt1 << ", " << dt2 << ", " << dt3 << std::endl;
}
$ nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
Timing: 9, 24, 1899
$
Interesting, maybe it’s my CUDA version. I’ll try your code on our machine and see if it works there. Thanks!
I didn’t have any trouble compiling my code on CUDA 11.1. I don’t happen to have CUDA 11.0 handy. Actually, the following also works, so I’m not sure what kind of problem you are having; maybe it is specific to CUDA 11.0:
$ cat t140.cu
#include <thrust/device_vector.h>
#include <thrust/async/for_each.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <iterator>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__global__ void k(int *d, int n){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n) d[idx]++;
}

// this variant only increments elements that are currently 2
struct f
{
  template <typename T>
  __host__ __device__
  void operator()(T &t){ if (t==2) t++;}
};

const int ds = 10000000;
const int nTPB = 256;

int main(){
  thrust::device_vector<int> d_A(ds,1);
  cudaStream_t s1;
  cudaStreamCreate(&s1);
  unsigned long long dt0 = dtime_usec(0);
  k<<<(ds+nTPB-1)/nTPB, nTPB, 0, s1>>>(thrust::raw_pointer_cast(d_A.data()), ds);
  unsigned long long dt1 = dtime_usec(dt0);
  // same as before, but with the thrust::cuda::par.on(stream) policy
  auto x = thrust::async::for_each(thrust::cuda::par.on(s1), d_A.begin(), d_A.end(), f());
  unsigned long long dt2 = dtime_usec(dt0);
  cudaDeviceSynchronize();
  unsigned long long dt3 = dtime_usec(dt0);
  thrust::copy_n(d_A.begin(), 5, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "Timing: " << dt1 << ", " << dt2 << ", " << dt3 << std::endl;
}
$ /usr/local/cuda-11.1/bin/nvcc -o t140 t140.cu -std=c++14
$ ./t140
3,3,3,3,3,
Timing: 9, 24, 1900
$
To proceed further, I would need to see a complete example from your side.
Later: I found a system with CUDA 11.0 on it. The above code compiles correctly there as well. So I’m not sure what issue you are running into.
Oh. Is data a thrust::device_vector? If it is a std::vector or a host_vector, you might be seeing that static assert.
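For example, I would expect something like this to hit that assert, since the iterators carry a host system tag (sketch, untested):

std::vector<int> h(ds, 1);
// error: static assertion failed: "this algorithm is not implemented for the specified system"
auto e = thrust::async::for_each(thrust::device, h.begin(), h.end(), f());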
It’s a std::vector that uses a custom allocator based on cudaMallocManaged.
My guess would be that that is the source of the problem. Thrust now has a universal_vector that may be interesting.
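Something like this might work as a drop-in alternative (a sketch; assumes a Thrust version recent enough to ship <thrust/universal_vector.h>):

#include <thrust/universal_vector.h>

thrust::universal_vector<int> data(ds, 1);   // backed by managed (cudaMallocManaged) memory
// the iterators carry a system tag thrust recognizes, so this should dispatch correctly:
auto e = thrust::async::for_each(thrust::cuda::par.on(s1), data.begin(), data.end(), f());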
Ah, ok, thanks for your help.
It looks like universal_vector is effectively a std::vector that uses managed memory? Or are begin(), end(), and operator[] usable in device code?
AFAIK, it shares its interface with host_vector, so its methods are not callable in device code.
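As I understand it, you would index it from host code, and for device code you would pass a raw pointer, e.g. (sketch, reusing the kernel from the earlier example):

thrust::universal_vector<int> v(ds, 1);
v[0] = 42;                                    // host-side access works like host_vector
int *p = thrust::raw_pointer_cast(v.data());  // raw managed pointer, usable in a kernel
k<<<(ds+nTPB-1)/nTPB, nTPB>>>(p, ds);
cudaDeviceSynchronize();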