Hi All,
According to my tests which are detailed below, it seems that thrust::copy is asynchronous with cuda7.0.
Same tests indicate that with cuda6.5 thrust::copy is synchronous.
Is it indeed correct?
+
Which thrust functions are asynchronous (transform, copy_if, reduce_by_key etc)?
Here’s the test procedure:
- Fill a device buffer with 0s and a host buffer with 1s.
- Copy from device to host buffer using thrust::copy.
- Count host elements which are !=0 before and after cudaDeviceSynchronize().
- Print result to screen.
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <iostream>
// test modified values after thrust::copy before and after cudaDeviceSynchronize
int main(int argc, char **argv)
{
unsigned int length = 99999;
unsigned int* d_inVal;
// alocate device buffer, fill with 0s:
if(cudaSuccess != cudaMalloc((void **) &d_inVal, length * sizeof(unsigned int))){
std::cout << "cudaMalloc err\n";
}
cudaMemset(d_inVal, 0, length * sizeof(unsigned int));
// alocate hostBuf fill with 1s::
unsigned int* hostBuf;
cudaHostAlloc((void**)&hostBuf, length*sizeof(unsigned int), cudaHostAllocPortable);
for(unsigned int idx =0; idx < length;idx++){
hostBuf[idx] = 1;
}
// copy to host buffer:
thrust::device_ptr<unsigned int> d_inVPtr(d_inVal);
thrust::copy(d_inVPtr, d_inVPtr + length, hostBuf);
// if it were synchronous, buffer was all 0s:
unsigned int cnt = 0;
for(int idx =length -1 ; idx >= 0;idx--){
if(hostBuf[idx] != 0){
cnt++;
}
}
std::cout << "mismatches before sync = " << cnt << "\n";
// Now sync an test again:
cudaDeviceSynchronize();
cnt = 0;
for(unsigned int idx =0; idx < length;idx++){
if(hostBuf[idx] != 0){
cnt++;
}
}
std::cout << "mismatches after sync = " << cnt << "\n";
return 0;
}
compile:
nvcc -arch=compute_52 copy.cu -o run
execute:
./run
resuts with cuda7.0:
mismatches before sync = 7879
mismatches after sync = 0
resuts with cuda6.5:
mismatches before sync = 0
mismatches after sync = 0
Would be grateful to get your advice about this.
Ronen Halevy.