Using thrust on device data: "unimplemented for this system"

My goal is to use thrust on device data to benefit, for free, from the advanced pipelined memory prefetching explained in "CUDA Techniques to Maximize Memory Bandwidth and Hide Latency" [S72683].

My typical use case is strided 2D data, so I have to create an iterator adaptor that maps N linear indices onto the 2D strided layout.

The code below is my attempt to do so (with a simple square function), but it results in "unimplemented for this system" when calling transform().

I understand that it must be a default template instantiation that does not match any "valid" instantiation, but I really don't understand what I am doing wrong.

Do I misunderstand how thrust is supposed to be used?

  const int N = src_gpu.size().area();

  const int srcElemsPerRow = static_cast<int>(src_gpu.cols);
  const int srcStrideInElements = static_cast<int>(src_gpu.step1());

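  // Map the linear index i onto the strided 2D layout:
  // row = i / srcElemsPerRow, col = i % srcElemsPerRow, offset = row * stride + col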
  auto srcBegin = thrust::make_permutation_iterator(
    thrust::device_pointer_cast(src_gpu.ptr<float>(0)),
    thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
      cuda::proclaim_copyable_arguments(
        [srcElemsPerRow, srcStrideInElements] __host__ __device__ (int i) {
          return (i/srcElemsPerRow)*srcStrideInElements+(i%srcElemsPerRow);
        }
      )
    )
  );

  const int dstElemsPerRow = static_cast<int>(dst_gpu.cols);
  const int dstStrideInElements = static_cast<int>(dst_gpu.step1());
  auto dstBegin = thrust::make_permutation_iterator(
    thrust::device_pointer_cast(dst_gpu.ptr<float>(0)),
    thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
      cuda::proclaim_copyable_arguments(
        [dstElemsPerRow, dstStrideInElements] __host__ __device__ (int i) {
          return (i/dstElemsPerRow)*dstStrideInElements+(i%dstElemsPerRow);
        }
      )
    )
  );

  thrust::transform(thrust::cuda::par.on(cudaStream), srcBegin, srcBegin+N, dstBegin, [] __host__ __device__ (const float& x) {return x*x;});

Please provide a complete minimal example and compilation command required to reproduce the error.

Leaving that aside, I doubt that thrust::transform would use async data loading in your case, because the inputs are neither pointers nor contiguous (and the transform op does not proclaim copyable arguments).
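For illustration, a minimal sketch of what proclaiming copyable arguments on the transform op itself could look like (this assumes a CCCL version that provides cuda::proclaim_copyable_arguments, and reuses the iterators and stream from the snippet above):

  // Hypothetical sketch, not a verified fix: wrap the transform op itself with
  // cuda::proclaim_copyable_arguments so its argument is taken by value.
  auto squareOp = cuda::proclaim_copyable_arguments(
    [] __host__ __device__ (float x) { return x * x; });

  thrust::transform(thrust::cuda::par.on(cudaStream),
                    srcBegin, srcBegin + N, dstBegin, squareOp);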

Here is the code without OpenCV structures.
It's just a 2D array with width*height elements, but with a row stride.

const int width = 111;
const int strideInElements = 128;
const int height = 256;
const int N = width*height; // number of logical elements, used by the transform below
float* src = nullptr;
float* dst = nullptr;
cudaMalloc(&src, height*strideInElements*sizeof(float));
cudaMalloc(&dst, height*strideInElements*sizeof(float));

const int srcElemsPerRow = width;
const int srcStrideInElements = strideInElements;
auto srcBegin = thrust::make_permutation_iterator(
  thrust::device_pointer_cast(src),
  thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
    cuda::proclaim_copyable_arguments(
      [srcElemsPerRow, srcStrideInElements] __host__ __device__ (int i) {
        return (i/srcElemsPerRow)*srcStrideInElements+(i%srcElemsPerRow);
      }
    )
  )
);

const int dstElemsPerRow = width;
const int dstStrideInElements = strideInElements;
auto dstBegin = thrust::make_permutation_iterator(
  thrust::device_pointer_cast(dst),
  thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
    cuda::proclaim_copyable_arguments(
      [dstElemsPerRow, dstStrideInElements] __host__ __device__ (int i) {
        return (i/dstElemsPerRow)*dstStrideInElements+(i%dstElemsPerRow);
      }
    )
  )
);

thrust::transform(thrust::cuda::par.on(cudaStream), srcBegin, srcBegin+N, dstBegin,
  [] __host__ __device__ (const float& x) {return x*x;}
);

I haven’t watched the GTC session, but I gather that cuda::proclaim_copyable_arguments() is central to your question. Nevertheless I will point out that my attempt to create a test case on CUDA 12.8.1 did not turn up any compile errors:

$ cat t5.cu
#include <thrust/transform.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <cuda/functional>
#include <thrust/device_ptr.h>

int main(){
  float *src_gpu = NULL;
  float *dst_gpu = NULL;
  cudaStream_t cudaStream;
  cudaStreamCreate(&cudaStream);
  const int srcStrideInElements = static_cast<int>(64);
  const int srcElemsPerRow = static_cast<int>(32);
  const int N = 256;

  auto srcBegin = thrust::make_permutation_iterator(
    thrust::device_pointer_cast(src_gpu),
    thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
     // cuda::proclaim_copyable_arguments(
        [srcElemsPerRow, srcStrideInElements] __host__ __device__ (int i) {
          return (i/srcElemsPerRow)*srcStrideInElements+(i%srcElemsPerRow);
        }
    //  )
    )
  );

  const int dstElemsPerRow = static_cast<int>(32);
  const int dstStrideInElements = static_cast<int>(64);

  auto dstBegin = thrust::make_permutation_iterator(
    thrust::device_pointer_cast(dst_gpu),
    thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
   //   cuda::proclaim_copyable_arguments(
        [dstElemsPerRow, dstStrideInElements] __host__ __device__ (int i) {
          return (i/dstElemsPerRow)*dstStrideInElements+(i%dstElemsPerRow);
        }
     // )
    )
  );

  thrust::transform(thrust::cuda::par.on(cudaStream), srcBegin, srcBegin+N, dstBegin, [] __host__ __device__ (const float& x) {return x*x;});
}

$ nvcc -o t5 t5.cu -arch=sm_75  --extended-lambda
$

I have commented out the cuda::proclaim_copyable_arguments. That is apparently a CUDA 12.9 feature and is not available in my default install of CUDA 12.8.1.

So I guess I would need to install the latest CCCL or something like that, which I have not done.

I am stupid: I was not compiling a .cu file.
When using nvcc, things get better (I still can't get it to work, but there are no more compilation errors).

For information, I do use the latest CCCL, because 12.8.1 does not support cuda::proclaim_copyable_arguments and I thought that was the problem.
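For reference, one way to point nvcc at a CCCL checkout instead of the headers bundled with the toolkit looks roughly like this (the clone path is just a placeholder; the -I directories follow the layout of the CCCL GitHub repository):

$ nvcc -o t5 t5.cu -arch=sm_75 --extended-lambda \
    -I/path/to/cccl/thrust -I/path/to/cccl/cub -I/path/to/cccl/libcudacxx/include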