Have written C++-ish wrappers for the CUDA Runtime API - wanna try them out?

Could I get an example of how to use Managed Memory and a non-default stream? I am using a Jetson Xavier, which has memory physically shared between the GPU and the CPU. I was initially using zero-copy memory, but switched to using Managed Memory per the recommendation here:

Then I have a pipeline of kernels based on this recommendation:

Overall, I want the cuda-api-wrapper equivalent of:

// Allocate input/output in managed memory; intermediates live device-only.
// The input starts attached to the host (cudaMemAttachHost) so the CPU can
// populate it before the pipeline runs.
// FIX: each allocation call needs the ADDRESS of the pointer variable
// ((void**)&ptr) — passing the (uninitialized) pointer value itself is UB.
// NOTE(review): every call below returns cudaError_t; check it in real code.
cudaMallocManaged( (void**)&input_raw_ptr, num_bytes, cudaMemAttachHost );
cudaMalloc( (void**)&intermediate_data1_raw_ptr, num_immed_bytes );
cudaMalloc( (void**)&intermediate_data2_raw_ptr, num_immed_bytes );
cudaMallocManaged( (void**)&output_raw_ptr, num_output_bytes );

// ... host code fills input_raw_ptr here ...
// No cudaMemcpy HtoD needed.

// The input was attached to the host at allocation time; hand it over to this
// stream before any kernel touches it. Required on devices without
// concurrentManagedAccess (which includes Jetson/Tegra) — without this the
// kernels may not legally access the buffer.
cudaStreamAttachMemAsync(stream, input_raw_ptr, 0, cudaMemAttachSingle);

// Three-stage pipeline, all on the same non-default stream, so the stages are
// ordered without any host-side synchronization between them.
// FIX: kernel arguments now use the same names the buffers were allocated
// under (the original mixed "intermediate_dataN" and "immed_dataN").
// NOTE(review): "blocks_per_thread" is presumably meant to be the grid size
// (blocks per grid) — confirm the variable name at its declaration.
my_kernel1<<< blocks_per_thread, threads_per_block, 0, stream >>>( intermediate_data1_raw_ptr, input_raw_ptr, num_vals );
my_kernel2<<< blocks_per_thread, threads_per_block, 0, stream >>>( intermediate_data2_raw_ptr, intermediate_data1_raw_ptr, num_immed_vals );
my_kernel3<<< blocks_per_thread, threads_per_block, 0, stream >>>( output_raw_ptr, intermediate_data2_raw_ptr, num_output_vals );

// Return the output buffer to the host side, then drain the stream; after the
// synchronize the CPU may read output_raw_ptr directly.
cudaStreamAttachMemAsync(stream, output_raw_ptr, 0, cudaMemAttachHost);
cudaStreamSynchronize(stream);

// No cudaMemcpy DtoH needed.

// Use output_raw_ptr data in host code