How to copy an image correctly using cudaMemcpy2DAsync

I also posted this on GitHub Discussions: how to copy an image correctly using cudaMemcpy2DAsync · NVIDIA/VideoProcessingFramework · Discussion #529 · GitHub.

Something weird happened when I tried to copy an image from device to device.

First, I decoded a yuv420p image using PyAV and saved it into an .npy file:

import av
import numpy as np


def test_decoder():
    fpath = r"D:\Downloads\b1_480p.mp4"
    container = av.open(fpath)
    for packet in container.demux(video=0):
        for frame in packet.decode():
            print(frame)
            # for a yuv420p frame, to_ndarray() returns a (height * 3 // 2, width) uint8 array
            np.save("pyav_yuv420p", frame.to_ndarray())
            return
    return

Then I loaded that .npy file onto the GPU using PyTorch:

import numpy as np
import torch


def yuv420npy_to_tensor():
    yuv420p_img = np.load("pyav_yuv420p.npy")
    tensor = torch.from_numpy(yuv420p_img)
    print(tensor.shape, tensor.device)
    tensor = tensor.cuda()
    print(tensor.shape, tensor.device)
    torch.save(tensor, "pyav_yuv420p.pt")
    return

As far as I know, yuv420p means the Y samples are stored in the first plane, the U samples in the second plane, and the V samples in the third plane. So in a contiguous memory space it looks like the following: assuming a 1920x1080 image, the first 1920x1080 bytes hold the Y plane, the next 960x540 bytes hold the U samples, followed by the V samples in the last 960x540 bytes.

And note that 960x540 == 1920x270, which is why the U and V planes each take up 270 rows of the (1620, 1920) array.


    Y Y Y Y Y Y
    Y Y Y Y Y Y
    Y Y Y Y Y Y    plane 1
    Y Y Y Y Y Y
    U U U U U U    plane 2
    V V V V V V    plane 3

    -------------  ------------  ------------
    y: 1920x1080   u: 1920x270   v: 1920x270
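
For concreteness, here is a quick sanity check of those sizes and offsets in plain Python (using the 1920x1080 example from above):

    # plane sizes and offsets inside a contiguous yuv420p buffer (1920x1080 example)
    w, h = 1920, 1080
    y_size = w * h                # 2073600 bytes
    u_size = (w // 2) * (h // 2)  # 518400 bytes == 1920 * 270
    v_size = u_size

    u_offset = y_size             # U starts right after Y
    v_offset = y_size + u_size    # V starts right after U

    assert u_size == 1920 * 270
    print(y_size, u_size, v_size, u_offset, v_offset)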

So to construct a YUV420P surface, we just copy the Y plane, U plane, and V plane separately. The key is to calculate the pointer addresses of the Y, U, and V planes.

We can obtain those three pointers by splitting the tensor like this:


    yuv420p_tensor = torch.load("pyav_yuv420p.pt")
    w, h = yuv420p_tensor.shape[1], int(yuv420p_tensor.shape[0] // 1.5)
    print(w, h)
    # rows [0, h) are Y, rows [h, h + h//4) are U, rows [h + h//4, h + h//2) are V
    y_tensor, uv_tensor = yuv420p_tensor.split(h, 0)
    u_tensor, v_tensor = uv_tensor.split(h//4, 0)
    y_ptr = y_tensor.data_ptr()
    u_ptr = u_tensor.data_ptr()
    v_ptr = v_tensor.data_ptr()
    print("!", y_tensor.shape, u_tensor.shape, v_tensor.shape)
    print("#", u_ptr - y_ptr, v_ptr - u_ptr)

The statement u_tensor, v_tensor = uv_tensor.split(h//4, 0) splits the tensor like this:

   y_ptr    Y Y Y Y Y Y
            Y Y Y Y Y Y
            Y Y Y Y Y Y
            Y Y Y Y Y Y

   u_ptr    U U U U U U    1920x270
   v_ptr    V V V V V V    1920x270
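
With this split all three tensors stay contiguous, and the two offsets printed above should be exactly one full Y plane and one full U plane. A quick check, continuing with the variables from the snippet above:

    # with the dim-0 split, Y/U/V stay contiguous and the pointer gaps are whole planes
    assert u_tensor.is_contiguous() and v_tensor.is_contiguous()
    assert u_ptr - y_ptr == w * h           # one full Y plane
    assert v_ptr - u_ptr == w * (h // 4)    # one full U plane (e.g. 1920 x 270)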

And to copy the image, we simply exposed cudaMemcpy2DAsync to the Python interface:

// Thin device-to-device wrapper around cudaMemcpy2D / cudaMemcpy2DAsync.
// width is in bytes (only elem_size == 1 is supported) and height is in rows;
// str is an optional CUDA stream handle (0 falls back to the blocking cudaMemcpy2D).
void PycudaMemcpy2DAsync(CUdeviceptr src_ptr, uint32_t src_pitch, uint32_t width, uint32_t height,
                         CUdeviceptr dst_ptr, uint32_t dst_pitch, uint32_t elem_size, size_t str = 0U)
{
  if (elem_size != 1) {
    std::stringstream ss;
    ss << __FUNCTION__;
    ss << ": only torch::kUInt8 data type is supported";
    throw std::runtime_error(ss.str());
  }
  // stream given: asynchronous, stream-ordered copy; otherwise: synchronous copy
  auto res = str ? cudaMemcpy2DAsync(
                       (void*)dst_ptr, dst_pitch, (const void*)src_ptr, src_pitch, width,
                       height, cudaMemcpyDeviceToDevice, (cudaStream_t)str)
                 : cudaMemcpy2D((void*)dst_ptr, dst_pitch, (const void*)src_ptr, src_pitch,
                                width, height, cudaMemcpyDeviceToDevice);
  if (cudaSuccess != res) {
    std::stringstream ss;
    ss << __FUNCTION__;
    ss << ": failed to copy data from src ptr to dst ptr. CUDA error code: ";
    ss << res;
    throw std::runtime_error(ss.str());
  }
}

Then I copied from those three pointers, setting the pitches like this:

    # source pitches (in bytes): the planes are contiguous, so pitch == row width
    y_pitch = w
    u_pitch = w // 2
    v_pitch = w // 2
    yuv420_surface = nvc.Surface.Make(nvc.PixelFormat.YUV420, w, h, gpu_id=0)
    y_plane = yuv420_surface.PlanePtr(0)
    u_plane = yuv420_surface.PlanePtr(1)
    v_plane = yuv420_surface.PlanePtr(2)
    # copy each plane separately: (src_ptr, src_pitch, width, height, dst_ptr, dst_pitch, elem_size, stream)
    # cuda_stream_handler is a CUDA stream handle (the combined function below just passes 0)
    nvc.PycudaMemcpy2DAsync(y_ptr, y_pitch, w,    h,    y_plane.GpuMem(), y_plane.Pitch(), 1, cuda_stream_handler)
    nvc.PycudaMemcpy2DAsync(u_ptr, u_pitch, w//2, h//2, u_plane.GpuMem(), u_plane.Pitch(), 1, cuda_stream_handler)
    nvc.PycudaMemcpy2DAsync(v_ptr, v_pitch, w//2, h//2, v_plane.GpuMem(), v_plane.Pitch(), 1, cuda_stream_handler)

And finally, I combined all of it into one function:

import cv2 as cv
import torch
import PyNvCodec as nvc          # locally built VPF module that exposes PycudaMemcpy2DAsync
import PytorchNvCodec as pnvc    # torch interop module (source of DptrToTensor in this setup)


def run_yuv420p_copy_conversion():
    yuv420p_tensor = torch.load("pyav_yuv420p.pt")
    w, h = yuv420p_tensor.shape[1], int(yuv420p_tensor.shape[0] // 1.5)
    print(w, h)
    y_tensor, uv_tensor = yuv420p_tensor.split(h, 0)
    u_tensor, v_tensor = uv_tensor.split(h//4, 0)
    y_ptr = y_tensor.data_ptr()
    u_ptr = u_tensor.data_ptr()
    v_ptr = v_tensor.data_ptr()
    y_pitch = w
    u_pitch = w // 2
    v_pitch = w // 2
    print("!", y_tensor.shape, u_tensor.shape, v_tensor.shape)
    print("#", u_ptr - y_ptr, v_ptr - u_ptr)
    # copy the three planes into the surface
    yuv420_surface = nvc.Surface.Make(nvc.PixelFormat.YUV420, w, h, gpu_id=0)
    y_plane = yuv420_surface.PlanePtr(0)
    u_plane = yuv420_surface.PlanePtr(1)
    v_plane = yuv420_surface.PlanePtr(2)
    nvc.PycudaMemcpy2DAsync(y_ptr, y_pitch, w,    h,    y_plane.GpuMem(), y_plane.Pitch(), 1, 0)
    nvc.PycudaMemcpy2DAsync(u_ptr, u_pitch, w//2, h//2, u_plane.GpuMem(), u_plane.Pitch(), 1, 0)
    nvc.PycudaMemcpy2DAsync(v_ptr, v_pitch, w//2, h//2, v_plane.GpuMem(), v_plane.Pitch(), 1, 0)
    # convert YUV420 -> BGR
    nv_cvt = nvc.PySurfaceConverter(w, h,
                                    nvc.PixelFormat.YUV420,
                                    nvc.PixelFormat.BGR,
                                    gpu_id=0,
                                    )
    cc_ctx = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG)
    surface = nv_cvt.Execute(yuv420_surface, cc_ctx)
    # wrap the BGR surface in a torch tensor and show it
    surf_plane = surface.PlanePtr()
    bgr_tensor = pnvc.DptrToTensor(
        surf_plane.GpuMem(),
        surf_plane.Width(),
        surf_plane.Height(),
        surf_plane.Pitch(),
        surf_plane.ElemSize(),
    )
    print(bgr_tensor.shape)
    bgr_ndarray = bgr_tensor.cpu().numpy().reshape(h, w, 3)
    print(bgr_ndarray.shape)
    cv.imshow("sda", bgr_ndarray)
    cv.waitKey()
    return

And it worked perfectly.

BUT, the problem is that when I changed the statement u_tensor, v_tensor = uv_tensor.split(h//4, 0) to u_tensor, v_tensor = uv_tensor.split(w//2, 1), it worked as well, even though it shouldn't.

The statement u_tensor, v_tensor = uv_tensor.split(w//2, 1) splits the U and V region down the middle into two parts. It assumes the left half of size 960x540 contains the U samples and the right half of size 960x540 contains the V samples:

   y_ptr    Y Y Y Y Y Y
            Y Y Y Y Y Y
            Y Y Y Y Y Y
            Y Y Y Y Y Y

   u_ptr    U U U      v_ptr    V V V
   960x540  U U U      960x540  V V V
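
For reference, here is what that split actually produces on the tensor side: both halves are non-contiguous views, and their data pointers are only w//2 bytes apart (a quick check against the same .pt file):

    import torch

    # check what the dim-1 split produces
    yuv420p_tensor = torch.load("pyav_yuv420p.pt")
    w, h = yuv420p_tensor.shape[1], int(yuv420p_tensor.shape[0] // 1.5)
    _, uv_tensor = yuv420p_tensor.split(h, 0)
    u_tensor, v_tensor = uv_tensor.split(w // 2, 1)
    print(u_tensor.is_contiguous(), v_tensor.is_contiguous())  # False False
    print(v_tensor.data_ptr() - u_tensor.data_ptr())           # == w // 2 (one byte per element)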

And according to NVIDIA's documentation, cudaMemcpy2DAsync copies a block of memory just as cuMemcpyHtoDAsync does, except that it takes the pitch into account.
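
Here is my mental model of that pitched copy, sketched in plain Python (just how I understand the semantics, not the actual implementation):

    # rough model of a pitched 2D copy over byte buffers: `height` rows of `width`
    # bytes each, with the read/write offsets advancing by src_pitch / dst_pitch per row
    def memcpy_2d_model(dst: bytearray, dst_pitch: int,
                        src: bytes, src_pitch: int,
                        width: int, height: int) -> None:
        for row in range(height):
            d = row * dst_pitch
            s = row * src_pitch
            dst[d:d + width] = src[s:s + width]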

So what did I miss?

Below is the pyav_yuv420p.pt file:

pyav_yuv420p.pt (601.2 KB)