How to convert YUV to RGB using CUDA NPP?

I am trying to use CUDA NPP to convert a YUV NV12 image to BGR (HWC layout), but the conversion result is wrong.
Please tell me how to use NPP for correct conversion.
Below is my code.

#include <cuda_runtime.h>
#include <iostream>
#include <npp.h>
#include <nppi_color_conversion.h>
#include <opencv2/opencv.hpp>
#include <vector>

// Fills `vec` in place with a deterministic test pattern: the element at
// position k receives k % 127 + 128, so values cycle through 128..254.
inline void InitInput(std::vector<unsigned char> &vec)
{
    int position = 0;
    for (unsigned char &cell : vec)
    {
        cell = position % 127 + 128;
        ++position;
    }
}

int main()
{
    int width = 200, height = 600;
    std::vector<unsigned char> input_y(width * height);
    std::vector<unsigned char> input_uv(width * height / 2);
    InitInput(input_y);
    InitInput(input_uv);
    std::vector<unsigned char> input_yuv;
    input_yuv.insert(input_yuv.end(), input_y.begin(), input_y.end());
    input_yuv.insert(input_yuv.end(), input_uv.begin(), input_uv.end());
    cv::Mat src(height * 3 / 2, width, CV_8UC1, input_yuv.data());

    Npp8u *d_src[2], *d_dst;
    cudaMalloc(&d_src[0], height * width * sizeof(Npp8u));
    cudaMalloc(&d_src[1], height * width / 2 * sizeof(Npp8u));
    cudaMalloc(&d_dst, height * width * 3 * sizeof(Npp8u));

    cudaMemcpy(d_src[0], input_y.data(), width * height * sizeof(Npp8u), cudaMemcpyHostToDevice);
    cudaMemcpy(d_src[1], input_uv.data(), width * height / 2 * sizeof(Npp8u), cudaMemcpyHostToDevice);

    NppiSize npp_size = {width, height};
    nppiNV12ToBGR_8u_P2C3R(d_src, width, d_dst, width * 3, npp_size);

    cv::Mat npp_res(height, width, CV_8UC3);
    cudaMemcpy(npp_res.data, d_dst, width * height * 3, cudaMemcpyDeviceToHost);

    cv::Mat opencv_res(height, width, CV_8UC3);
    cv::cvtColor(src, opencv_res, cv::ColorConversionCodes::COLOR_YUV2BGR_NV21);

    int step = height * width * 3;
    int cnt = 0;
    for (int i = 0; i < width * height * 3; i++)
    {
        int npp_num = static_cast<int>(npp_res.data[i]);
        int opencv_num = static_cast<int>(opencv_res.data[i]);
        if (fabs(npp_num - opencv_num) >= 1.0f)
        {
            cnt++;
        }
    }
    std::cout << "cnt " << cnt << std::endl;
    std::cout << "YUV_I420_To_BGR_HWC int8,error rate: "
              << (float)cnt / step * 100 << "%" << std::endl;

    cudaFree(d_src);
    cudaFree(d_dst);

    return 0;
}

The error rate is near 55%.
Thank you very much!

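NV12 and NV21 are not the same format. Both store a full-resolution Y plane followed by an interleaved, subsampled chroma plane, but NV12 orders the chroma bytes U,V while NV21 orders them V,U. The NPP call and the OpenCV reference must decode the same layout: for data passed to nppiNV12ToBGR_8u_P2C3R, use cv::COLOR_YUV2BGR_NV12 rather than cv::COLOR_YUV2BGR_NV21 in cv::cvtColor.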