nppiResize_8u_C3R in CUDA 10.1 produces a wrong result

Hi, I want to downsample an image with the nppiResize_8u_C3R function. I use OpenCV 3.4.3 to load the image, copy the host data to the device to do the resize computation, and finally copy the device data back to the host and save the resized image to disk. My problem is that when I set the resize scale to 1 (same size as the source image), the last few lines of the resized image are black; when I set the resize scale to 0.5 (half the width and half the height of the source image), the resized image is very different from the source image. I run my code on Windows 10 with an NVIDIA RTX 2080 Ti. My code is below. Could you help me check whether there are any errors in it? I didn't find a place to attach my source image and the resized images for scales 1 and 0.5; if you need them, I can email them. Thank you.

// cudaNPPI.cpp : Defines the entry point for the console application.
//

// ConsoleApplication1.cpp : main project file.

#include "stdafx.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>

#include <cuda_runtime.h>
#include <npp.h>
#include <nppcore.h>
#include <nppi.h>
#include <nppi_geometry_transforms.h>
#include <npps.h>

#include <opencv2\opencv.hpp>
//#include "Exceptions.h"

using namespace cv;
using namespace std;

int main(int argc, char* argv)
{
cudaError_t cuRet;
NppStatus nppRet;
unsigned char* pSrcData;
unsigned char* pDstData;
Npp8u* pSrcDataCUDA = nullptr;
Npp8u* pDstDataCUDA = nullptr;
NppiSize oSrcSize;
NppiSize oDstSize;
NppiRect oSrcROI;
NppiRect oDstROI;
int nSrcPitch;
int nDstPitch;
int nSrcPitchCUDA;
int nDstPitchCUDA;

/* load src image */
Mat pSrcBmp = imread("src.jpg");

pSrcData = pSrcBmp.data;

oSrcSize.width = pSrcBmp.cols;
oSrcSize.height = pSrcBmp.rows;
nSrcPitch = pSrcBmp.cols * 3;
cout << nSrcPitch << endl;
oSrcROI.x = oSrcROI.y = 0;
oSrcROI.width = oSrcSize.width;
oSrcROI.height = oSrcSize.height;
float nScaleFactor = 1;// 0.5 resize scale
//set gpu number
cuRet = cudaSetDevice(0);
assert(cuRet == cudaSuccess);

/*allocate memory */
pSrcDataCUDA = nppiMalloc_8u_C3(oSrcSize.width, oSrcSize.height, &nSrcPitchCUDA);
assert(pSrcDataCUDA != NULL);

/* copy src image data to gpu memory */
cudaMemcpy2D(pSrcDataCUDA, nSrcPitchCUDA, pSrcData, nSrcPitch, oSrcSize.width * 3, oSrcSize.height, cudaMemcpyHostToDevice);

/*compute resized ROI*/
nppiGetResizeRect(oSrcROI, &oDstROI, nScaleFactor, nScaleFactor, 0, 0, NPPI_INTER_CUBIC);
oDstSize.width = oDstROI.width;
oDstSize.height = oDstROI.height;

/* create image to store resized image */
Mat pDstBmp(oDstSize.height, oDstSize.width, CV_8UC3);
pDstData = pDstBmp.data;
nDstPitch = pDstBmp.cols * 3;

/* allocate gpu memory for resized image */
pDstDataCUDA = nppiMalloc_8u_C3(oDstSize.width, oDstSize.height, &nDstPitchCUDA);
assert(pDstDataCUDA != NULL);
cudaMemset2D(pDstDataCUDA, nDstPitchCUDA, 0, oDstSize.width * 3, oDstSize.height);

/* resize */
nppRet = nppiResize_8u_C3R(pSrcDataCUDA, oSrcSize.width * 3, oSrcSize, oSrcROI,
		pDstDataCUDA, oDstSize.width * 3, oDstSize, oDstROI, NPPI_INTER_LINEAR);
assert(nppRet == NPP_NO_ERROR);

cudaMemcpy2D(pDstData, nDstPitch, pDstDataCUDA, nDstPitchCUDA, oDstSize.width * 3, oDstSize.height, cudaMemcpyDeviceToHost);
imwrite("resized.jpg", pDstBmp);

nppiFree(pSrcDataCUDA);
nppiFree(pDstDataCUDA);

cudaDeviceReset();
return 0;

}

src.jpg
resized-scale0.5.jpg
resized-scale1.jpg

src.jpg

resized-scale0.5.jpg

resized-scale1.jpg