Image batch resize with npp

I am having a problem with the nppiResizeBatch function. When I run my code (see below), I get the exception “./UtilNPP/ImageAllocatorsNPP.h:114: cudaSuccess == eResult assertion faild!”. When I run the non-batch version instead (currently commented out in the code below), everything works fine. I am using CUDA 10.2 on Ubuntu 16.04. Thank you very much for your help :-)

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#  pragma warning(disable:4819)
#endif

#include <ImagesCPU.h>
#include <ImagesNPP.h>
#include <ImageIO.h>
#include <Exceptions.h>

#include <string.h>
#include <fstream>
#include <iostream>

#include <cuda_runtime.h>
#include <npp.h>
#include <helper_string.h>
#include <helper_cuda.h>

int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", argv[0]);

    try
    {
        std::string sFilename, sResultFilename;

        sFilename = "Lena.pgm";
        sResultFilename = "Lena_resizeTest.pgm";

        // declare a host image object for an 8-bit grayscale image
        npp::ImageCPU_8u_C1 oHostSrc1, oHostSrc2;
        // load gray-scale image from disk
        npp::loadImage(sFilename, oHostSrc1);
        npp::loadImage(sFilename, oHostSrc2);
        // declare a device image and copy construct from the host image,
        // i.e. upload host to device
        npp::ImageNPP_8u_C1 oDeviceSrc1(oHostSrc1);
        npp::ImageNPP_8u_C1 oDeviceSrc2(oHostSrc2);
        NppiSize oSrcSize = {(int)oDeviceSrc1.width(), (int)oDeviceSrc1.height()};
        NppiRect oSrcROI = {0, 0, int(oDeviceSrc1.width()), int(oDeviceSrc1.height())};

        // allocate device image of appropriately reduced size
        npp::ImageNPP_8u_C1 oDeviceDst1(int(oSrcROI.width/2), int(oSrcROI.height/2));
        npp::ImageNPP_8u_C1 oDeviceDst2(int(oSrcROI.width/2), int(oSrcROI.height/2));
        NppiSize oDstSize = {int(oDeviceDst1.width()), int(oDeviceDst1.height())};
        NppiRect oDstROI = {0, 0, int(oDeviceDst1.width()), int(oDeviceDst1.height())};

        //const void *pSrc1, *pSrc2;
        const Npp8u *pSrc1, *pSrc2;
        pSrc1 = oDeviceSrc1.data();
        pSrc2 = oDeviceSrc2.data();

        int nSrcStep = oDeviceSrc1.pitch();

        //void *pDst1, *pDst2;
        Npp8u *pDst1, *pDst2;
        pDst1 = oDeviceDst1.data();
        pDst2 = oDeviceDst2.data();

        int nDstStep = oDeviceDst1.pitch();

        //NppiResizeBatchCXR *pBatchList = new(NppiResizeBatchCXR[2]);
        NppiResizeBatchCXR pBatchList[2];
        pBatchList[0].pSrc = pSrc1;
        pBatchList[0].nSrcStep = nSrcStep;
        pBatchList[0].pDst = pDst1;
        pBatchList[0].nDstStep = nDstStep;
        pBatchList[1].pSrc = pSrc2;
        pBatchList[1].nSrcStep = nSrcStep;
        pBatchList[1].pDst = pDst2;
        pBatchList[1].nDstStep = nDstStep;

        cudaDeviceSynchronize();

        NPP_CHECK_NPP(nppiResizeBatch_8u_C1R(oSrcSize,				//NppiSize 	oSmallestSrcSize,
                                             oSrcROI,				//NppiRect 	oSrcRectROI,
                                             oDstSize,				//NppiSize 	oSmallestDstSize,
                                             oDstROI,				//NppiRect 	oDstRectROI,
                                             NPPI_INTER_NN,			//int	 	eInterpolation,
                                             pBatchList,			//NppiResizeBatchCXR * 	pBatchList,
                                             2					//unsigned int 	nBatchSize 
                                             ));

        cudaDeviceSynchronize();


   /*	NPP_CHECK_NPP(nppiResize_8u_C1R(pSrc1,					//const Npp8u*	pSrc,
                                        nSrcStep,				//int		nSrcStep,
                                        oSrcSize,				//NppiSize	oSrcSize,
                                        oSrcROI,				//NppiRect 	oSrcRectROI,
                                        pDst1,					//Npp8u * 	pDst,
                                        nDstStep,				//int	 	nDstStep,
                                        oDstSize,				//NppiSize 	oDstSize,
                                        oDstROI,				//NppiRect 	oDstRectROI,
                                        NPPI_INTER_NN				//int 	eInterpolation 
                                        ));*/


        // declare a host image for the result
        npp::ImageCPU_8u_C1 oHostDst1(oDeviceDst1.size());
        // and copy the device result data into it
        oDeviceDst1.copyTo(oHostDst1.data(), oHostDst1.pitch());

        saveImage(sResultFilename, oHostDst1);
        std::cout << "Saved image: " << sResultFilename << std::endl;

        nppiFree(oDeviceSrc1.data());
        nppiFree(oDeviceSrc2.data());
        nppiFree(oDeviceDst1.data());
        nppiFree(oDeviceDst2.data());

        exit(EXIT_SUCCESS);
    }
    catch (npp::Exception &rException)
    {
        std::cerr << "Program error! The following exception occurred: \n";
        std::cerr << rException << std::endl;
        std::cerr << "Aborting." << std::endl;

        exit(EXIT_FAILURE);
    }
    catch (...)
    {
        std::cerr << "Program error! An unknow type of exception occurred. \n";
        std::cerr << "Aborting." << std::endl;

        exit(EXIT_FAILURE);
        return -1;
    }

    return 0;
}

Quoting the documentation:

The NppiResizeBatchCXR or NppiImageDescriptor and NppiResizeBatchROI_Advanced arrays must be in device memory.

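So in your case you need to copy the pBatchList array to device memory (allocate it with cudaMalloc, fill it with cudaMemcpy) and pass that device pointer to nppiResizeBatch_8u_C1R.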

Similarly, when calling nppiResizeBatch_8u_C3R_Advanced_Ctx, for example, you need to copy these 3 arrays to the device:

  • array of input images pointers
  • array of output crops pointers
  • array of rois

You can also look at the batched resize example call in the NPP library documentation.

Copying these arrays to the device will solve your problem.

1 Like