[closed]nppiRemap_8u_C3R function

Hi,

I want to remap an image. Here is my code:

int nImageWidth{ 640 };
    int nImageHeight{ 480 };
     int radius{ 240 };

    uint8_t * pOut{ new uint8_t[ nImageWidth * nImageHeight * 3 ] };
    YUV2RGB( pIn, pOut );
    //testsave pOut to bmp image OK


    int dstStepCUDA;
    Npp8u * pDstImgCUDA{ nppiMalloc_8u_C3( 4 * radius, 4 * radius, & dstStepCUDA) };
    NPP_ASSERT_NOT_NULL( pDstImgCUDA );    
    NppiSize dstRectCUDA{ 4 * radius, 4 * radius };

    //input image params from pOut
    NppiSize srcSize{ nImageWidth, nImageHeight };
    //square rect inside 4:3 image
    NppiRect srcRect{ ( nImageWidth - nImageHeight ) / 2, 0, nImageHeight, nImageHeight };

    //allocate host x, y maps
    Npp32f * pMapX{ new Npp32f[ 4 * radius * 4 * radius ] };
    Npp32f * pMapY{ new Npp32f[ 4 * radius * 4 * radius ] };
    //fill maps OK
    polar2LinearMaps( radius, radius, radius, 0.0, pMapX, pMapY );

    //allocate device x map
    Npp32f * pMapXCUDA = nppiMalloc_32f_C1(4 * radius, 4 * radius, & nMapPitchCUDA);
    NPP_ASSERT_NOT_NULL( pMapXCUDA );
    //upload cuda x map to device
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapXCUDA, nMapPitchCUDA, pMapX, 4 * radius * sizeof(Npp32f), 4 * radius, 4 * radius, cudaMemcpyHostToDevice );
    );

    //allocate device y map
    Npp32f * pMapYCUDA = nppiMalloc_32f_C1(4 * radius, 4 * radius, & nMapPitchCUDA);
    NPP_ASSERT_NOT_NULL( pMapYCUDA );
    //upload cuda y map to device
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapYCUDA, nMapPitchCUDA, pMapY, 4 * radius * sizeof(Npp32f), 4 * radius, 4 * radius, cudaMemcpyHostToDevice );
    );

Output is OK:

cudaSetDevice GPU 0 = GK20A
NPP Library Version 6.5.34
CUDA Driver Version: 6.5
CUDA Runtime Version: 6.5
Device 0: < GK20A >, Compute SM 3.2 detected
0 3314.55 ms

But if I apply the remap function, Ubuntu freezes:

// Run the remap and print the NppStatus it returns.
try{

    // NOTE(review): pOut is the host buffer allocated with new[] in the first snippet;
    // the reply below says the source image pointer must point to device memory.
    qDebug() << nppiRemap_8u_C3R(
        pOut, srcSize, nImageWidth * 3, srcRect,
        pMapXCUDA, nMapPitchCUDA,
        pMapYCUDA, nMapPitchCUDA,
        pDstImgCUDA, dstStepCUDA, dstRectCUDA,
        NPPI_INTER_NN
    );
    } catch( int a ){
        // Only exceptions thrown as int are caught here.
        qDebug() <<  "Caught exception number:  " << a;
        return;

    }

Please help me understand my issue.

Best regards Viktor.

I’m pretty sure in nppiRemap_xxxxxx the source image pointer is supposed to point to device memory. But your source image pointer (pOut) points to host memory:

uint8_t * pOut{ new uint8_t[ nImageWidth * nImageHeight * 3 ] };

    qDebug() << nppiRemap_8u_C3R(
        pOut...

Thank you very much! Yes, it was my mistake. Now there are no crashes and the status returned by the remap function is 0.

But now only black bitmaps are returned.

I have tried different map sizes and output CUDA arrays, but I always get black pixels.

Do you have any ideas?

Read the documentation.
The remap function will return black pixels under some circumstances.

Please point me to the proper manual for the NPP API.
I read the section of NPP_Library.pdf about the remap function, and it contains only function definitions and a few words about the rect.
I have read and tried the OpenCV and Intel IPP remap docs with examples, and those work fine.

Here is my code:

static void processImage( const void * pIn, int size ) {

	//convert input YUV buffer to RGB buffer
    Npp8u * pOut{ new Npp8u[ nImageWidth * nImageHeight * 3 ] };
    YUV2RGB( pIn, pOut );
    QImage i( pOut, nImageWidth, nImageHeight, QImage::Format_RGB888 );
    //i.save( "/home/ubuntu/test/testx" + QString::number( count ) + ".bmp" ); <-- full image OK

	//create source device buffer and fill it with RGB buffer 
    int srcStepCUDA;
    Npp8u * pSrcImgCUDA{ nppiMalloc_8u_C3( nImageWidth, nImageHeight, & srcStepCUDA) };
    NPP_ASSERT_NOT_NULL( pSrcImgCUDA );
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pSrcImgCUDA, srcStepCUDA, pOut, nImageWidth * 3, nImageWidth, nImageHeight, cudaMemcpyHostToDevice );
    );

	//create destination device buffer
    int w{ 2 * nImageHeight };
    int h{ nImageHeight / 2 };		
    int dstStepCUDA;
    Npp8u * pDstImgCUDA{ nppiMalloc_8u_C3( w, h, & dstStepCUDA) };
    NPP_ASSERT_NOT_NULL( pDstImgCUDA );
    NppiSize dstRectCUDA{ w, h };

    NppiSize srcSize{ nImageWidth, nImageHeight };
    NppiRect srcRect{ ( nImageWidth - nImageHeight ) / 2, 0, nImageHeight, nImageHeight };
    //NppiRect srcRect{ 0, 0, nImageWidth, nImageHeight };

    // start timer
    struct timeval t1, t2;
    double elapsedTime;
    gettimeofday(&t1, NULL);

    NppStatus res = nppiRemap_8u_C3R(
        pSrcImgCUDA, srcSize, srcStepCUDA, srcRect,
        pMapXTopCUDA, nMapPitchCUDA,
        pMapYTopCUDA, nMapPitchCUDA,
        pDstImgCUDA, dstStepCUDA, dstRectCUDA,
        NPPI_INTER_LINEAR
    );

    qDebug() << "Status " << res;

    // stop timer
    gettimeofday(&t2, NULL);
    elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;      // sec to ms
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;   // us to ms
    qDebug() << "remap " << elapsedTime << " ms";

	//create RGB buffer and fill it from device destination buffer
    Npp8u * p{ new Npp8u[ w * h * 3 ] };
    NPP_CHECK_CUDA(
        cudaMemcpy2D( p, w * 3, pDstImgCUDA, dstStepCUDA, w, h, cudaMemcpyDeviceToHost )
    );
    QImage outImage( p, w, h, w * 3, QImage::Format_RGB888 );
    outImage.save( "/home/ubuntu/test/test" + QString::number( count ) + ".bmp" ); <-- 1/3 part of width is color and 2/3 part of width is black

    nppiFree( pDstImgCUDA );
    nppiFree( pSrcImgCUDA );
    delete [] pOut;
    delete [] p;
}

Now, if I don't apply the remap function, I get an image in which only 1/3 of the width contains color pixels.
If I use remap, I get a completely black image.

Here are 4 similar maps:

Npp32f * pMapXTop{ new Npp32f[ w * h ] };
    Npp32f * pMapYTop{ new Npp32f[ w * h ] };
    Npp32f * pMapXBottom{ new Npp32f[ w * h ] };
    Npp32f * pMapYBottom{ new Npp32f[ w * h ] };

    polar2LinearMaps( radius, xT, yT, 0.0, pMapXTop, pMapYTop );
    polar2LinearMaps( radius, xB, yB, 0.0, pMapXBottom, pMapYBottom );

    pMapXTopCUDA = nppiMalloc_32f_C1( w, h, & nMapPitchCUDA );
    NPP_ASSERT_NOT_NULL( pMapXTopCUDA );
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapXTopCUDA, nMapPitchCUDA, pMapXTop, w * sizeof(Npp32f), w, h, cudaMemcpyHostToDevice );
    );

    pMapYTopCUDA = nppiMalloc_32f_C1( w, h, & nMapPitchCUDA );
    NPP_ASSERT_NOT_NULL( pMapYTopCUDA );
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapYTopCUDA, nMapPitchCUDA, pMapYTop, w * sizeof(Npp32f), w, h, cudaMemcpyHostToDevice );
    );

    pMapXBottomCUDA = nppiMalloc_32f_C1( w, h, & nMapPitchCUDA );
    NPP_ASSERT_NOT_NULL( pMapXBottomCUDA );
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapXBottomCUDA, nMapPitchCUDA, pMapXBottom, w * sizeof(Npp32f), w, h, cudaMemcpyHostToDevice );
    );

    pMapYBottomCUDA = nppiMalloc_32f_C1( w, h, & nMapPitchCUDA );
    NPP_ASSERT_NOT_NULL( pMapYBottomCUDA );
    NPP_CHECK_CUDA(
        cudaMemcpy2D( pMapYBottomCUDA, nMapPitchCUDA, pMapYBottom, w * sizeof(Npp32f), w, h, cudaMemcpyHostToDevice );
    );

    delete [] pMapXTop;
    delete [] pMapYTop;
    delete [] pMapXBottom;
    delete [] pMapYBottom;

Could you please advise me where my issue is?

Also, maybe my Tegra 3 board has corrupted RAM or a bad driver?

Best regards Viktor.

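Solved. The width argument of cudaMemcpy2D must be given in bytes, not in pixels or elements (e.g. nImageWidth * 3 for the 8u C3 image and w * sizeof(Npp32f) for the maps). Thanks.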