Using nppiResizeBatch_8u_C3R causes exception: warp illegal address

I am modifying a working code base that originally uses nppiResize_8u_C3R so that it uses nppiResizeBatch_8u_C3R instead, for the advantages described here: NVIDIA 2D Image And Signal Performance Primitives (NPP): ResizeBatch. nppiResize_8u_C3R runs without error, but whenever the batched version is called with similar stride and pointer parameters it fails with CUDA_EXCEPTION_14 (Warp Illegal Address).
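
For reference, this is the API I am calling, paraphrased from the NPP documentation and nppi_geometry_transforms.h (the exact declaration may differ slightly between CUDA versions):

typedef struct
{
    const void *pSrc;  // source image (device memory)
    int nSrcStep;      // source image line step in bytes
    void *pDst;        // destination image (device memory)
    int nDstStep;      // destination image line step in bytes
} NppiResizeBatchCXR;

NppStatus nppiResizeBatch_8u_C3R(NppiSize oSmallestSrcSize, NppiRect oSrcRectROI,
                                 NppiSize oSmallestDstSize, NppiRect oDstRectROI,
                                 int eInterpolation, NppiResizeBatchCXR *pBatchList,
                                 unsigned int nBatchSize);

Code used below: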


#include <opencv2/opencv.hpp>
#include <nppi_geometry_transforms.h>
#include <cassert>
#include <iostream>
#include <vector>

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    cv::Mat image = cv::imread("./tests/data/persons.png");
    int w = 4000;
    int h = 4000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 2;

    std::vector<cv::Mat> images;
    for (int i = 0; i < batchSize; i++) {
        cv::Mat imgClone = image.clone();
//         imgClone.resize(1000 + i * 1000);
        images.emplace_back(imgClone);
    }
    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));
    {
        auto *inbuf = (uchar *)(m_inputBuffer);
        auto *outbuf = (uchar *)(m_resizedBuffer);

        int idx = 0;
        for (cv::Mat &image: images) {
            largestSize = getLargestSize(largestSize, {image.cols, image.rows});
            assert(image.elemSize() == 3);
            size_t N = image.rows * image.cols * image.elemSize();
            cudaMemcpyAsync(inbuf, image.data, N, cudaMemcpyHostToDevice, stream);

            mixedBufferBatch[idx].nSrcStep = image.cols * image.elemSize();// * image.rows;
            mixedBufferBatch[idx].nDstStep = dstStep;
            mixedBufferBatch[idx].pSrc = inbuf;
            mixedBufferBatch[idx].pDst = outbuf;
            inbuf += N;    // on GPU
            outbuf += (640 * 640 * m_channels);
            idx++;
        }
    }
    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "[Before Resize] ERROR OCCURED: " << err <<"\n";
        }
    }
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, mixedBufferBatch, images.size()
            );
//    NppStatus resultStatus =
//            nppiResize_8u_C3R((unsigned char *) (m_inputBuffer), largestSize.width * m_channels, largestSize, largestSizeRect,
//                              (unsigned char *) (m_resizedBuffer), dstStep, dstSize, dstRoi,
//                              NPPI_INTER_LINEAR);

//    The device encountered a load or store instruction on an invalid memory address.
//    This leaves the process in an inconsistent state and any further CUDA work will
//    return the same error. To continue using CUDA, the process must be terminated and relaunched.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cout << "[After resize] ERROR OCCURED: " << err <<"\n";
    }

    cv::Mat saveII(640, 640, CV_8UC3, cv::Scalar(10, 10, 10));
    cudaMemcpy(saveII.data, m_resizedBuffer, 640 * 640 * 3, cudaMemcpyDeviceToHost);
    (cudaStreamSynchronize(stream));
    cv::imwrite("saveII.png", saveII);


    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}

If you uncomment the nppiResize_8u_C3R section instead of using the nppiResizeBatch_8u_C3R call, you will see that it does not raise the 700 error (cudaErrorIllegalAddress) and correctly outputs a resized RGB image as expected. I would appreciate any input on this peculiar issue.

OpenCV-Free version:

#include <nppi_geometry_transforms.h>
#include <iostream>
typedef unsigned char uchar;

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    int w = 2000;
    int h = 2000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 4;

    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    cudaMemset(m_inputBuffer, 1, 5012);
    cudaMemset(m_inputBuffer, 2, batchSize * w * h * m_channels-5012);
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));

    cudaDeviceSynchronize();

    auto *inbuf = reinterpret_cast<uchar *>(m_inputBuffer);
    auto *outbuf = reinterpret_cast<uchar *>(m_resizedBuffer);

    for (int idx=0;idx<batchSize;idx++) {
        largestSize = getLargestSize(largestSize, {w, h});
        size_t N = w * h * m_channels;

        int srcStep = w * m_channels;
        mixedBufferBatch[idx].nSrcStep = srcStep;
        mixedBufferBatch[idx].nDstStep = dstStep;
        mixedBufferBatch[idx].pSrc = inbuf;
        mixedBufferBatch[idx].pDst = outbuf;
        inbuf += N;    // on GPU
        outbuf += (640 * 640 * m_channels);
        idx++;
    }
//CUDA_EXCEPTION_14 (Warp Illegal Address)
// cuda-gdb backtrace:
//#0  0x0000555559a339e0 in void resizeNaiveBatchBy2Rows<unsigned char, 3u, SampleLinearImageBatchBy2Rows<unsigned char,
// 3> >(SampleLinearImageBatchBy2Rows<unsigned char, 3>,
// float, float, nppc::ClipRect, NppiRect, NppiResizeBatchCXR*, unsigned int)<<<(20,320,1),(128,1,1)>>> ()
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, mixedBufferBatch, batchSize
            );
//    NppStatus resultStatus =
//            nppiResize_8u_C3R((uchar*)(m_inputBuffer), w * m_channels, largestSize, largestSizeRect,
//                      (uchar*)(m_resizedBuffer), 640 * 3, dstSize, dstRoi, NPPI_INTER_LINEAR);

    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "ERROR OCCURED: " << err <<"\n";
        }
    }
    if (resultStatus == NPP_RESIZE_NO_OPERATION_ERROR) {
        std::cout << "No operation! \n";
    }
    cudaDeviceSynchronize();

    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}

/*
add_executable(test_npp_resize ./npp_resize_batch.cu)
target_link_libraries(test_npp_resize
        PUBLIC
        ${CUDA_npp_LIBRARY}
        ${CUDA_nppi_LIBRARY}
        ${CUDA_LIBRARIES}
#        ${OpenCV_LIBS}
        )
*/
  1. According to the documentation, the argument passed for the pBatchList parameter must be a device pointer. Your code is not providing that; see the short sketch after this list.

  2. You should increment idx either in the body of the for-loop or in the for-loop statement, not both.
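
In isolation, fix number 1 amounts to putting the descriptor array in device memory and passing that pointer to the batched call; a minimal sketch (the full listing below does the same thing):

    NppiResizeBatchCXR *d_mixedBufferBatch;
    cudaMalloc(&d_mixedBufferBatch, sizeof(NppiResizeBatchCXR) * batchSize);
    cudaMemcpy(d_mixedBufferBatch, mixedBufferBatch,
               sizeof(NppiResizeBatchCXR) * batchSize, cudaMemcpyHostToDevice);
    // then pass d_mixedBufferBatch (not mixedBufferBatch) as pBatchList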

When I fix those 2 issues, your code runs without errors for me:

$ cat t2089.cu
#include <nppi_geometry_transforms.h>
#include <iostream>
typedef unsigned char uchar;

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    int w = 2000;
    int h = 2000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 4;

    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    cudaMemset(m_inputBuffer, 1, 5012);
    cudaMemset(m_inputBuffer, 2, batchSize * w * h * m_channels-5012);
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));

    cudaDeviceSynchronize();

    auto *inbuf = reinterpret_cast<uchar *>(m_inputBuffer);
    auto *outbuf = reinterpret_cast<uchar *>(m_resizedBuffer);

    for (int idx=0;idx<batchSize;idx++) {
        largestSize = getLargestSize(largestSize, {w, h});
        size_t N = w * h * m_channels;

        int srcStep = w * m_channels;
        mixedBufferBatch[idx].nSrcStep = srcStep;
        mixedBufferBatch[idx].nDstStep = dstStep;
        mixedBufferBatch[idx].pSrc = inbuf;
        mixedBufferBatch[idx].pDst = outbuf;
        inbuf += N;    // on GPU
        outbuf += (640 * 640 * m_channels);
//        idx++;
    }
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    // pBatchList must be a device pointer: copy the host-side descriptor array to the GPU
    NppiResizeBatchCXR *d_mixedBufferBatch;
    cudaMalloc(&d_mixedBufferBatch, sizeof(NppiResizeBatchCXR)*batchSize);
    cudaMemcpy(d_mixedBufferBatch, mixedBufferBatch, sizeof(NppiResizeBatchCXR)*batchSize, cudaMemcpyHostToDevice);
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, d_mixedBufferBatch, batchSize
            );

    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "ERROR OCCURED: " << err <<"\n";
        }
    }
    if (resultStatus == NPP_RESIZE_NO_OPERATION_ERROR) {
        std::cout << "No operation! \n";
    }
    cudaDeviceSynchronize();

    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}
$ nvcc -o t2089 t2089.cu -lnppig
$ compute-sanitizer ./t2089
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

Hi,

I missed that right there in the documentation! Thanks a lot for pointing these out; the other (old) batch question in the forum, Image batch resize with npp, should also reference this.
