Using nppiResizeBatch_8u_C3R causes exception: warp illegal address

I am modifying a working code base that originally uses nppiResize_8u_C3R so that it uses nppiResizeBatch_8u_C3R instead, for the advantages described here: NVIDIA 2D Image And Signal Performance Primitives (NPP): ResizeBatch. nppiResize_8u_C3R runs without error, but whenever the batched version is called with similar stride and pointer parameters it fails with CUDA_EXCEPTION_14 (Warp Illegal Address).
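
For reference, this is the API I am calling, paraphrased from the NPP documentation and nppi_geometry_transforms.h (the exact declaration may differ slightly between CUDA versions):

typedef struct
{
    const void *pSrc;  // source image (device memory)
    int nSrcStep;      // source image line step in bytes
    void *pDst;        // destination image (device memory)
    int nDstStep;      // destination image line step in bytes
} NppiResizeBatchCXR;

NppStatus nppiResizeBatch_8u_C3R(NppiSize oSmallestSrcSize, NppiRect oSrcRectROI,
                                 NppiSize oSmallestDstSize, NppiRect oDstRectROI,
                                 int eInterpolation, NppiResizeBatchCXR *pBatchList,
                                 unsigned int nBatchSize);

Code used below: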


#include <opencv2/opencv.hpp>
#include <nppi_geometry_transforms.h>
#include <cassert>
#include <iostream>
#include <vector>

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    cv::Mat image = cv::imread("./tests/data/persons.png");
    int w = 4000;
    int h = 4000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 2;

    std::vector<cv::Mat> images;
    for (int i = 0; i < batchSize; i++) {
        cv::Mat imgClone = image.clone();
//         imgClone.resize(1000 + i * 1000);
        images.emplace_back(imgClone);
    }
    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));
    {
        auto *inbuf = (uchar *)(m_inputBuffer);
        auto *outbuf = (uchar *)(m_resizedBuffer);

        int idx = 0;
        for (cv::Mat &image: images) {
            largestSize = getLargestSize(largestSize, {image.cols, image.rows});
            assert(image.elemSize() == 3);
            size_t N = image.rows * image.cols * image.elemSize();
            cudaMemcpyAsync(inbuf, image.data, N, cudaMemcpyHostToDevice, stream);

            mixedBufferBatch[idx].nSrcStep = image.cols * image.elemSize();// * image.rows;
            mixedBufferBatch[idx].nDstStep = dstStep;
            mixedBufferBatch[idx].pSrc = inbuf;
            mixedBufferBatch[idx].pDst = outbuf;
            inbuf += N;    // on GPU
            outbuf += (640 * 640 * m_channels);
            idx++;
        }
    }
    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "[Before Resize] ERROR OCCURED: " << err <<"\n";
        }
    }
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, mixedBufferBatch, images.size()
            );
//    NppStatus resultStatus =
//            nppiResize_8u_C3R((unsigned char *) (m_inputBuffer), largestSize.width * m_channels, largestSize, largestSizeRect,
//                              (unsigned char *) (m_resizedBuffer), dstStep, dstSize, dstRoi,
//                              NPPI_INTER_LINEAR);

//    The device encountered a load or store instruction on an invalid memory address.
//    This leaves the process in an inconsistent state and any further CUDA work will
//    return the same error. To continue using CUDA, the process must be terminated and relaunched.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cout << "[After resize] ERROR OCCURED: " << err <<"\n";
    }

    cv::Mat saveII(640, 640, CV_8UC3, cv::Scalar(10, 10, 10));
    cudaMemcpy(saveII.data, m_resizedBuffer, 640 * 640 * 3, cudaMemcpyDeviceToHost);
    (cudaStreamSynchronize(stream));
    cv::imwrite("saveII.png", saveII);


    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}

If you uncomment the nppiResize_8u_C3R section instead of using the nppiResizeBatch_8u_C3R call, you will see that it does not raise the 700 error (cudaErrorIllegalAddress) and correctly outputs a resized RGB image as expected. I would appreciate any input on this peculiar issue.

OpenCV-Free version:

#include <nppi_geometry_transforms.h>
#include <iostream>
typedef unsigned char uchar;

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    int w = 2000;
    int h = 2000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 4;

    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    cudaMemset(m_inputBuffer, 1, 5012);
    cudaMemset(m_inputBuffer, 2, batchSize * w * h * m_channels-5012);
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));

    cudaDeviceSynchronize();

    auto *inbuf = reinterpret_cast<uchar *>(m_inputBuffer);
    auto *outbuf = reinterpret_cast<uchar *>(m_resizedBuffer);

    for (int idx=0;idx<batchSize;idx++) {
        largestSize = getLargestSize(largestSize, {w, h});
        size_t N = w * h * m_channels;

        int srcStep = w * m_channels;
        mixedBufferBatch[idx].nSrcStep = srcStep;
        mixedBufferBatch[idx].nDstStep = dstStep;
        mixedBufferBatch[idx].pSrc = inbuf;
        mixedBufferBatch[idx].pDst = outbuf;
        inbuf += N;    // on GPU
        outbuf += (640 * 640 * m_channels);
        idx++;
    }
//CUDA_EXCEPTION_14 (Warp Illegal Address)
// cuda-gdb backtrace:
//#0  0x0000555559a339e0 in void resizeNaiveBatchBy2Rows<unsigned char, 3u, SampleLinearImageBatchBy2Rows<unsigned char,
// 3> >(SampleLinearImageBatchBy2Rows<unsigned char, 3>,
// float, float, nppc::ClipRect, NppiRect, NppiResizeBatchCXR*, unsigned int)<<<(20,320,1),(128,1,1)>>> ()
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, mixedBufferBatch, batchSize
            );
//    NppStatus resultStatus =
//            nppiResize_8u_C3R((uchar*)(m_inputBuffer), w * m_channels, largestSize, largestSizeRect,
//                      (uchar*)(m_resizedBuffer), 640 * 3, dstSize, dstRoi, NPPI_INTER_LINEAR);

    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "ERROR OCCURED: " << err <<"\n";
        }
    }
    if (resultStatus == NPP_RESIZE_NO_OPERATION_ERROR) {
        std::cout << "No operation! \n";
    }
    cudaDeviceSynchronize();

    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}

/*
add_executable(test_npp_resize ./npp_resize_batch.cu)
target_link_libraries(test_npp_resize
        PUBLIC
        ${CUDA_npp_LIBRARY}
        ${CUDA_nppi_LIBRARY}
        ${CUDA_LIBRARIES}
#        ${OpenCV_LIBS}
        )
*/
  1. According to the documentation, the argument passed for the pBatchList parameter must be a device pointer. Your code is not providing that; see the short sketch after this list.

  2. You should increment idx either in the body of the for-loop or in the for-loop statement, not both.
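
In isolation, fix number 1 amounts to putting the descriptor array in device memory and passing that pointer to the batched call; a minimal sketch (the full listing below does the same thing):

    NppiResizeBatchCXR *d_mixedBufferBatch;
    cudaMalloc(&d_mixedBufferBatch, sizeof(NppiResizeBatchCXR) * batchSize);
    cudaMemcpy(d_mixedBufferBatch, mixedBufferBatch,
               sizeof(NppiResizeBatchCXR) * batchSize, cudaMemcpyHostToDevice);
    // then pass d_mixedBufferBatch (not mixedBufferBatch) as pBatchList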

When I fix those 2 issues, your code runs without errors for me:

$ cat t2089.cu
#include <nppi_geometry_transforms.h>
#include <iostream>
typedef unsigned char uchar;

NppiSize getLargestSize(NppiSize first, NppiSize second)
{
    NppiSize smallest{first.width, first.height};
    if (second.height > first.height || first.height == 0) {
        smallest.height = second.height;
    }
    if (second.width > first.width || first.width == 0) {
        smallest.width = second.width;
    }
    return smallest;
}

int main()
{
    int w = 2000;
    int h = 2000;
    int m_channels = 3;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int batchSize = 4;

    int dstStep = 640 * m_channels;
    NppiSize dstSize{.width = 640, .height = 640};
    NppiRect dstRoi{.x = 0, .y = 0, .width = 640, .height = 640};

    // MixedBufferBatch struct is on CPU
    NppiResizeBatchCXR mixedBufferBatch[batchSize];
    NppiSize largestSize = {0, 0};

    // Pointers
    void *m_inputBuffer;
    void *m_resizedBuffer;
    (cudaMalloc(reinterpret_cast<void **>(&m_inputBuffer), batchSize * w * h * m_channels));
    cudaMemset(m_inputBuffer, 1, 5012);
    cudaMemset(m_inputBuffer, 2, batchSize * w * h * m_channels-5012);
    (cudaMalloc(reinterpret_cast<void **>(&m_resizedBuffer), batchSize * 640 * 640 * m_channels));

    cudaDeviceSynchronize();

    auto *inbuf = reinterpret_cast<uchar *>(m_inputBuffer);
    auto *outbuf = reinterpret_cast<uchar *>(m_resizedBuffer);

    for (int idx=0;idx<batchSize;idx++) {
        largestSize = getLargestSize(largestSize, {w, h});
        size_t N = w * h * m_channels;

        int srcStep = w * m_channels;
        mixedBufferBatch[idx].nSrcStep = srcStep;
        mixedBufferBatch[idx].nDstStep = dstStep;
        mixedBufferBatch[idx].pSrc = inbuf;
        mixedBufferBatch[idx].pDst = outbuf;
        inbuf += N;    // on GPU
        outbuf += (640 * 640 * m_channels);
//        idx++;
    }
    NppiRect largestSizeRect{.x = 0, .y = 0, .width = largestSize.width, .height = largestSize.height};
    // pBatchList must be a device pointer: copy the host-side descriptor array to the GPU
    NppiResizeBatchCXR *d_mixedBufferBatch;
    cudaMalloc(&d_mixedBufferBatch, sizeof(NppiResizeBatchCXR)*batchSize);
    cudaMemcpy(d_mixedBufferBatch, mixedBufferBatch, sizeof(NppiResizeBatchCXR)*batchSize, cudaMemcpyHostToDevice);
    NppStatus resultStatus = nppiResizeBatch_8u_C3R(
                    largestSize, largestSizeRect, dstSize, dstRoi,
                    NPPI_INTER_LINEAR, d_mixedBufferBatch, batchSize
            );

    {
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            std::cout << "ERROR OCCURED: " << err <<"\n";
        }
    }
    if (resultStatus == NPP_RESIZE_NO_OPERATION_ERROR) {
        std::cout << "No operation! \n";
    }
    cudaDeviceSynchronize();

    cudaFree(m_inputBuffer);
    cudaFree(m_resizedBuffer);
    return 0;
}
$ nvcc -o t2089 t2089.cu -lnppig
$ compute-sanitizer ./t2089
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

Hi,

I missed that right there in the documentation! Thanks a lot for pointing these out; the other (old) batch question in the forum, Image batch resize with npp, should also reference this.
