Initcheck fails for an npp API

Hi, I’m confused why initcheck reports errors with this code (cuda10.2, win10, RTX 2080 Ti):

#include "cuda_runtime.h"
#include <nppi.h>
#include <cstdio>
#include <random>
#include <ctime>

template <typename T>
void _check(T result, char const* const func, char const* const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), cudaGetErrorName(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
#define checkCudaErrors(val) _check((val), #val, __FILE__, __LINE__)

constexpr size_t h = 987;
constexpr size_t w = 988;
constexpr size_t c = 3;
constexpr size_t size = h * w * c;

unsigned char h_src[size];
unsigned char h_dst[size];

int main()
{
	std::srand(std::time(nullptr));
    for (size_t i = 0; i < size; ++i)
        h_src[i] = std::rand() / ((RAND_MAX + 1) / 255);
    unsigned char* d_srcDst;
    size_t step = 0;

    checkCudaErrors(cudaMallocPitch(&d_srcDst, &step, w * c, h));
    checkCudaErrors(cudaMemcpy2D(d_srcDst, step, h_src, w * c, w * c, h, cudaMemcpyHostToDevice));

    NppStatus status = nppiMirror_8u_C3IR(d_srcDst, step, NppiSize{w, h}, NPP_HORIZONTAL_AXIS);
    if (status != NPP_NO_ERROR)
    {
        printf("npp error\n");
        return 1;
    }

    checkCudaErrors(cudaMemcpy2D(h_dst, w * c, d_srcDst, step, w * c, h, cudaMemcpyDeviceToHost));

    for (size_t _h = 0; _h < h; ++_h)
        for (size_t _w = 0; _w < w; ++_w)
            for (size_t _c = 0; _c < c; ++_c)
            {
                const unsigned char src_val = h_src[_h * w * c + _w * c + _c];
                const unsigned char dst_val = h_dst[(h - 1 - _h) * w * c + _w * c + _c]; // horizontal flip

                if (src_val != dst_val)
                {
                    printf("wrong!\n");
                    return 1;
                }
            }

    printf("complete\n");
    return 0;
}

Output of initcheck:

========= Uninitialized __global__ memory read of size 1
=========     at 0x00000160 in void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int)
=========     by thread (28,3,0) in block (0,3,0)
=========     Address 0xb00ecf454
=========     Device Frame:void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int) (void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int) : 0x160)
=========     Saved host backtrace up to driver entry point
=========     Host Frame:C:\Windows\system32\nvcuda.dll (cuLaunchKernel + 0x218) [0x1c9718]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiWarpPerspectiveQuad_8u_P4R_Ctx + 0x7f6) [0x18ef06]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiWarpPerspectiveQuad_8u_P4R_Ctx + 0x3c4) [0x18ead4]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll [0xfc59]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll [0x149d]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiMirror_8u_C3IR + 0x81) [0x18e11]
=========     Host Frame:E:\repos\opencv_cuda_test\x64\Release\npp_mirror_bug_report_10.2.exe (main + 0x148) [0xdfd8]
=========     Host Frame:E:\repos\opencv_cuda_test\x64\Release\npp_mirror_bug_report_10.2.exe (__scrt_common_main_seh + 0x10c) [0xe344]
=========     Host Frame:C:\Windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
=========     Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========
========= ERROR SUMMARY: 102816 errors

This is a further investigation of this issue.

I’m guessing that the NPP API reads the regions outside the ROI, probably for performance.

In the example code, cudaMallocPitch is used to allocate the device memory. Because of that, it has extra bytes which are not initialized by cudaMemcpy2D at the end of each row. The uninitialized error may stem from the NPP API reading these extra bytes.

To justify my hypothesis, I’ve added this code: checkCudaErrors(cudaMemset(d_srcDst, 0x00, step * h)); before calling the NPP API. The code tries to initialize every byte(even the extra bytes at each row) allocated by cudaMallocPitch. I observed that this fix removed the uninitialized error.

I’ve also tried using cudaMalloc and cudaMemcpy instead of cudaMallocPitch and cudaMemcpy2D. Again, it removed the uninitialized error.

The test code for cudaMalloc version is:

#include "cuda_runtime.h"
#include <nppi.h>
#include <cstdio>
#include <random>
#include <ctime>

template <typename T>
void _check(T result, char const* const func, char const* const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), cudaGetErrorName(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
#define checkCudaErrors(val) _check((val), #val, __FILE__, __LINE__)

constexpr size_t h = 987;
constexpr size_t w = 988;
constexpr size_t c = 3;
constexpr size_t size = h * w * c;

unsigned char h_src[size];
unsigned char h_dst[size];

int main()
{
	std::srand(std::time(nullptr));
    for (size_t i = 0; i < size; ++i)
        h_src[i] = std::rand() / ((RAND_MAX + 1) / 255);
    unsigned char* d_srcDst;

	checkCudaErrors(cudaMalloc(&d_srcDst, size)); // instead of cudaMallocPitch
	checkCudaErrors(cudaMemcpy(d_srcDst, h_src, size, cudaMemcpyHostToDevice)); // instead of cudaMemcpy2D

    NppStatus status = nppiMirror_8u_C3IR(d_srcDst, w * c, NppiSize{w, h}, NPP_HORIZONTAL_AXIS); // step = w * c
    if (status != NPP_NO_ERROR)
    {
        printf("npp error\n");
        return 1;
    }

	checkCudaErrors(cudaMemcpy(h_dst, d_srcDst, size, cudaMemcpyDeviceToHost)); // instead of cudaMemcpy2D

    for (size_t _h = 0; _h < h; ++_h)
        for (size_t _w = 0; _w < w; ++_w)
            for (size_t _c = 0; _c < c; ++_c)
            {
                const unsigned char src_val = h_src[_h * w * c + _w * c + _c];
                const unsigned char dst_val = h_dst[(h - 1 - _h) * w * c + _w * c + _c]; // horizontal flip

                if (src_val != dst_val)
                {
                    printf("wrong!\n");
                    return 1;
                }
            }

    printf("complete\n");
    return 0;
}