Initcheck fails for an npp API

nglee · July 24, 2020, 5:27pm

Hi, I’m confused why initcheck reports errors with this code (cuda10.2, win10, RTX 2080 Ti):

#include "cuda_runtime.h"
#include <nppi.h>
#include <cstdio>
#include <random>
#include <ctime>

template <typename T>
void _check(T result, char const* const func, char const* const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), cudaGetErrorName(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
#define checkCudaErrors(val) _check((val), #val, __FILE__, __LINE__)

constexpr size_t h = 987;
constexpr size_t w = 988;
constexpr size_t c = 3;
constexpr size_t size = h * w * c;

unsigned char h_src[size];
unsigned char h_dst[size];

int main()
{
	std::srand(std::time(nullptr));
    for (size_t i = 0; i < size; ++i)
        h_src[i] = std::rand() / ((RAND_MAX + 1) / 255);
    unsigned char* d_srcDst;
    size_t step = 0;

    checkCudaErrors(cudaMallocPitch(&d_srcDst, &step, w * c, h));
    checkCudaErrors(cudaMemcpy2D(d_srcDst, step, h_src, w * c, w * c, h, cudaMemcpyHostToDevice));

    NppStatus status = nppiMirror_8u_C3IR(d_srcDst, step, NppiSize{w, h}, NPP_HORIZONTAL_AXIS);
    if (status != NPP_NO_ERROR)
    {
        printf("npp error\n");
        return 1;
    }

    checkCudaErrors(cudaMemcpy2D(h_dst, w * c, d_srcDst, step, w * c, h, cudaMemcpyDeviceToHost));

    for (size_t _h = 0; _h < h; ++_h)
        for (size_t _w = 0; _w < w; ++_w)
            for (size_t _c = 0; _c < c; ++_c)
            {
                const unsigned char src_val = h_src[_h * w * c + _w * c + _c];
                const unsigned char dst_val = h_dst[(h - 1 - _h) * w * c + _w * c + _c]; // horizontal flip

                if (src_val != dst_val)
                {
                    printf("wrong!\n");
                    return 1;
                }
            }

    printf("complete\n");
    return 0;
}

Output of initcheck:

========= Uninitialized __global__ memory read of size 1
=========     at 0x00000160 in void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int)
=========     by thread (28,3,0) in block (0,3,0)
=========     Address 0xb00ecf454
=========     Device Frame:void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int) (void inPlaceMirrorKernel<unsigned char, unsigned int=3, NppiAxis=0>(Pixel<unsigned char, unsigned int=3>*, unsigned int, NppiSize, int, int) : 0x160)
=========     Saved host backtrace up to driver entry point
=========     Host Frame:C:\Windows\system32\nvcuda.dll (cuLaunchKernel + 0x218) [0x1c9718]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiWarpPerspectiveQuad_8u_P4R_Ctx + 0x7f6) [0x18ef06]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiWarpPerspectiveQuad_8u_P4R_Ctx + 0x3c4) [0x18ead4]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll [0xfc59]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll [0x149d]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nppig64_10.dll (nppiMirror_8u_C3IR + 0x81) [0x18e11]
=========     Host Frame:E:\repos\opencv_cuda_test\x64\Release\npp_mirror_bug_report_10.2.exe (main + 0x148) [0xdfd8]
=========     Host Frame:E:\repos\opencv_cuda_test\x64\Release\npp_mirror_bug_report_10.2.exe (__scrt_common_main_seh + 0x10c) [0xe344]
=========     Host Frame:C:\Windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
=========     Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========
========= ERROR SUMMARY: 102816 errors

nglee · July 25, 2020, 2:46pm

This is a further investigation of this issue.

I’m guessing that the NPP API reads the regions outside the ROI, probably for performance.

In the example code, cudaMallocPitch is used to allocate the device memory. Because of that, it has extra bytes which are not initialized by cudaMemcpy2D at the end of each row. The uninitialized error may stem from the NPP API reading these extra bytes.

To justify my hypothesis, I’ve added this code: checkCudaErrors(cudaMemset(d_srcDst, 0x00, step * h)); before calling the NPP API. The code tries to initialize every byte(even the extra bytes at each row) allocated by cudaMallocPitch. I observed that this fix removed the uninitialized error.

I’ve also tried using cudaMalloc and cudaMemcpy instead of cudaMallocPitch and cudaMemcpy2D. Again, it removed the uninitialized error.

The test code for cudaMalloc version is:

#include "cuda_runtime.h"
#include <nppi.h>
#include <cstdio>
#include <random>
#include <ctime>

template <typename T>
void _check(T result, char const* const func, char const* const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
            static_cast<unsigned int>(result), cudaGetErrorName(result), func);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
#define checkCudaErrors(val) _check((val), #val, __FILE__, __LINE__)

constexpr size_t h = 987;
constexpr size_t w = 988;
constexpr size_t c = 3;
constexpr size_t size = h * w * c;

unsigned char h_src[size];
unsigned char h_dst[size];

int main()
{
	std::srand(std::time(nullptr));
    for (size_t i = 0; i < size; ++i)
        h_src[i] = std::rand() / ((RAND_MAX + 1) / 255);
    unsigned char* d_srcDst;

	checkCudaErrors(cudaMalloc(&d_srcDst, size)); // instead of cudaMallocPitch
	checkCudaErrors(cudaMemcpy(d_srcDst, h_src, size, cudaMemcpyHostToDevice)); // instead of cudaMemcpy2D

    NppStatus status = nppiMirror_8u_C3IR(d_srcDst, w * c, NppiSize{w, h}, NPP_HORIZONTAL_AXIS); // step = w * c
    if (status != NPP_NO_ERROR)
    {
        printf("npp error\n");
        return 1;
    }

	checkCudaErrors(cudaMemcpy(h_dst, d_srcDst, size, cudaMemcpyDeviceToHost)); // instead of cudaMemcpy2D

    for (size_t _h = 0; _h < h; ++_h)
        for (size_t _w = 0; _w < w; ++_w)
            for (size_t _c = 0; _c < c; ++_c)
            {
                const unsigned char src_val = h_src[_h * w * c + _w * c + _c];
                const unsigned char dst_val = h_dst[(h - 1 - _h) * w * c + _w * c + _c]; // horizontal flip

                if (src_val != dst_val)
                {
                    printf("wrong!\n");
                    return 1;
                }
            }

    printf("complete\n");
    return 0;
}

Topic		Replies	Views
nppiMalloc_XX_XX not working at all CUDA Programming and Performance	2	835	March 28, 2012
NPP invalidates cuda unified memory. GPU-Accelerated Libraries	1	1064	February 5, 2019
nppiMalloc_8u_C1 returning 0 CUDA Programming and Performance	1	9872	May 17, 2011
Error when running cuda-memcheck on command line. CUDA-MEMCHECK	3	2613	December 4, 2017
Memory Exceptions using NPP 1.1 ..in samples as well as in my own code CUDA Programming and Performance	1	7469	March 9, 2011
initcheck and malloc/free don't get along CUDA Programming and Performance	1	697	May 19, 2016
nppsMean_32fc corrupts device memory CUDA Programming and Performance	3	837	January 22, 2014
VRam Leak when calling npp functions GPU-Accelerated Libraries npp	5	492	December 31, 2023
Internal Memcheck Error: Initialization failed CUDA-MEMCHECK	2	1544	September 27, 2021
Error running nvprof with NPP-based GPU computation CUDA Programming and Performance	2	630	October 18, 2018

Initcheck fails for an npp API

Related topics