nppiCopy_32f_C3P3R() causes undefined behavior

This code is executed serially inside for(int i = 0; i < batch; ++i), and yoloInput.data() has the correct offset on every iteration. Its purpose is to fill the neural network's input buffer with several prepared images.

/**
 * @brief Split the packed 3-channel float image at dstBorderPtr into three
 *        planes written back to back into @p yoloInput.
 *
 * The destination planes start at data(), data() + YOLO_PLANAR_OFFSET and
 * data() + 2 * YOLO_PLANAR_OFFSET. The copy is followed by a
 * cudaDeviceSynchronize() so that asynchronous kernel failures surface here.
 *
 * NOTE(review): this assumes yoloInput.size() and YOLO_PLANAR_OFFSET are both
 * counted in float elements and that one plane occupies YOLO_PLANAR_OFFSET
 * elements -- confirm against the Blob definition.
 *
 * @param yoloInput Destination buffer view; must hold at least three planes.
 * @return The status of nppiCopy_32f_C3P3R(); NPP_ERROR_RESERVED when the
 *         destination is too small for three planes;
 *         NPP_CUDA_KERNEL_EXECUTION_ERROR when the copy was accepted but the
 *         following device synchronisation reported a CUDA error.
 */
NppStatus
PreprocessingGpuYolo::transposeToCHW(Blob yoloInput)
{
    // The third plane begins at 2 * YOLO_PLANAR_OFFSET and is itself one plane
    // long, so the buffer has to cover three full planes. Checking only for
    // two planes lets the last plane run past the end of the allocation.
    if (yoloInput.size() < YOLO_PLANAR_OFFSET * 3) {
        std::cout << "transposeToCHW() device buffer overflow, exit ..";
        return NPP_ERROR_RESERVED;
    }

    float * const inputArr[3] {yoloInput.data(),
                               yoloInput.data() + YOLO_PLANAR_OFFSET,
                               yoloInput.data() + (YOLO_PLANAR_OFFSET * 2)};

    // Destination line step in bytes; the NPP API takes it as an int.
    const int planarStep = static_cast<int>(YOLO_SIZE * 1 * sizeof(IMAGE_TYPE));

    const NppStatus st = nppiCopy_32f_C3P3R(reinterpret_cast<const Npp32f*>(this->dstBorderPtr),
                                            this->yoloConstStep,
                                            inputArr,
                                            planarStep,
                                            yoloSize);

    // Kernel faults (e.g. an illegal address) are only reported by the next
    // synchronising call, not by the NPP launch itself.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    std::cout << "transposeToCHW() cudaDeviceSynchronize() return " << static_cast<int>(syncStatus) << std::endl;

    // Do not report success to the caller when the device says the kernel
    // failed; an NPP error status, if any, takes precedence.
    if (syncStatus != cudaSuccess && st >= NPP_SUCCESS) {
        return NPP_CUDA_KERNEL_EXECUTION_ERROR;
    }
    return st;
}

After an indeterminate number of iterations, nppiCopy_32f_C3P3R() stops working correctly and everything crashes. cudaDeviceSynchronize() returns 700 (cudaErrorIllegalAddress).

batch = 50
j is 0
transposeToCHW() cudaDeviceSynchronize() return 0
j is 1
transposeToCHW() cudaDeviceSynchronize() return 0
j is 2
transposeToCHW() cudaDeviceSynchronize() return 0
j is 3
transposeToCHW() cudaDeviceSynchronize() return 0
j is 4
transposeToCHW() cudaDeviceSynchronize() return 0
j is 5
transposeToCHW() cudaDeviceSynchronize() return 0
j is 6
transposeToCHW() cudaDeviceSynchronize() return 700
j is 7
Runtime error: this->copyImg() returned 4 at /home/alph/plate_recognition/src/preprocessing.cpp:525

compute-sanitizer has this output:

========= Invalid global write of size 4 bytes
========= at 0x150 in void copyChannelKernel<float, (unsigned int)3, (unsigned int)1>(const T1 *, unsigned int, T1 *, unsigned int, NppiSize)
========= by thread (0,5,0) in block (0,0,0)
========= Address 0x7f8846a63200 is out of bounds
========= and is 12801 bytes after the nearest allocation at 0x7f8838000000 of size 245760000 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x21737c]
========= in /lib/x86_64-linux-gnu/libcuda.so
========= Host Frame: [0x1e131b]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x23c178]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x123d06]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:nppiCopy_32f_C3P3R [0x11b858]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:PreprocessingGpuYolo::transposeToCHW(Blob) [0x195d8]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:PreprocessingGpuYolo::run(float *, Blob) [0x1975d]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:main [0x122b1]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:…/csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x117ce]
========= in /home/alph/plate_recognition/build/./plate_recognize

========= Invalid global write of size 4 bytes
========= at 0x150 in void copyChannelKernel<float, (unsigned int)3, (unsigned int)1>(const T1 *, unsigned int, T1 *, unsigned int, NppiSize)
========= by thread (1,5,0) in block (0,0,0)
========= Address 0x7f8846a63204 is out of bounds
========= and is 12805 bytes after the nearest allocation at 0x7f8838000000 of size 245760000 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x21737c]
========= in /lib/x86_64-linux-gnu/libcuda.so
========= Host Frame: [0x1e131b]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x23c178]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x123d06]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:nppiCopy_32f_C3P3R [0x11b858]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:PreprocessingGpuYolo::transposeToCHW(Blob) [0x195d8]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:PreprocessingGpuYolo::run(float *, Blob) [0x1975d]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:main [0x122b1]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:…/csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x117ce]
========= in /home/alph/plate_recognition/build/./plate_recognize

(Total 3968 similar results)

Linux ubuntu 20.04, geforce RTX 3060 TI, CUDA 11.6

Can you post a complete minimal reproducer?

While putting together a minimal reproducer, I found a bug in my own code. Sorry for the bother.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.