This code executed serial in for(int i = 0; i < batch; ++I), yoloInput.data() has the correct offset every time. This is to fill the buffer of the neural network with several prepared images.
NppStatus
PreprocessingGpuYolo::transposeToCHW(Blob yoloInput)
{
NppStatus st;
if (yoloInput.data() + yoloInput.size() >= yoloInput.data() + (YOLO_PLANAR_OFFSET * 2)) {
float * const inputArr[3] {yoloInput.data(),
yoloInput.data() + YOLO_PLANAR_OFFSET,
yoloInput.data() + (YOLO_PLANAR_OFFSET * 2)};
int planarStep = YOLO_SIZE * 1 * sizeof(IMAGE_TYPE);
st = nppiCopy_32f_C3P3R((Npp32f*)this->dstBorderPtr,
this->yoloConstStep,
inputArr,
planarStep,
yoloSize );
int rs = cudaDeviceSynchronize();
std::cout << "transposeToCHW() cudaDeviceSynchronize() return " << rs << std::endl;
return st;
} else {
std::cout << "transposeToCHW() device buffer overflow, exit ..";
return NPP_ERROR_RESERVED;
}
}
After an indefinite number of iterations, nppiCopy_32f_C3P3R() stops working correctly and everything crash. cudaDeviceSynchronize() returns 700.
batch = 50
j is 0
transposeToCHW() cudaDeviceSynchronize() return 0
j is 1
transposeToCHW() cudaDeviceSynchronize() return 0
j is 2
transposeToCHW() cudaDeviceSynchronize() return 0
j is 3
transposeToCHW() cudaDeviceSynchronize() return 0
j is 4
transposeToCHW() cudaDeviceSynchronize() return 0
j is 5
transposeToCHW() cudaDeviceSynchronize() return 0
j is 6
transposeToCHW() cudaDeviceSynchronize() return 700
j is 7
Runtime error: this->copyImg() returned 4 at /home/alph/plate_recognition/src/preprocessing.cpp:525
compute-sanitizer has this output:
========= Invalid global write of size 4 bytes
========= at 0x150 in void copyChannelKernel<float, (unsigned int)3, (unsigned int)1>(const T1 *, unsigned int, T1 , unsigned int, NppiSize)
========= by thread (0,5,0) in block (0,0,0)
========= Address 0x7f8846a63200 is out of bounds
========= and is 12801 bytes after the nearest allocation at 0x7f8838000000 of size 245760000 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x21737c]
========= in /lib/x86_64-linux-gnu/libcuda.so
========= Host Frame: [0x1e131b]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x23c178]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x123d06]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:nppiCopy_32f_C3P3R [0x11b858]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:PreprocessingGpuYolo::transposeToCHW(Blob) [0x195d8]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:PreprocessingGpuYolo::run(float, Blob) [0x1975d]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:main [0x122b1]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:…/csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x117ce]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Invalid global write of size 4 bytes
========= at 0x150 in void copyChannelKernel<float, (unsigned int)3, (unsigned int)1>(const T1 *, unsigned int, T1 , unsigned int, NppiSize)
========= by thread (1,5,0) in block (0,0,0)
========= Address 0x7f8846a63204 is out of bounds
========= and is 12805 bytes after the nearest allocation at 0x7f8838000000 of size 245760000 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x21737c]
========= in /lib/x86_64-linux-gnu/libcuda.so
========= Host Frame: [0x1e131b]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x23c178]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame: [0x123d06]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:nppiCopy_32f_C3P3R [0x11b858]
========= in /usr/local/cuda-11.6/targets/x86_64-linux/lib/libnppidei.so.11
========= Host Frame:PreprocessingGpuYolo::transposeToCHW(Blob) [0x195d8]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:PreprocessingGpuYolo::run(float, Blob) [0x1975d]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:main [0x122b1]
========= in /home/alph/plate_recognition/build/./plate_recognize
========= Host Frame:…/csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x117ce]
========= in /home/alph/plate_recognition/build/./plate_recognize
(Total 3968 similar results)
Linux ubuntu 20.04, geforce RTX 3060 TI, CUDA 11.6