Hi all,
I’m debugging a problem in a CUDA application of ours, which I’ve reduced to the following minimal test case:
nppierror.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <npp.h>
#define CHECK_CUDA(S) do { \
        cudaError_t e = S; \
        if (e != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %d\n", __FILE__, __LINE__, e); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } while (false)

#define CHECK_NPP(S) do { \
        NppStatus e = S; \
        if (e != NPP_SUCCESS) { \
            fprintf(stderr, "NPP error at %s:%d: %d\n", __FILE__, __LINE__, e); \
            cudaDeviceReset(); \
            exit(EXIT_FAILURE); \
        } \
    } while (false)
int main()
{
    CHECK_CUDA(cudaSetDevice(0));

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    printf("%zu free, %zu total\n", free_mem, total_mem);

    int width = 1547;
    int height = 524281;

    float *src;
    CHECK_CUDA(cudaMalloc(&src, width * height * sizeof(float)));
    float *dst;
    CHECK_CUDA(cudaMalloc(&dst, width * height * sizeof(float)));

    // Copy from src to dst using nppiCopy_32f_C1R
    NppiSize roi = { width, height };
    int step = width * sizeof(float);
    CHECK_NPP(nppiCopy_32f_C1R(src, step, dst, step, roi));
    CHECK_CUDA(cudaDeviceSynchronize());

    CHECK_CUDA(cudaFree(src));
    CHECK_CUDA(cudaFree(dst));
    CHECK_CUDA(cudaDeviceReset());
    return 0;
}
The program simply allocates two buffers on the GPU, each of size 1547 * 524281 * 4 bytes, and then tries to copy one to the other using the nppiCopy_32f_C1R helper from NPP.
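For reference, here is the size arithmetic spelled out (this is just an illustrative calculation, not part of the test case above):

// Illustrative only: each buffer is width * height * sizeof(float)
//   = 1547 * 524281 * 4 = 3,244,250,828 bytes, i.e. just over 3 GiB,
// so both buffers together need roughly 6 GiB of device memory.
size_t bytes_per_buffer = (size_t)1547 * 524281 * sizeof(float); // 3,244,250,828
size_t total_bytes      = 2 * bytes_per_buffer;                  // ~6 GiB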
Makefile
all: nppierror

nppierror: nppierror.o
	g++ $< -lcudart -lnppidei_static -lnppc_static -lculibos -lm -lz -ldl -lpthread -o $@

nppierror.o: nppierror.cu
	nvcc -ccbin clang-4.0 --compiler-options -fPIC -gencode=arch=compute_75,code=compute_75 --std c++11 -c -o $@ $<

clean:
	-rm -f nppierror nppierror.o
Building and running
$ make
nvcc -ccbin clang-4.0 --compiler-options -fPIC -gencode=arch=compute_75,code=compute_75 --std c++11 -c -o nppierror.o nppierror.cu
g++ nppierror.o -lcudart -lnppidei_static -lnppc_static -lculibos -lm -lz -ldl -lpthread -o nppierror
$ ./nppierror
11382882304 free, 11554717696 total
NPP error at nppierror.cu:45: -1000
$
The -1000 returned by nppiCopy_32f_C1R indicates a kernel execution error:
$ grep -- -1000 /usr/include/nppdefs.h
NPP_CUDA_KERNEL_EXECUTION_ERROR = -1000,
$
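To get a bit more detail than the bare -1000, the underlying CUDA runtime error can in principle be queried right after the failing NPP call, e.g. something like this (a sketch only; whether the error is still pending depends on whether NPP has already consumed it internally):

NppStatus s = nppiCopy_32f_C1R(src, step, dst, step, roi);
if (s != NPP_SUCCESS) {
    // cudaGetLastError() retrieves (and clears) any error left behind
    // by the kernel launch that NPP performed on our behalf.
    cudaError_t e = cudaGetLastError();
    fprintf(stderr, "NPP status %d, last CUDA error: %s\n",
            (int)s, cudaGetErrorString(e));
}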
This is on a GeForce RTX 2080 Ti with 11 GB of GPU memory, running Ubuntu 18.04 and CUDA 10.1.168:
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Apr_24_19:10:27_PDT_2019
Cuda compilation tools, release 10.1, V10.1.168
$ lsb_release -d
Description: Ubuntu 18.04.3 LTS
$ nvidia-smi -L
GPU 0: GeForce RTX 2080 Ti (UUID: GPU-1c73a2db-6272-67c1-2935-affc2dbf8ed0)
$
If I just reduce height by 1, to 524280, the error goes away. It should not be an out-of-memory problem: as the output above shows, there are about 11 GB of free GPU memory when the program runs, and both cudaMalloc calls succeed.
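For comparison, the same device-to-device copy could in principle be expressed without NPP, using plain cudaMemcpy2D with the same geometry (again just a sketch, not something included in the test case above):

// Hypothetical cross-check: copy the same ROI with the CUDA runtime instead of NPP.
// width, height, step, src and dst are the same as in the test case.
CHECK_CUDA(cudaMemcpy2D(dst, step,              // destination and its pitch in bytes
                        src, step,              // source and its pitch in bytes
                        width * sizeof(float),  // width of each row in bytes
                        height,                 // number of rows
                        cudaMemcpyDeviceToDevice));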
Has anyone seen a problem like this with large images when using the NPP functions?