Calling NPP helper with large image gives kernel execution error

Hi all,

I’m debugging a problem in a CUDA application of ours, which I’ve reduced to the following minimal test case:

nppierror.cu

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <npp.h>

// Runs CUDA runtime call S once; on failure prints file/line/code/message, resets the device, exits.
#define CHECK_CUDA(S) do { \
    const cudaError_t cuda_status_ = (S); /* parenthesized: S may be any expression */ \
    if (cuda_status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %d (%s)\n", __FILE__, __LINE__, (int)cuda_status_, cudaGetErrorString(cuda_status_)); \
        cudaDeviceReset(); exit(EXIT_FAILURE); \
    } \
} while (false)

// Runs NPP call S once; on any status other than NPP_SUCCESS prints file/line/code, resets the device, exits.
#define CHECK_NPP(S) do { \
    const NppStatus npp_status_ = (S); /* parenthesized: S may be any expression */ \
    if (npp_status_ != NPP_SUCCESS) { \
        fprintf(stderr, "NPP error at %s:%d: %d\n", __FILE__, __LINE__, (int)npp_status_); \
        cudaDeviceReset(); exit(EXIT_FAILURE); \
    } \
} while (false)

// Minimal repro: allocate two width x height float images on device 0 and copy src -> dst via NPP.
int main()
{
    CHECK_CUDA(cudaSetDevice(0));

    size_t free_mem, total_mem;
    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    printf("%zu free, %zu total\n", free_mem, total_mem);  // %zu is the size_t conversion

    const int width = 1547;
    const int height = 524281;  // the copy fails at this height; 524280 succeeds
    // Widen before multiplying so the ~3.2 GB byte count is computed in size_t, not int.
    const size_t bytes = (size_t)width * (size_t)height * sizeof(float);
    float *src = nullptr, *dst = nullptr;
    CHECK_CUDA(cudaMalloc(&src, bytes));
    CHECK_CUDA(cudaMalloc(&dst, bytes));

    // Copy from src to dst using nppiCopy_32f_C1R; rows are tightly packed (no padding).
    NppiSize roi = { width, height };
    int step = (int)(width * sizeof(float));  // row pitch in bytes, passed for both images
    CHECK_NPP(nppiCopy_32f_C1R(src, step, dst, step, roi));

    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaFree(src));
    CHECK_CUDA(cudaFree(dst));
    CHECK_CUDA(cudaDeviceReset());

    return 0;
}

The program simply allocates two buffers on the GPU, each of size 1547 * 524281 * 4 bytes (about 3.2 GB), and then tries to copy from one to the other using the nppiCopy_32f_C1R helper from NPP.

Makefile

# Builds the nppierror repro. Recipe lines must begin with a tab character.
.PHONY: all clean

all: nppierror

# Link with g++ against the CUDA runtime and the static NPP libraries.
nppierror: nppierror.o
	g++ $< -lcudart -lnppidei_static -lnppc_static -lculibos -lm -lz -ldl -lpthread -o $@

# Compile the .cu file with nvcc, using clang-4.0 as the host compiler.
nppierror.o: nppierror.cu
	nvcc -ccbin clang-4.0 --compiler-options -fPIC -gencode=arch=compute_75,code=compute_75 --std c++11 -c -o $@ $<

clean:
	-rm -f nppierror nppierror.o

Building and running

$ make
nvcc -ccbin clang-4.0 --compiler-options -fPIC -gencode=arch=compute_75,code=compute_75 --std c++11 -c -o nppierror.o nppierror.cu
g++ nppierror.o -lcudart -lnppidei_static -lnppc_static -lculibos -lm -lz -ldl -lpthread -o nppierror
$ ./nppierror 
11382882304 free, 11554717696 total
NPP error at nppierror.cu:45: -1000
$

The -1000 from nppiCopy_32f_C1R indicates a kernel execution error:

$ grep -- -1000 /usr/include/nppdefs.h 
    NPP_CUDA_KERNEL_EXECUTION_ERROR         = -1000,
$

This is with a GeForce RTX 2080 Ti with 11 GB GPU memory on Ubuntu 18.04 running CUDA 10.1.168:

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Apr_24_19:10:27_PDT_2019
Cuda compilation tools, release 10.1, V10.1.168
$ lsb_release -d
Description:    Ubuntu 18.04.3 LTS
$ nvidia-smi -L
GPU 0: GeForce RTX 2080 Ti (UUID: GPU-1c73a2db-6272-67c1-2935-affc2dbf8ed0)
$

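If I reduce height by just 1, to 524280, the call succeeds. It should not be an out-of-memory problem: both allocations succeed, the two buffers together need only about 6.5 GB (2 × 1547 × 524281 × 4 bytes), and, as the output above shows, about 11.4 GB of GPU memory is free when the program runs.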

Has anyone seen a problem like this with large images when using the NPP functions?