CUDA memory copy (cudaMemcpy) fails after NPP sum function (nppiSum_8u_C3R)

Hi,

I am running the NPP sum function on an RGB image.
Afterwards I want to perform other CUDA operations, such as memory copies.

I am working on a Jetson Nano with JetPack 4.6.3.

Here is a bit of code that exemplifies my use case:

#include <cuda_runtime.h>
#include <npp.h>
#include <iostream>

int main() {
  NppStatus status = NPP_NO_ERROR;
  cudaError_t error = cudaSuccess;

  int width = 1920;
  int height = 1080;
  int stride = width*3;

  size_t size = stride * height;
  NppiSize input_size = { width, height };
  int scratch_buffer_size = 0;

  Npp8u* input_buffer = nullptr;
  Npp8u* input_buffer_host = nullptr;
  Npp8u* scratch_buffer = nullptr;

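  // Allocate the device input image (its contents are left uninitialized in this repro).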
  error = cudaMalloc ((void **) &input_buffer, sizeof (Npp8u) * size);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

  cudaDeviceSynchronize ();

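  // Allocate a pinned host buffer used as the destination of the later device-to-host copy.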
  error = cudaMallocHost ((void **) &input_buffer_host, sizeof (Npp8u) * size);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

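  // Query the required size of the device scratch buffer for the sum primitive.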
  status = nppiSumGetBufferHostSize_8u_C3R (input_size, &scratch_buffer_size);
  if (NPP_NO_ERROR != status) {
    std::cerr << "GetBufferSize - Npp status: " << status << std::endl;
    return -1;
  }

  error = cudaMalloc ((void **) &scratch_buffer, sizeof (Npp8u) * scratch_buffer_size);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

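  // Per-channel sum results; note that this array lives on the host stack.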
  Npp64f sum[3] = { 0 };
  status = nppiSum_8u_C3R (input_buffer, stride, input_size, scratch_buffer, sum);
  if (NPP_NO_ERROR != status) {
    std::cerr << "Sum - Npp status: " << status << std::endl;
    return -1;
  }

  std::cout << "Sum: " << sum[0] << " " << sum[1] << " " << sum[2] << std::endl;

  cudaDeviceSynchronize ();

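  // This is the copy that fails once the sum has been executed.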
  error = cudaMemcpy (input_buffer_host, input_buffer, sizeof (Npp8u) * size, cudaMemcpyDeviceToHost);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

  cudaDeviceSynchronize ();

  cudaFree (scratch_buffer);
  cudaFreeHost (input_buffer_host);
  cudaFree (input_buffer);
  
  return 0;
}

I find that the CUDA operations I run after the NPP sum function fail:

Sum: 0 0 0
unspecified launch failure

If I deactivate the NPP sum call, the code works.
If I keep the NPP sum call but deactivate the CUDA operations that follow it, it also works.
I have reproduced the same behavior with the single-channel sum.
I have also run the cuda-memcheck tool.
It shows an error on the cudaMemcpy operation, even if I deactivate the sum:

nvidia@ubuntu:~/AWB2023Q1/snippets/build$ cuda-memcheck --tool memcheck ./npp_sum_test
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMalloc.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x32081c]
=========     Host Frame:/usr/local/cuda-10.2/lib64/libcudart.so.10.2 (cudaMalloc + 0x144) [0x3b7bc]
=========     Host Frame:./npp_sum_test [0xeac]
=========     Host Frame:/lib/aarch64-linux-gnu/libc.so.6 (__libc_start_main + 0xe0) [0x207a0]
=========     Host Frame:./npp_sum_test [0xd54]
=========
========= ERROR SUMMARY: 1 error

However, outside of cuda-memcheck the code itself seems to run fine.

My questions would be along the lines of:
Is my use of the NPP sum function wrong?
Is there some other CUDA management issue I should be considering?
Could the NPP sum function itself have some sort of issue?

Thanks in advance for any help : )

SOLUTION: For anyone with a similar question: the sum result array passed to nppiSum_8u_C3R (the Npp64f pointer) must point to device memory, not to a host array as in the snippet above.
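Below is a minimal sketch of the corrected portion of the snippet above (the sum_device name is my own); the only change is that the result array is allocated with cudaMalloc and copied back to the host before printing:

  // Allocate the per-channel sum results in device memory instead of on the host stack.
  Npp64f* sum_device = nullptr;
  error = cudaMalloc ((void **) &sum_device, sizeof (Npp64f) * 3);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

  status = nppiSum_8u_C3R (input_buffer, stride, input_size, scratch_buffer, sum_device);
  if (NPP_NO_ERROR != status) {
    std::cerr << "Sum - Npp status: " << status << std::endl;
    return -1;
  }

  // Copy the three Npp64f results back to the host before reading them.
  Npp64f sum[3] = { 0 };
  error = cudaMemcpy (sum, sum_device, sizeof (Npp64f) * 3, cudaMemcpyDeviceToHost);
  if (cudaSuccess != error) {
    std::cerr << cudaGetErrorString (error) << std::endl;
    return -1;
  }

  std::cout << "Sum: " << sum[0] << " " << sum[1] << " " << sum[2] << std::endl;

  cudaFree (sum_device);

With that change the subsequent cudaMemcpy should no longer fail.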
