Hi,
I am running the NPP sum function on an RGB image.
Afterwards I want to perform other CUDA operations, like memory copies and such.
I am working on a Jetson Nano with JetPack 4.6.3.
Here is a bit of code that exemplifies my use case:
#include <npp.h>
#include <iostream>
int main() {
    NppStatus status = NPP_NO_ERROR;
    cudaError_t error = cudaSuccess;

    int width = 1920;
    int height = 1080;
    int stride = width * 3;
    size_t size = stride * height;
    NppiSize input_size = { width, height };
    int scratch_buffer_size = 0;

    Npp8u* input_buffer = nullptr;
    Npp8u* input_buffer_host = nullptr;
    Npp8u* scratch_buffer = nullptr;

    error = cudaMalloc ((void **) &input_buffer, sizeof (Npp8u) * size);
    if (cudaSuccess != error) {
        std::cerr << cudaGetErrorString (error) << std::endl;
        return -1;
    }
    cudaDeviceSynchronize ();

    error = cudaMallocHost ((void **) &input_buffer_host, sizeof (Npp8u) * size);
    if (cudaSuccess != error) {
        std::cerr << cudaGetErrorString (error) << std::endl;
        return -1;
    }

    status = nppiSumGetBufferHostSize_8u_C3R (input_size, &scratch_buffer_size);
    if (NPP_NO_ERROR != status) {
        std::cerr << "GetBufferSize - Npp status: " << status << std::endl;
        return -1;
    }

    error = cudaMalloc ((void **) &scratch_buffer, sizeof (Npp8u) * scratch_buffer_size);
    if (cudaSuccess != error) {
        std::cerr << cudaGetErrorString (error) << std::endl;
        return -1;
    }

    Npp64f sum[3] = { 0 };
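    // NOTE (see the SOLUTION at the end): nppiSum_8u_C3R requires its result
    // pointer to reference device memory; this host stack array is the bug.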
    status = nppiSum_8u_C3R (input_buffer, stride, input_size, scratch_buffer, sum);
    if (NPP_NO_ERROR != status) {
        std::cerr << "Sum - Npp status: " << status << std::endl;
        return -1;
    }
    std::cout << "Sum: " << sum[0] << " " << sum[1] << " " << sum[2] << std::endl;
    cudaDeviceSynchronize ();

    error = cudaMemcpy (input_buffer_host, input_buffer, sizeof (Npp8u) * size, cudaMemcpyDeviceToHost);
    if (cudaSuccess != error) {
        std::cerr << cudaGetErrorString (error) << std::endl;
        return -1;
    }
    cudaDeviceSynchronize ();

    cudaFree (scratch_buffer);
    cudaFreeHost (input_buffer_host);   // memory from cudaMallocHost must be freed with cudaFreeHost, not cudaFree
    cudaFree (input_buffer);

    return 0;
}
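For reference, I build it along these lines (assuming the default CUDA 10.2 paths on JetPack 4.6.3; on this version the nppiSum functions live in the NPP statistics library, as far as I can tell):

    g++ npp_sum_test.cpp -o npp_sum_test -I/usr/local/cuda-10.2/include -L/usr/local/cuda-10.2/lib64 -lnppist -lnppc -lcudart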
I find that the CUDA operations I run after the NPP sum function fail:
Sum: 0 0 0
unspecified launch failure
If I deactivate the NPP sum call, the code works.
If I keep the NPP sum call but deactivate every CUDA operation that follows it, it also works.
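To narrow it down, one can query the sticky CUDA error state right after the NPP call; a small diagnostic sketch (not part of the program above):

    status = nppiSum_8u_C3R (input_buffer, stride, input_size, scratch_buffer, sum);
    cudaDeviceSynchronize ();                 // let any work launched by the NPP call finish
    cudaError_t last = cudaGetLastError ();   // picks up the same failure the later calls report
    if (cudaSuccess != last) {
        std::cerr << "After nppiSum: " << cudaGetErrorString (last) << std::endl;
    }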
I have replicated this behavior with the single-channel sum.
I have also run the cuda-memcheck tool.
It shows an error on the cudaMemcpy operation, even if I deactivate the sum:
nvidia@ubuntu:~/AWB2023Q1/snippets/build$ cuda-memcheck --tool memcheck ./npp_sum_test
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMalloc.
all CUDA-capable devices are busy or unavailable
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x32081c]
========= Host Frame:/usr/local/cuda-10.2/lib64/libcudart.so.10.2 (cudaMalloc + 0x144) [0x3b7bc]
========= Host Frame:./npp_sum_test [0xeac]
========= Host Frame:/lib/aarch64-linux-gnu/libc.so.6 (__libc_start_main + 0xe0) [0x207a0]
========= Host Frame:./npp_sum_test [0xd54]
=========
========= ERROR SUMMARY: 1 error
However, run outside of cuda-memcheck, the code itself seems to work fine.
My questions would be along the lines of:
Is my use of the NPP sum function wrong?
Is there some other CUDA management issue I should be considering?
Could the NPP sum function itself have an issue?
Thanks in advance for any help : )
SOLUTION: For anyone with a similar question: the result of the sum (the last argument of nppiSum_8u_C3R) must point to device memory, not host memory. Passing a host array, as in the code above, is the bug.
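A minimal sketch of the corrected portion, reusing the buffers from above (error checks omitted for brevity; the variable name sum_device is mine):

    // The result argument of nppiSum_8u_C3R must point to device memory.
    Npp64f* sum_device = nullptr;
    cudaMalloc ((void **) &sum_device, sizeof (Npp64f) * 3);

    status = nppiSum_8u_C3R (input_buffer, stride, input_size, scratch_buffer, sum_device);

    // Copy the three per-channel sums back to the host before printing them.
    Npp64f sum[3] = { 0 };
    cudaMemcpy (sum, sum_device, sizeof (Npp64f) * 3, cudaMemcpyDeviceToHost);
    std::cout << "Sum: " << sum[0] << " " << sum[1] << " " << sum[2] << std::endl;

    cudaFree (sum_device);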