I’ve been using the following program to measure NPP (nppi) image-processing performance on a Jetson TX2.
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <chrono>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <nppi.h>
int
main(int argc, char *argv[])
{
const int NFRAMES = 10000;
const int w = 4024;
const int h = 3036;
const int bayer_size = w * h * 1;
const int rgb_size = w * h * 4;
NppStatus ret;
char *src;
char *cuda_src;
char *cuda_dst;
cudaError_t cudaerr;
int src_line_step = w * 1;
NppiSize src_size = {w, h};
NppiRect src_roi = {0, 0, w, h};
int dst_line_step = w * 4;
int fd = open("/dev/urandom", 0);
src = (char *) mmap(NULL, bayer_size, PROT_READ, 0, fd, 0);
cudaerr = cudaHostAlloc((void **) &cuda_src, bayer_size, cudaHostAllocMapped);
if (cudaerr) {
printf("cudaHostAlloc 1 failed with error code %d.\n", cudaerr);
exit(1);
}
cudaerr = cudaHostAlloc((void **) &cuda_dst, rgb_size, cudaHostAllocMapped);
if (cudaerr) {
printf("cudaHostAlloc 2 failed with error code %d.\n", cudaerr);
exit(1);
}
cudaMemcpy(cuda_src, src, bayer_size, cudaMemcpyDefault);
printf("Processing %d frames...\n", NFRAMES);
auto start = std::chrono::steady_clock::now();
decltype(start) last = std::chrono::steady_clock::now();
for (int i = 0; i < NFRAMES; ++i) {
ret = nppiCFAToRGBA_8u_C1AC4R((const Npp8u *) cuda_src,
src_line_step,
src_size,
src_roi,
(Npp8u *) cuda_dst,
dst_line_step,
NPPI_BAYER_RGGB,
NPPI_INTER_UNDEFINED,
255);
if (ret) {
printf("Error %d in NPP\n", ret);
exit(1);
}
if (i % 100 == 0) {
auto now = std::chrono::steady_clock::now();
std::chrono::duration<double> d = now-last;
printf("i: %d (time: %f ms)\n", i, d.count() * 1000);
last=now;
}
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
printf("Time: %f\n", diff.count());
printf("Frame Time: %f\n", diff.count() / NFRAMES);
printf("Effective Frame Rate: %f\n", 1 / (diff.count() / NFRAMES));
return 0;
}
If you run this, you will see that after around 900 frames the process slows down immensely (on the order of thousands of times slower). If I use cudaMalloc instead of cudaHostAlloc, performance remains constant.
To give you some context, the reason I started trying cudaHostAlloc was because in my application, copying to and from cuda was the main bottleneck and most of the program’s time was actually spent copying, rather than debayering. I was hoping I could use memory mapped buffers to get around the extra copies.