help me help you with modern CMake and CUDA: mwe for NPP

Here is a minimal test case for libnppig:

$ cat ut_nppig.cpp
#include <nppi_geometry_transforms.h>
#include <cuda_runtime_api.h>
#include <assert.h>
#include <stdio.h>

int main(){

/**
 * 1 channel 8-bit unsigned image mirror.
 */
  const int simgrows = 32;
  const int simgcols = 32;
  Npp8u *d_pSrc, *d_pDst;
  NppiSize oROI;  oROI.width = simgcols;  oROI.height = simgrows;
  const int simgsize = simgrows*simgcols*sizeof(d_pSrc[0]);
  const int dimgsize = oROI.width*oROI.height*sizeof(d_pSrc[0]);
  const int simgpix  = simgrows*simgcols;
  const int dimgpix  = oROI.width*oROI.height;
  const int nSrcStep = simgcols*sizeof(d_pSrc[0]);
  const int nDstStep = oROI.width*sizeof(d_pDst[0]);
  const NppiAxis flip = NPP_VERTICAL_AXIS;
  Npp8u *h_img = new Npp8u[simgpix];
  for (int i = 0; i < simgrows; i++)
    for (int j = 0; j < simgcols; j++) h_img[i*simgcols+j] = simgcols-j-1;
  cudaError_t err = cudaMalloc((void **)&d_pSrc, simgsize);
  assert(err == cudaSuccess);
  err = cudaMalloc((void **)&d_pDst, dimgsize);
  assert(err == cudaSuccess);
  err = cudaMemcpy(d_pSrc, h_img, simgsize, cudaMemcpyHostToDevice);
  assert(err == cudaSuccess);
  // set image to pixval initially
  err = cudaMemset(d_pDst, 0, dimgsize);
  assert(err == cudaSuccess);
  // perform mirror op
  NppStatus ret =  nppiMirror_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, flip);
  assert(ret == NPP_NO_ERROR);
  err = cudaMemcpy(h_img, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
  assert(err == cudaSuccess);
  // test for R to L flip
  for (int i = 0; i < oROI.height; i++)
    for (int j = 0; j < oROI.width; j++) assert(h_img[i*oROI.width+j] == j);
  return 0;
}

$ cat bld_nppig
#!/bin/bash
# Builds ut_nppig.cpp twice: once against the static CUDA/NPP libraries and
# once against the shared ones.
# The toolkit location defaults to /usr/local/cuda; override it with
#   CUDA_HOME=/path/to/cuda ./bld_nppig
# Stop at the first failed build instead of carrying on and exiting 0.
set -e
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
# static linking to CUDA libraries
g++ -I"${CUDA_HOME}/include" ut_nppig.cpp -L"${CUDA_HOME}/lib64" -lnppig_static -lnppc_static -lculibos -lcudart_static -lpthread -ldl -lrt -o ut_nppig_static
# dynamic linking
g++ -I"${CUDA_HOME}/include" ut_nppig.cpp -L"${CUDA_HOME}/lib64" -lnppig  -lcudart -o ut_nppig_dynamic
$ ./bld_nppig
$ ./ut_nppig_static
$ ./ut_nppig_dynamic
$

Tested on CUDA 9.2, CentOS 7, g++ 4.8.5-4.
NPP documentation: https://docs.nvidia.com/cuda/npp/index.html
This assumes a standard CUDA install, with the toolkit under /usr/local/cuda.
This is a very simple test case to demonstrate minimal library linking. For more complete NPP sample codes, refer to the CUDA sample codes:

https://docs.nvidia.com/cuda/cuda-samples/index.html#cudalibraries