NPP invalidates cuda unified memory.

The following code crashes with a “Bus error” in the second memcpy (the one inside the loop).

#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <chrono>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <nppi.h>

main(int argc, char *argv[])
  const int NFRAMES = 1000;
  const int w = 4024;
  const int h = 3036;
  const int bayer_size = w * h * 1;
  const int rgb_size = w * h * 4;
  NppStatus ret;
  char *src;
  char *dst;
  char *cuda_src;
  char *cuda_dst;
  cudaError_t cudaerr;
  int src_line_step = w * 1;
  NppiSize src_size = {w, h};
  NppiRect src_roi = {0, 0, w, h};
  int dst_line_step = w * 4;

  src = (char *) malloc(bayer_size);
  memset(src, 0, bayer_size);

  dst = (char *) malloc(rgb_size);

  cudaerr = cudaMallocManaged((void **) &cuda_src, bayer_size);
  if (cudaerr) {
    printf("cudaMalloc 1 failed with error code %d.\n", cudaerr);
  cudaerr = cudaMallocManaged((void **) &cuda_dst, rgb_size);
  if (cudaerr) {
    printf("cudaMalloc 2 failed with error code %d.\n", cudaerr);

  memcpy(cuda_src, src, bayer_size);

  printf("Processing %d frames...\n", NFRAMES);
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < NFRAMES; ++i) {
    ret = nppiCFAToRGBA_8u_C1AC4R((const Npp8u *) cuda_src,
                                  (Npp8u *) cuda_dst,
    if (ret < 0) {
      printf("Error %d in NPP\n", ret);

    memcpy(dst, cuda_dst, rgb_size);
  auto end = std::chrono::steady_clock::now();

  std::chrono::duration<double> diff = end - start;
  printf("Time: %f\n", diff.count());
  printf("Frame Time: %f\n", diff.count() / NFRAMES);
  printf("Effective Frame Rate: %f\n", 1 / (diff.count() / NFRAMES));

  return 0;

Using cuda-gdb, the value of cuda_dst variable is reported as “cannot access memory at address 0x101cdd000”. Before the function call though, the memory is valid and accessible. Commenting the call to nppiCFAToRGBA_8u_C1AC4R fixes the crash. So it seems NPP is doing something that invalidates the memory.

If I use cudaMemcpy instead of memcpy, this works fine (although slowly). Any ideas?

Put a cudaDeviceSynchronize() after the nppi call, before the memcpy

and read about why this matters:

Concurrent (host + device) UM access is not available on Jetson TX2 but should be available on Jetson Xavier (AGX). As a general rule, check the relevant device property when using UM to be sure that the UM concurrent access mode is supported. If it is not, then the usual approach is to place a cudaDeviceSynchronize() call after a kernel launch but before host access to managed data.