NPP invalidates cuda unified memory.

elektito · February 5, 2019, 2:21pm

The following code crashes with a “Bus error” in the second memcpy (the one inside the loop).

#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <chrono>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <nppi.h>

int
main(int argc, char *argv[])
{
  const int NFRAMES = 1000;
  const int w = 4024;
  const int h = 3036;
  const int bayer_size = w * h * 1;
  const int rgb_size = w * h * 4;
  NppStatus ret;
  char *src;
  char *dst;
  char *cuda_src;
  char *cuda_dst;
  cudaError_t cudaerr;
  int src_line_step = w * 1;
  NppiSize src_size = {w, h};
  NppiRect src_roi = {0, 0, w, h};
  int dst_line_step = w * 4;

  src = (char *) malloc(bayer_size);
  memset(src, 0, bayer_size);

  dst = (char *) malloc(rgb_size);

  cudaerr = cudaMallocManaged((void **) &cuda_src, bayer_size);
  if (cudaerr) {
    printf("cudaMalloc 1 failed with error code %d.\n", cudaerr);
    exit(1);
  }
  cudaerr = cudaMallocManaged((void **) &cuda_dst, rgb_size);
  if (cudaerr) {
    printf("cudaMalloc 2 failed with error code %d.\n", cudaerr);
    exit(1);
  }

  memcpy(cuda_src, src, bayer_size);

  printf("Processing %d frames...\n", NFRAMES);
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < NFRAMES; ++i) {
    ret = nppiCFAToRGBA_8u_C1AC4R((const Npp8u *) cuda_src,
                                  src_line_step,
                                  src_size,
                                  src_roi,
                                  (Npp8u *) cuda_dst,
                                  dst_line_step,
                                  NPPI_BAYER_RGGB,
                                  NPPI_INTER_UNDEFINED,
                                  255);
    if (ret < 0) {
      printf("Error %d in NPP\n", ret);
      exit(1);
    }

    memcpy(dst, cuda_dst, rgb_size);
  }
  auto end = std::chrono::steady_clock::now();

  std::chrono::duration<double> diff = end - start;
  printf("Time: %f\n", diff.count());
  printf("Frame Time: %f\n", diff.count() / NFRAMES);
  printf("Effective Frame Rate: %f\n", 1 / (diff.count() / NFRAMES));

  return 0;
}

Using cuda-gdb, the value of cuda_dst variable is reported as “cannot access memory at address 0x101cdd000”. Before the function call though, the memory is valid and accessible. Commenting the call to nppiCFAToRGBA_8u_C1AC4R fixes the crash. So it seems NPP is doing something that invalidates the memory.

If I use cudaMemcpy instead of memcpy, this works fine (although slowly). Any ideas?

Robert_Crovella · February 5, 2019, 3:17pm

Put a cudaDeviceSynchronize() after the nppi call, before the memcpy

and read about why this matters:

[url]https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-gpu-exclusive[/url]

Concurrent (host + device) UM access is not available on Jetson TX2 but should be available on Jetson Xavier (AGX). As a general rule, check the relevant device property when using UM to be sure that the UM concurrent access mode is supported. If it is not, then the usual approach is to place a cudaDeviceSynchronize() call after a kernel launch but before host access to managed data.

Topic		Replies	Views
CUDA memory copy (cudaMemcpy) fails after NPP sum function (nppiSum_8u_C3R) GPU-Accelerated Libraries npp	0	694	February 16, 2023
nppiFilterMedian_32f_C1R - cuda-memcheck Invalid Access GPU-Accelerated Libraries	0	467	August 10, 2020
cuda-gdb accessing non-resident UVM memory crashes cuda-gdb Jetson TX1	6	1566	October 18, 2021
Unified Memory not reachable by GPU? Jetson Nano cuda	3	618	November 17, 2021
Initcheck fails for an npp API CUDA-MEMCHECK	1	627	July 25, 2020
How to avoid "memcpy“ which called by "cudaMemcpy"? Jetson AGX Xavier cuda	2	472	June 20, 2022
Error running nvprof with NPP-based GPU computation CUDA Programming and Performance	2	574	October 18, 2018
Error on accessing GPU CUDA Programming and Performance	2	685	May 29, 2011
Invalid Managed Memory Access Jetson TX2	2	1216	October 18, 2021
Concurrent CPU and GPU processing Jetson TX1	12	1628	October 18, 2021

NPP invalidates cuda unified memory.

Related topics