The following code crashes with a “Bus error” in the second memcpy (the one inside the loop).
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <chrono>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <nppi.h>
int
main(int argc, char *argv[])
{
const int NFRAMES = 1000;
const int w = 4024;
const int h = 3036;
const int bayer_size = w * h * 1;
const int rgb_size = w * h * 4;
NppStatus ret;
char *src;
char *dst;
char *cuda_src;
char *cuda_dst;
cudaError_t cudaerr;
int src_line_step = w * 1;
NppiSize src_size = {w, h};
NppiRect src_roi = {0, 0, w, h};
int dst_line_step = w * 4;
src = (char *) malloc(bayer_size);
memset(src, 0, bayer_size);
dst = (char *) malloc(rgb_size);
cudaerr = cudaMallocManaged((void **) &cuda_src, bayer_size);
if (cudaerr) {
printf("cudaMalloc 1 failed with error code %d.\n", cudaerr);
exit(1);
}
cudaerr = cudaMallocManaged((void **) &cuda_dst, rgb_size);
if (cudaerr) {
printf("cudaMalloc 2 failed with error code %d.\n", cudaerr);
exit(1);
}
memcpy(cuda_src, src, bayer_size);
printf("Processing %d frames...\n", NFRAMES);
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < NFRAMES; ++i) {
ret = nppiCFAToRGBA_8u_C1AC4R((const Npp8u *) cuda_src,
src_line_step,
src_size,
src_roi,
(Npp8u *) cuda_dst,
dst_line_step,
NPPI_BAYER_RGGB,
NPPI_INTER_UNDEFINED,
255);
if (ret < 0) {
printf("Error %d in NPP\n", ret);
exit(1);
}
memcpy(dst, cuda_dst, rgb_size);
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
printf("Time: %f\n", diff.count());
printf("Frame Time: %f\n", diff.count() / NFRAMES);
printf("Effective Frame Rate: %f\n", 1 / (diff.count() / NFRAMES));
return 0;
}
Using cuda-gdb, the value of cuda_dst variable is reported as “cannot access memory at address 0x101cdd000”. Before the function call though, the memory is valid and accessible. Commenting the call to nppiCFAToRGBA_8u_C1AC4R fixes the crash. So it seems NPP is doing something that invalidates the memory.
If I use cudaMemcpy instead of memcpy, this works fine (although slowly). Any ideas?