cudaMemcpy is not working in Jetson Xavier

I’m creating code that uses Jetson utils to get an IP Camera stream, overlay it, and save it.

First, I implemented a custom Mat class (like OpenCV's Mat) and used it to apply a producer-consumer pattern.

namespace rs {
class Mat {
  template <typename T = uchar3>
  void create(int _rows, int _cols, T* _data = nullptr) {
    this->cols = _cols;
    this->rows = _rows;
    auto size  = _cols * _rows * sizeof(uchar3);

    cudaError_t res = cudaSuccess;
    if (data && std::is_same<T, uchar3>::value) {
      res = cudaMalloc((void**)&data, size);
      res = cudaMemcpy(this->data, _data, size, cudaMemcpyDeviceToDevice);
    else if (size > 0 && _data == nullptr) {
      res = cudaMalloc((void**)&data, size);
      res = cudaMemset(this->data, 0, size);

  ////// deep copy //////
  void copyTo(Mat& dest) const
      dest.cols = this->cols;
      dest.rows = this->rows;
      auto size = dest.cols * dest.rows * sizeof(uchar3);

      cudaError_t res = cudaMalloc((void**)&, size);
      res = cudaMemcpy(, this->data, size, cudaMemcpyDeviceToDevice); // Error !!
      ...  // break point

  Mat() { create(0, 0); }
  Mat(int _rows, int _cols) { create(_rows, _cols); }
  Mat(const rs::Size& _size) { create(_size.height, _size.width); }

  ////// shallow copy //////
  Mat(const Mat& other)
    this->rows = other.rows;
    this->cols = other.cols;
    this->data =;
    this->ref  = other.ref;

    if (ref) {

When using the copyTo function as shown below,

memory is allocated for the data of only_polygon_img and only_text_img,
but the data of img is not actually copied; only 0x00 values are read back.

When I run Capture from jetson-utils, img.data contains the values just fine in GDB.

And I know that img.data is located in device memory.

So I used cudaMemcpyDeviceToDevice in the copyTo function.

// in grabber.cpp
rs::Mat img;
input->Capture(&, &status); // videoSource capture success

// in detector.cpp
rs::Mat img = grab_buffer.front();

rs::Mat only_text_img;
rs::Mat only_polygon_img;
img.copyTo(only_text_img);         // is empty
img.copyTo(only_polygon_img);  // is empty

// (example) save image : segmentation fault (core dumped)
imageWriter* writer = imageWriter::Create("test.jpg", options);
writer->Render(, only_text_img.cols, only_text_img.rows); 

All res values will return cudaSuccess. Is there something I’m missing?

I’m spending a lot of time thinking about this.

Can someone help me with this?

I would suggest running your code with compute-sanitizer memcheck and leakcheck and fix all reported errors, if any.

Please show a complete minimal reproducer for your problem (a single file with a main function, using only Mat and no other dependencies). Then others can help you debug it, and you may even find the bug yourself while creating this reproducer.

Thanks for the advice!
I’m new to CUDA so my question may be stupid. Please bear with me.

I created the simplest program as below for testing.

#include <cuda_runtime.h>
#include <memory.h>
#include <stdio.h>

// Minimal round trip used to check device-to-device copies:
// host (h1_data) -> device (d1_data) -> device (d2_data) -> host (h2_data),
// then print the pixel that came back.
//
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
  // Element count and byte size are kept separate: host arrays are sized in
  // elements, while cudaMalloc/cudaMemcpy take a size in bytes.
  const size_t count      = 1;
  const size_t total_size = count * sizeof(uchar3);

  // Reports a failed CUDA call on stderr; yields true on success.
  auto check = [](cudaError_t err, const char* what) -> bool {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
  };

  // Make test data
  uchar3 h1_data[count];
  h1_data[0].x = h1_data[0].y = h1_data[0].z = 10;

  uchar3  h2_data[count] = {};
  uchar3* d1_data        = nullptr;
  uchar3* d2_data        = nullptr;

  // The && chain runs the steps in order and stops at the first failure, so a
  // later step never operates on a buffer that was not set up.
  const bool ok =
      // Step 1. Generate Device Memory
      check(cudaMalloc((void**)&d1_data, total_size), "cudaMalloc(d1_data)") &&
      check(cudaMemcpy(d1_data, h1_data, total_size, cudaMemcpyHostToDevice),
            "cudaMemcpy(host -> d1_data)") &&
      // Step 2. Copy Device to Device
      check(cudaMalloc((void**)&d2_data, total_size), "cudaMalloc(d2_data)") &&
      check(cudaMemcpy(d2_data, d1_data, total_size, cudaMemcpyDeviceToDevice),
            "cudaMemcpy(d1_data -> d2_data)") &&
      // Step 3. Copy Device to Host
      check(cudaMemcpy(h2_data, d2_data, total_size, cudaMemcpyDeviceToHost),
            "cudaMemcpy(d2_data -> host)");

  // Device memory cannot be dereferenced on the host; only the host copy is
  // printed, and only when the whole chain succeeded.
  if (ok) {
    printf("%d %d %d\n", h2_data[0].x, h2_data[0].y, h2_data[0].z);
  }

  // cudaFree(nullptr) is a no-op, so this is safe after a partial failure.
  cudaFree(d2_data);
  cudaFree(d1_data);

  return ok ? 0 : 1;
}

In the code, d1_data and d2_data appear to contain no data when checked with gdb.

But as a result, when I output h2_data, it comes out as 10, 10, 10.

It seems that CUDA device memory cannot be accessed directly from host code (its contents can only be read after copying them to the host).

Is there any way to output the cuda memory, like printf?

And is there any way to check whether an arbitrary pointer (e.g. h1_data or d1_data) refers to host or device memory?

device memory cannot be accessed from the host. From the host, you need to first copy the data to host memory. printf can be used within a kernel.

cudaPointerGetAttributes can determine if a pointer is a host pointer or device pointer.

1 Like

Thank you for your response.
I think I misunderstood the situation because the values were not being displayed in gdb.
The rest looks like it needs to be checked in jetson-utils!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.