CUDA Segmentation Fault During Video Processing

Hello everyone,

The following program is supposed to process a live stream, but I get a segmentation fault after the first frame of the video. I think I need to free the CUDA memory that is allocated after every frame, but I could not find the correct way to do it.
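
For reference, my understanding is that the device buffers and the CUDA stream are normally allocated once before the frame loop, reused for every frame, and only released after the loop. This is just a minimal sketch of that pattern (it reuses the same CHECK macro, constants, and doInference from my code below; it is not my actual program):

void * buffers[2];
CHECK(cudaMalloc(&buffers[0], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[1], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

while (/* frames remain */ true) {
  // ... fill the host-side input array for the current frame, then:
  doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
  break; // placeholder so the sketch terminates
}

// Released once, after the last frame.
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[0]));
CHECK(cudaFree(buffers[1]));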

Inference code:

void doInference(IExecutionContext & context, cudaStream_t & stream, void ** buffers, float * input, float * output, int batchSize) {
  // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
  CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
  context.enqueue(batchSize, buffers, stream, nullptr);
  CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
}
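
For debugging I am also considering a variant that checks every call, so a failure surfaces on the exact line instead of later (as far as I know, enqueue() returns false on failure):

void doInferenceChecked(IExecutionContext & context, cudaStream_t & stream, void ** buffers, float * input, float * output, int batchSize) {
  // Same copy sizes as above; the host-side input/output arrays must be at least this large.
  CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
  if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
    std::cerr << "enqueue failed" << std::endl;
  }
  CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
  CHECK(cudaStreamSynchronize(stream));
}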

Main code:

int main(int argc, char ** argv) {
  cudaSetDevice(DEVICE);
  // create a model using the API directly and serialize it to a stream
  char * trtModelStream {nullptr};
  size_t size {0};
  std::string engine_name = STR2(NET);
  engine_name = "best_new_640" + engine_name + ".engine";
  if (argc == 2 && std::string(argv[1]) == "-s") {
    IHostMemory * modelStream {nullptr};
    APIToModel(BATCH_SIZE, & modelStream);
    assert(modelStream != nullptr);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
      std::cerr << "could not open plan output file" << std::endl;
      return -1;
    }
    p.write(reinterpret_cast<const char *>(modelStream -> data()), modelStream -> size());
    modelStream -> destroy();
    return 0;
  } else if (argc == 3 && std::string(argv[1]) == "-d") {
    std::ifstream file(engine_name, std::ios::binary);
    if (file.good()) {
      file.seekg(0, file.end);
      size = file.tellg();
      file.seekg(0, file.beg);
      trtModelStream = new char[size];
      assert(trtModelStream);
      file.read(trtModelStream, size);
      file.close();
    }
  } else {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./yolov5 -s  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov5 -d ../samples  // deserialize plan file and run inference" << std::endl;
    return -1;
  }

  // prepare input data ---------------------------
  static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
  static float prob[BATCH_SIZE * OUTPUT_SIZE];
  IRuntime * runtime = createInferRuntime(gLogger);
  assert(runtime != nullptr);
  ICudaEngine * engine = runtime -> deserializeCudaEngine(trtModelStream, size);
  assert(engine != nullptr);
  IExecutionContext * context = engine -> createExecutionContext();
  assert(context != nullptr);
  delete[] trtModelStream;
  assert(engine -> getNbBindings() == 2);
  void * buffers[2];
  // In order to bind the buffers, we need to know the names of the input and output tensors.
  // Note that indices are guaranteed to be less than IEngine::getNbBindings()
  const int inputIndex = engine -> getBindingIndex(INPUT_BLOB_NAME);
  const int outputIndex = engine -> getBindingIndex(OUTPUT_BLOB_NAME);
  assert(inputIndex == 0);
  assert(outputIndex == 1);
  std::cout << "input index is:" << inputIndex << std::endl;
  std::cout << "output index is:" << outputIndex << std::endl;
  // Create GPU buffers on device
  CHECK(cudaMalloc( & buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
  CHECK(cudaMalloc( & buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
  // Create stream
  cudaStream_t stream;
  CHECK(cudaStreamCreate( & stream));

  /*
      This part was added later.
  */

  cv::VideoCapture cap("/home/nvidia/Downloads/2.mp4");

  // if not success, exit program
  if (cap.isOpened() == false) {
    std::cout << "Cannot open the video file" << std::endl;
    std::cin.get(); //wait for any key press
    return -1;
  }

  //get the frame rate of the video
  double fps = cap.get(cv::CAP_PROP_FPS);
  std::cout << "Frames per second : " << fps << std::endl;

  std::string window_name = "My First Video";

  cv::namedWindow(window_name, cv::WINDOW_NORMAL); //create a window
  int n = 0;

  float flat_array[BATCH_SIZE * 3 * INPUT_H * INPUT_W];

  /*
      The added part ends here.
  */

  while (1) {
    // Run inference
    cap.grab();
    cv::Mat frame;
    bool bSuccess = cap.read(frame);

    //Breaking the while loop at the end of the video
    if (bSuccess == false) {
      std::cout << "Found the end of the video" << std::endl;
      break;
    }

    if (n % 1 == 0) {
      cv::Mat pr_img = preprocess_img(frame); // letterbox BGR to RGB
      int i, b = 0;
      cv::imshow("test array", pr_img);

      size_t sizeInBytes = pr_img.total() * pr_img.elemSize();
      std::cout << sizeInBytes << std::endl;
      std::cout << COUNT_OF(pr_img.data) << std::endl;

      //uchar* uc_pixel=nullptr;
      for (int row = 0; row < INPUT_H; ++row) {
        uchar * uc_pixel = pr_img.data + row * pr_img.step;
        //std::cout<<"__ROW "<<row<<std::endl;
        for (int col = 0; col < INPUT_W; ++col) {
          data[b * 3 * INPUT_H * INPUT_W + i] = (float) uc_pixel[2] / 255.0;
          data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float) uc_pixel[1] / 255.0;
          data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float) uc_pixel[0] / 255.0;
          uc_pixel += 3;
          //std::cout<<"INPUT_W "<<col<<"ROW "<<row<<std::endl;
          ++i;
        }
      }
      std::cout << "data_length" << COUNT_OF(data);

      // auto start = std::chrono::system_clock::now();
      doInference( * context, stream, buffers, data, prob, BATCH_SIZE);
      // auto end = std::chrono::system_clock::now();
      // std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
      // std::vector<std::vector<Yolo::Detection>> batch_res(1);
      // for (int b = 0; b < 1; b++) {
      //     auto& res = batch_res[b];
      //     nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
      // }

      // auto& res = batch_res[b];
      // //std::cout << res.size() << std::endl;
      // //cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]);
      // for (size_t j = 0; j < res.size(); j++) {
      //     cv::Rect r = get_rect(frame, res[j].bbox);
      //     cv::rectangle(frame, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
      //     cv::putText(frame, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
      // }

      cv::imshow(window_name, frame);
      if (cv::waitKey(10) == 27) {
        std::cout << "Esc key is pressed by user. Stoppig the video" << std::endl;
        break;
      }

      std::cout << "image processed" << std::endl;
    }
    n++;
  }

  // Release stream and buffers
  // cudaStreamDestroy(stream);
  // CHECK(cudaFree(buffers[inputIndex]));
  // CHECK(cudaFree(buffers[outputIndex]));
  // // Destroy the engine
  // context->destroy();
  // engine->destroy();
  // runtime->destroy();

  return 0;
}

The segmentation fault happens where the data buffer is fed into the doInference function while processing the video stream.
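
To narrow down where it crashes, I am thinking of instrumenting the pixel-packing loop with an explicit index guard before every write into data (just a sketch of the guard I have in mind, with the flattened index initialized to zero for each frame):

// Guard every write into data[] so an out-of-range index is reported
// instead of silently corrupting memory.
const int max_idx = BATCH_SIZE * 3 * INPUT_H * INPUT_W;
int i = 0, b = 0;
for (int row = 0; row < INPUT_H; ++row) {
  uchar * uc_pixel = pr_img.data + row * pr_img.step;
  for (int col = 0; col < INPUT_W; ++col) {
    int base = b * 3 * INPUT_H * INPUT_W + i;
    if (base + 2 * INPUT_H * INPUT_W >= max_idx) {
      std::cerr << "index out of range: " << base << std::endl;
      break;
    }
    data[base] = (float) uc_pixel[2] / 255.0;
    data[base + INPUT_H * INPUT_W] = (float) uc_pixel[1] / 255.0;
    data[base + 2 * INPUT_H * INPUT_W] = (float) uc_pixel[0] / 255.0;
    uc_pixel += 3;
    ++i;
  }
}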