Cuda failure: 4 while running TensorRT code on Pegasus


I keep getting the error shown in the title while running a TensorRT program on a Pegasus Drive.
The TensorRT version on Pegasus is 5, and the code compiles successfully.

Whenever the code reaches “CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));”, it raises an error “Cuda failure: 4 Aborted(core dump)”.

I don’t know what could possibly go wrong with this data copy function, and I am new to drive and TRT.

Also, there is a warning while compiling my files: “g++: warning: linker input file unused because linking not done”.
Could this warning be the cause of the error?
Unfortunately, I do not know what causes this warning either.

Any help or advice or idea would be helpful!

Below is my TRT inference code:

#include <assert.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>

#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>

#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"

#include "all_plugin.h"
#include "common.h"

// Logger instance shared by the builder, runtime, and plugin registry.
static Logger gLogger;
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace plugin;
using namespace std;

// Network input dimensions (CHW). Alternate resolutions kept for reference.
static const int INPUT_C = 3;
static const int INPUT_H = 512;  // 720, 509
static const int INPUT_W = 1024; // 1280, 905
// One float per pixel in the output segmentation map.
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;

// Blob names must match the prototxt exactly. NOTE: these literals must use
// plain ASCII quotes -- the "smart" quotes from the forum paste do not compile.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "deconv6_0_0";
// DLA core to run on; -1 means run on the GPU (set from the command line).
static int gUseDLACore{-1};

void caffeToGIEModel(const char* deployFile, // name for caffe prototxt
const char* modelFile, // name for model
const std::vectorstd::string& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with)
nvcaffeparser1::IPluginFactory* pluginFactory, // factory for plugin layers
IHostMemory *gieModelStream) // output stream for the GIE model
// create the builder
builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();

std::cout << "Begin parsing model..." << std::endl;
const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile, modelFile, *network, DataType::kFLOAT);
std::cout << "End parsing model..." << std::endl;
// specify which tensors are outputs
for (auto& s : outputs)

// Build the engine
builder->setMaxWorkspaceSize(10 << 20);	// we need about 6MB of scratch space for the plugin layer for batch size 5

samplesCommon::enableDLA(builder, gUseDLACore);

std::cout << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
std::cout << "End building engine..." << std::endl;

// we don't need the network any more, and we can destroy the parser

// serialize the engine, then close everything down
(*gieModelStream) = engine->serialize();



//!
//! \brief Run one inference pass: copy the input to the GPU, execute the
//!        engine, and copy the result back to the host.
//!
//! \param context   execution context of a deserialized engine
//! \param input     host buffer of batchSize * INPUT_C * INPUT_H * INPUT_W floats
//! \param output    host buffer of batchSize * OUTPUT_SIZE floats (filled here)
//! \param batchSize number of images in the batch
//!
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // The engine requires exactly IEngine::getNbBindings() buffer pointers;
    // in this case we know there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need the names of the input and output
    // tensors; the indices are guaranteed to be < IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // Create GPU buffers and a stream.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    // BUG FIX: the stream was declared but never created. Passing an
    // uninitialized cudaStream_t to cudaMemcpyAsync is what raised
    // "Cuda failure: 4" at the device-to-host copy.
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, DMA it back.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // The copies above are asynchronous: wait for completion before the
    // caller reads the output buffer.
    CHECK(cudaStreamSynchronize(stream));

    // Release the stream and the buffers.
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
gUseDLACore = samplesCommon::parseDLA(argc, argv);
// create a TensorRT model from the caffe model and serialize it to a stream
PluginFactory parserPluginFactory;
IHostMemory* trtModelStream{ nullptr };
initLibNvInferPlugins(&gLogger, “”);

caffeToGIEModel("./enet_deploy_combine_trt_20190514_noDrop.prototxt", "./decoder_bn_drop_iter_100000.caffemodel", std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, &parserPluginFactory, &trtModelStream);
assert(trtModelStream != nullptr);

// read input image
float* data = new float[INPUT_C * INPUT_H * INPUT_W];
for (int i=0; i<INPUT_C * INPUT_H * INPUT_W; i++)	data[i] = 0.0;

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gUseDLACore >= 0)
PluginFactory pluginFactory;
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), &pluginFactory);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);

// run inference
float* outMAP = new float[OUTPUT_SIZE];
doInference(*context, data, outMAP, 1);
std::cout << "d" <<endl;

// Destroy the engine

// Destroy plugins created by factory


return 0;