Cuda failure: 4 while running TensorRT code on DRIVE Pegasus

Hi,

I keep getting the error in the title while running a TensorRT program on a DRIVE Pegasus.
The TensorRT version on the Pegasus is 5, and the code compiles successfully.

Whenever the code reaches "CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));", it raises the error "Cuda failure: 4" and aborts with a core dump.

I don't know what could possibly go wrong with this data copy, and I am new to both DRIVE and TensorRT.
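
In case it is relevant: I understand that errors from asynchronous CUDA work can surface at the next CUDA API call rather than at the call that actually failed, so the memcpy may only be reporting a problem from the kernels enqueued just before it. Below is a small check I was thinking of calling right after context.enqueue() in the code further down to narrow that down (just a sketch; checkStream is a name I made up, and it only uses cuda_runtime_api.h and iostream, which the file already includes):

// sketch: synchronize on the stream and report the first pending error,
// so a failure from the enqueued kernels shows up before the memcpy runs
static bool checkStream(cudaStream_t stream, const char* where)
{
    cudaError_t status = cudaStreamSynchronize(stream);
    if (status != cudaSuccess)
    {
        std::cout << where << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
}

I would call checkStream(stream, "after enqueue") between the enqueue and the cudaMemcpyAsync. Does that sound like a reasonable way to isolate where the failure really comes from?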

Also, there is a warning when compiling my plugins.cu file: "g++: warning: plugins.cu: linker input file unused because linking not done".
Could that be the cause of the error?
Unfortunately, I am not sure what causes this warning either.
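
From searching around, my guess is that g++ does not recognize the .cu extension as a source file and just forwards it to the (skipped) link step, which would mean plugins.cu is never actually compiled. Would the right fix be to compile it with nvcc and only do the final link with g++, roughly like this? (file names, include paths, and the library list here are placeholders loosely based on the TensorRT samples' Makefiles, not my exact setup)

# compile the CUDA plugin sources with nvcc instead of g++
nvcc -std=c++11 -c plugins.cu -o plugins.o -I/usr/local/cuda/include -I<TensorRT include dir>

# then link the objects with g++
g++ main.o plugins.o -o trt_infer -L/usr/local/cuda/lib64 -lcudart -lcudnn -lcublas -lnvinfer -lnvparsers -lnvinfer_plugin

Or is there a cleaner way to handle .cu files with the Pegasus toolchain?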

Any help, advice, or ideas would be appreciated!

Below is my TRT inference code:

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <memory>
#include <time.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <string.h>
#include <string>
#include <vector>
#include <algorithm>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvInferPlugin.h"
#include "common.h"

#include "all_plugin.h"

static Logger gLogger;
using namespace nvinfer1;
using namespace nvcaffeparser1;
using namespace plugin;
using namespace std;

static const int INPUT_C = 3;
static const int INPUT_H = 512; //720 509
static const int INPUT_W = 1024; //1280 905
static const int OUTPUT_SIZE = INPUT_H * INPUT_W;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "deconv6_0_0";
static int gUseDLACore{-1};

void caffeToGIEModel(const char* deployFile, // name for caffe prototxt
const char* modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
nvcaffeparser1::IPluginFactory* pluginFactory, // factory for plugin layers
IHostMemory** gieModelStream) // output stream for the GIE model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger);

// parse the caffe model to populate the network, then set the outputs
INetworkDefinition* network = builder->createNetwork();
ICaffeParser* parser = createCaffeParser();
parser->setPluginFactory(pluginFactory);

std::cout << "Begin parsing model..." << std::endl;
const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile, modelFile, *network, DataType::kFLOAT);
std::cout << "End parsing model..." << std::endl;
// specify which tensors are outputs
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str()));

// Build the engine
builder->setMaxBatchSize(maxBatchSize);
builder->setMaxWorkspaceSize(10 << 20); // we need about 6MB of scratch space for the plugin layer for batch size 5

samplesCommon::enableDLA(builder, gUseDLACore);

std::cout << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
std::cout << "End building engine..." << std::endl;

// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();

// serialize the engine, then close everything down
(*gieModelStream) = engine->serialize();

engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];

// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
std::cout << "d" << endl;
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
std::cout << "e" << endl;
cudaStreamSynchronize(stream);

// release the stream and the buffers
cudaStreamDestroy(stream);
std::cout << "a" << endl;
CHECK(cudaFree(buffers[inputIndex]));
std::cout << "b" << endl;
CHECK(cudaFree(buffers[outputIndex]));
std::cout << "c" << endl;
}

int main(int argc, char** argv)
{
gUseDLACore = samplesCommon::parseDLA(argc, argv);
// create a TensorRT model from the caffe model and serialize it to a stream
PluginFactory parserPluginFactory;
IHostMemory* trtModelStream{ nullptr };
initLibNvInferPlugins(&gLogger, "");

caffeToGIEModel("./enet_deploy_combine_trt_20190514_noDrop.prototxt", "./decoder_bn_drop_iter_100000.caffemodel", std::vector<std::string>{OUTPUT_BLOB_NAME}, 1, &parserPluginFactory, &trtModelStream);
assert(trtModelStream != nullptr);
parserPluginFactory.destroyPlugin();

// read input image
float* data = new float[INPUT_C * INPUT_H * INPUT_W];
for (int i=0; i<INPUT_C * INPUT_H * INPUT_W; i++) data[i] = 0.0;

// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gUseDLACore >= 0)
{
runtime->setDLACore(gUseDLACore);
}
PluginFactory pluginFactory;
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), &pluginFactory);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);

// run inference
float* outMAP = new float[OUTPUT_SIZE];
doInference(*context, data, outMAP, 1);
std::cout << "d" << endl;

// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

// Destroy plugins created by factory
pluginFactory.destroyPlugin();

//visualization

return 0;
}