TensorRT execution fails without error

Hello,
I have this inference code:

void FaceNetClassifier::doInference(float* inputData, float* output) {
    // Per-image sizes: 3x160x160 float input, 128-float embedding output
    int size_of_single_input = 3 * 160 * 160 * sizeof(float);
    int size_of_single_output = 128 * sizeof(float);
    int inputIndex = m_engine->getBindingIndex("input");
    int outputIndex = m_engine->getBindingIndex("embeddings");

    void* buffers[2] = {nullptr, nullptr};

    // Allocate device buffers for one batch of inputs and outputs
    CHECK(cudaMalloc(&buffers[inputIndex], m_batchSize * size_of_single_input));
    CHECK(cudaMalloc(&buffers[outputIndex], m_batchSize * size_of_single_output));

    if (buffers[outputIndex] == nullptr) {
        std::cout << "Out of memory" << std::endl;
        //exit(1);
    } else {
        std::cout << "Memory allocated" << std::endl;
    }

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Copy input to the GPU, run inference, and copy the embeddings back
    std::cout << "fails here" << std::endl;
    CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, m_batchSize * size_of_single_input, cudaMemcpyHostToDevice, stream));
    std::cout << "copied data to GPU" << std::endl;
    m_context->enqueue(m_batchSize, &buffers[0], stream, nullptr);

    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], m_batchSize * size_of_single_output, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release the stream and the device buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
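
For completeness, CHECK is the usual CUDA error-checking macro from the TensorRT samples; a minimal sketch of roughly what it expands to (my real macro may differ slightly):

#include <cuda_runtime_api.h>
#include <cstdlib>
#include <iostream>

// Abort with a readable message if a CUDA runtime call returns an error
// (sketch in the style of the TensorRT sample code).
#define CHECK(status)                                              \
    do {                                                           \
        cudaError_t err = (status);                                \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) \
                      << " at " << __FILE__ << ":" << __LINE__     \
                      << std::endl;                                 \
            std::abort();                                          \
        }                                                          \
    } while (0)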

and below is the application output in Qt Creator:

facenet engine created/loaded: 
Parsing Directory: ../imgs
closed images Directory: 
before facenet: 
before inference
Memory allocated
fails here
The program has unexpectedly finished.
The process was ended forcefully.
/home/obed/Documents/qtprojects/build-SmartGate_QtApp-Desktop-Debug/SmartGate_QtApp crashed.

The code seems to fail at the cudaMemcpyAsync call, but I cannot tell why, because it does not even raise a CUDA error. What could I be doing wrong?

I am using TensorRT 6.0.1 and CUDA 10.0. The engine was generated successfully, but execution terminates at inference.
Please help, I am a beginner.

Hi,

You can use the “trtexec” command-line tool to check performance and locate possible bottlenecks. Its “--verbose” mode will help you debug the issue; an example invocation follows the links below.

Please find the links below for your reference:
https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#trtexec
https://github.com/NVIDIA/TensorRT/blob/release/6.0/samples/opensource/trtexec/README.md
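
For example, assuming your serialized engine has been saved to a file such as facenet.engine (a hypothetical path), you can exercise it outside your application with verbose logging:

trtexec --loadEngine=facenet.engine --verbose

If this standalone run succeeds, the engine itself is fine and the problem is likely in the surrounding application code, such as the host-side input buffer passed to cudaMemcpyAsync.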

Also, try setting the logger severity to INFO level to get detailed execution information. Please refer to the link below:
https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/tensorrt-700/tensorrt-developer-guide/index.html#initialize_library
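
A minimal sketch of such a logger against the TensorRT 6 ILogger interface (the class name and output stream here are my own choices, not from your code):

#include "NvInfer.h"
#include <iostream>

// Forward everything at INFO severity or worse to stdout.
// Lower Severity enum values are more severe, so the comparison
// passes kINTERNAL_ERROR, kERROR, kWARNING and kINFO.
class InfoLogger : public nvinfer1::ILogger {
public:
    void log(Severity severity, const char* msg) override {
        if (severity <= Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

// Usage: pass it wherever a logger is required, e.g. when
// deserializing the engine:
//   InfoLogger gLogger;
//   nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);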

Thanks