enqueue error

Hi, I ran into a bug in trtexec.cpp.

// Runs the timed inference benchmark on `engine`.
//
// Creates an execution context, obtains one device buffer per name listed in
// gParams.inputs and gParams.outputs through createMemory(), and then runs
// gParams.iterations rounds of gParams.avgRuns executions each. For every
// round the average GPU time (CUDA events), the average host wall time, their
// standard deviations and the gParams.pct percentile are printed to stdout and
// appended to the file opened by initiateFile(). When
// gParams.benchmarkLayers is set, the per-layer profiler report is written to
// both the file and stdout.
//
// Any failing CUDA runtime call inside the timing loop is reported through
// CHECK; a failed execute() stops the benchmark instead of accumulating
// meaningless timings.
void doInference(ICudaEngine& engine)
{
    IExecutionContext* context = engine.createExecutionContext();
    if (context == nullptr)
    {
        std::cerr << "Failed to create execution context" << std::endl;
        return;
    }
    std::cout<<"context created "<<std::endl;

    SimpleProfiler profiler("Profiler");
    if (gParams.benchmarkLayers)
    {
        context->setProfiler(&profiler);
    }

    // Binding pointers handed to the engine. The engine reads exactly
    // getNbBindings() entries from this array, so every one of them must be
    // present and non-null.
    std::vector<void*> buffers(gParams.inputs.size() + gParams.outputs.size());
    for (size_t i = 0; i < gParams.inputs.size(); i++)
        createMemory(engine, buffers, gParams.inputs[i]);

    for (size_t i = 0; i < gParams.outputs.size(); i++)
        createMemory(engine, buffers, gParams.outputs[i]);

    // Reject binding arrays that would make the engine dereference a missing
    // or null pointer (reported by CUDA as an illegal memory access).
    const size_t nbBindings = static_cast<size_t>(engine.getNbBindings());
    bool bindingsOk = nbBindings <= buffers.size();
    for (size_t i = 0; bindingsOk && i < nbBindings; i++)
        bindingsOk = (buffers[i] != nullptr);
    if (!bindingsOk)
    {
        std::cerr << "Engine expects " << nbBindings << " non-null bindings but only "
                  << buffers.size() << " buffers were prepared from the given inputs/outputs" << std::endl;
        context->destroy();
        return;
    }

    std::cout<<"memory create is completed"<<std::endl;
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    cudaEvent_t start, end;
    unsigned int cudaEventFlags = gParams.useSpinWait ? cudaEventDefault : cudaEventBlockingSync;
    CHECK(cudaEventCreateWithFlags(&start, cudaEventFlags));
    CHECK(cudaEventCreateWithFlags(&end, cudaEventFlags));

    std::ofstream file;
    initiateFile(file);

    std::vector<float> timesGpu(gParams.avgRuns);
    std::vector<float> timesHost(gParams.avgRuns);
    bool ok = true;
    for (int j = 0; ok && j < gParams.iterations; j++)
    {
        float totalGpu{0}, totalHost{0}; // GPU and Host timers
        for (int i = 0; i < gParams.avgRuns; i++)
        {
            auto tStart = std::chrono::high_resolution_clock::now();
            CHECK(cudaEventRecord(start, stream));
            std::cout<<"enqueue starts"<<std::endl;
            // NOTE(review): execute() takes no stream argument, while the
            // timing events are recorded on `stream`; confirm that the events
            // really bracket the inference work, or switch back to
            // enqueue(batchSize, bindings, stream, nullptr).
            ok = context->execute(gParams.batchSize, buffers.data());
            std::cout<<"enqueue ended here"<<std::endl;
            if (!ok)
            {
                std::cerr << "Inference execution failed; aborting benchmark" << std::endl;
                break;
            }
            CHECK(cudaEventRecord(end, stream));
            CHECK(cudaEventSynchronize(end));

            auto tEnd = std::chrono::high_resolution_clock::now();
            timesHost[i] = std::chrono::duration<float, std::milli>(tEnd - tStart).count();
            totalHost += timesHost[i];
            float ms{0};
            CHECK(cudaEventElapsedTime(&ms, start, end));
            timesGpu[i] = ms;
            totalGpu += ms;
        }
        if (!ok)
            break; // do not report statistics for an incomplete round

        totalGpu /= gParams.avgRuns;
        totalHost /= gParams.avgRuns;
        std::cout << "Average over " << gParams.avgRuns << " runs is " << totalGpu << " ms (host walltime is " << totalHost
                  << " ms, " << static_cast<int>(gParams.pct) << "% percentile time is " << percentile(gParams.pct, timesGpu) << ")." << std::endl;

        // calculate standard deviation
        float stdGpu = stDev (timesGpu, totalGpu);
        float stdHost = stDev (timesHost, totalHost);

        file << totalGpu << "\t" << stdGpu << "\t" << totalHost << "\t" << stdHost << "\t" << percentile(gParams.pct, timesGpu) << "\n";
    }
    file << "\n";

    if (gParams.benchmarkLayers)
    {
        file << profiler;
        std::cout << profiler;
    }

    file.close();

    // Best-effort teardown: return codes are deliberately not checked here so
    // that cleanup still proceeds after an earlier failure.
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    // NOTE(review): the device buffers obtained through createMemory() are not
    // released in this function; if createMemory() allocates them with
    // cudaMalloc they should be cudaFree'd here. Confirm against createMemory().

    context->destroy();
}

I keep getting the following error:

enqueue starts
engine.cpp (713) - Cuda Error in execute: 700 (an illegal memory access was encountered)
engine.cpp (713) - Cuda Error in execute: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
Average over 6 runs is 171.985 ms (host walltime is 191.573 ms, 99% percentile time is 1031.91).
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
Average over 6 runs is 4.2514e-41 ms (host walltime is 0.0303392 ms, 99% percentile time is 7.94116e-41).
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
enqueue starts
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
cuda/genericReformat.cu (1259) - Cuda Error in executeMemcpy: 700 (an illegal memory access was encountered)
enqueue ended here
Average over 6 runs is 2.39664e-41 ms (host walltime is 0.0171033 ms, 99% percentile time is 4.52213e-41).
Parameter check failed at: engine.cpp::terminateCommonContext::192, condition: cudaEventDestroy(context.start) failure.
Parameter check failed at: engine.cpp::terminateCommonContext::197, condition: cudaEventDestroy(context.stop) failure.
runtime.cpp (31) - Cuda Error in free: 700 (an illegal memory access was encountered)
terminate called after throwing an instance of 'nvinfer1::CudaError'
  what():  std::exception
Abgebrochen (Speicherabzug geschrieben)

Is there anyone who can give me some hints?
Thanks in advance!

Hi,
Can you provide the following information so we can better help?
Provide details on the platforms you are using:
o Linux distro and version
o GPU type
o Nvidia driver version
o CUDA version
o CUDNN version
o Python version [if using python]
o Tensorflow version
o TensorRT version
o If Jetson, OS, hw versions

Also, if possible please share the script & model file to reproduce the issue.

Thanks

Hi, I have solved this problem, but I have run into a new problem when using a plugin layer.

--------------- Timing upsample(19)
Tactic 0 is the only option, timing skipped
Formats and tactics selection completed in 11.8396 seconds.
After reformat layers: 6 layers
Block size 3145728000
Block size 213811200
Block size 21483008
Block size 13363200
Total Activation Memory: 3394385408
Detected 1 input and 1 output network tensors.
supportsFormat=== type:0format0
type 0format 0
configureWithFormat:30 240 464
Data initialization and engine generation completed in 0.804268 seconds.
Calculating Maxima
Calibrating with batch 0
Tensor segmentation_type_1_30/BiasAdd is uniformly zero; network calibration failed.
Calibration completed in 20.7948 seconds.
[INT8 Quantization] INT8 Inference Tensor Scales: input_1 [0.00787594]
[INT8 Quantization] INT8 Inference Tensor Scales: (Unnamed Layer* 1) [Padding]_output [0.00787594]
[INT8 Quantization] INT8 Inference Tensor Scales: activation_1/Relu [0.000114801]
[INT8 Quantization] INT8 Inference Tensor Scales: max_pooling2d_1/MaxPool [0.000118674]
[INT8 Quantization] INT8 Inference Tensor Scales: segmentation_type_1_30/BiasAdd [9.38656e-10]
[INT8 Quantization] INT8 Inference Tensor Scales: upsample_HL_1804289383 [0.000114801]
[INT8 Quantization] INT8 Inference Tensor Scales: upsample [0.000114801]
Original: 20 layers
After dead-layer removal: 9 layers
Fusing convolution weights from conv0/convolution with scale conv0_bn/FusedBatchNorm_1
Fusing convolution weights from segmentation_type_1_30/convolution with scale segmentation_type_1_30/BiasAdd
After scale fusion: 7 layers
Fusing conv0/convolution with activation_1/Relu
After vertical fusions: 6 layers
After swap: 6 layers
After final dead-layer removal: 6 layers
After tensor merging: 6 layers
After concat removal: 6 layers
[INT8 Quantization] Writing Calibration Cache for calibrator: TRT-5101-EntropyCalibration
Configuring builder for Int8 Mode completed in 20.8078 seconds.
Graph construction and optimization completed in 20.8104 seconds.
supportsFormat=== type:0format0
supportsFormat=== type:1format0
supportsFormat=== type:1format1
supportsFormat=== type:1format2
supportsFormat=== type:3format0
supportsFormat=== type:0format0
supportsFormat=== type:1format0
supportsFormat=== type:1format1
supportsFormat=== type:1format2
supportsFormat=== type:3format0

--------------- Timing <reformat>(9)
../builder/cudnnBuilderUtils.cpp (253) - Cuda Error in findFastestTactic: 9 (invalid configuration argument)
../builder/cudnnBuilderUtils.cpp (253) - Cuda Error in findFastestTactic: 9 (invalid configuration argument)
could not build engine
Engine could not be created
Engine could not be created

@SunilJB

Linux: Ubuntu 18.04
Platform: Nvidia Tegra Xavier
CUDA: 10.1
CUDNN: 7.5
python: 2.7
Tensorflow: 1.11
TensorRT: 5.15

I have uploaded the code and the model, please have a look.
https://drive.google.com/file/d/1pzBAKnpMaplZgfBXdxv-eTQb6tmUgR58/view?usp=sharing

Hope to get your feedback!
Thanks in advance.

Forgot to mention, that this error occurs when using int8 quantization…

Hi,

The code and model seem to be working with TRT 6.0 on a desktop GPU.
Could you please try to run with TRT 6.0 version?

Thanks

Hi, thank you for your reply.
Because of some limitations, TensorRT 6 is not available on my platform.
Since TensorRT6 supports ResizeBilinear operation, my plugin implementation makes no sense on TensorRT6.
Could you please check my code in TensorRT5 on your desktop GPU?
I would appreciate any feedback from you.
Thanks in advance!

Bests.

Hi,

UFF parser doesn’t support ResizeBilinear op in TRT 6.
You have to create either a custom plugin or use ONNX parser for TRT optimization.

Also, JetPack 4.3 Developer Preview, which is packaged with TRT6 is available as a beta. https://developer.nvidia.com/jetpack-4_3_DP
Stay tuned for official production release of Jetpack 4.3.

Thanks

Hi,

I think you misunderstand my problem.

The custom plugin layer is finished and can recognize the ResizeBilinear operation in the model.

The thing is, I can’t get INT8 quantization to work with the plugin layer; without quantization it works properly, or at least produces no error message.

And also TRT 6 is not available for my situation…