environment: nvcr.io/nvidia/tensorrt:23.10-py3
gpu-driver: 535
When I run the following code:
<<<
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import tensorrt as trt
from cuda import cudart
shape = [2, 3, 4, 5]
nContext = 2 # count of context
np.random.seed(31193)
np.set_printoptions(precision=3, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profileList = [builder.create_optimization_profile() for _ in range(nContext)]
config = builder.create_builder_config()
inputT0 = network.add_input("inputT0", trt.float32, [-1, -1, -1, -1])
inputT1 = network.add_input("inputT1", trt.float32, [-1, -1, -1, -1])
layer = network.add_elementwise(inputT0, inputT1, trt.ElementWiseOperation.SUM)
network.mark_output(layer.get_output(0))
for profile in profileList:
    profile.set_shape(inputT0.name, shape, shape, [k * nContext for k in shape])  # "* nContext" is just for this example, not required in real use case
    profile.set_shape(inputT1.name, shape, shape, [k * nContext for k in shape])
    config.add_optimization_profile(profile)
engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
nIO = engine.num_bindings
nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
nOutput = nIO - nInput
nIO, nInput, nOutput = nIO // nContext, nInput // nContext, nOutput // nContext
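# NOTE (my understanding): with nContext OptimizationProfile, the engine exposes nContext copies
# of the bindings, and profile i owns binding indices [nIO * i, nIO * (i + 1)), hence the division above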
streamList = [cudart.cudaStreamCreate()[1] for _ in range(nContext)]
contextList = [engine.create_execution_context() for index in range(nContext)]
# first inference
bufferH = []  # a list of buffers for all Context (all OptimizationProfile)
for index in range(nContext):
    stream = streamList[index]
    context = contextList[index]
    context.set_optimization_profile_async(index, stream)
    bindingPad = nIO * index  # skip the bindings occupied by the previous OptimizationProfile
    inputShape = [k * (index + 1) for k in shape]  # we use a different shape per context in this example, not required in a real use case
    context.set_binding_shape(bindingPad + 0, inputShape)
    context.set_binding_shape(bindingPad + 1, inputShape)
    print("Context%d binding all? %s" % (index, "Yes" if context.all_binding_shapes_specified else "No"))
    for i in range(nIO):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i))
    for i in range(nInput):
        bufferH.append(np.arange(np.prod(inputShape)).astype(np.float32).reshape(inputShape))
    for i in range(nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingPad + nInput + i), dtype=trt.nptype(engine.get_binding_dtype(bindingPad + nInput + i))))
bufferD = []
for i in range(len(bufferH)):
    bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
for index in range(nContext):
    print("Use Context %d" % index)
    stream = streamList[index]
    context = contextList[index]
    context.set_optimization_profile_async(index, stream)
    bindingPad = nIO * index
    inputShape = [k * (index + 1) for k in shape]
    context.set_binding_shape(bindingPad + 0, inputShape)
    context.set_binding_shape(bindingPad + 1, inputShape)
    for i in range(nIO * nContext):
        print("[%2d]%s->" % (i, "Input " if i < nInput else "Output"), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput):
        cudart.cudaMemcpyAsync(bufferD[bindingPad + i], bufferH[bindingPad + i].ctypes.data, bufferH[bindingPad + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    # split the binding list into three parts and pass int(0) for the bindings this context does not use
    bufferList = [int(0) for b in bufferD[:bindingPad]] + [int(b) for b in bufferD[bindingPad:(bindingPad + nInput + nOutput)]] + [int(0) for b in bufferD[(bindingPad + nInput + nOutput):]]
    context.execute_async_v2(bufferList, stream)
    for i in range(nOutput):
        cudart.cudaMemcpyAsync(bufferH[bindingPad + nInput + i].ctypes.data, bufferD[bindingPad + nInput + i], bufferH[bindingPad + nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
for index in range(nContext):
    cudart.cudaStreamSynchronize(streamList[index])
for index in range(nContext):
    bindingPad = nIO * index
    print("check result of context %d: %s" % (index, np.all(bufferH[bindingPad + 2] == bufferH[bindingPad + 0] + bufferH[bindingPad + 1])))
for stream in streamList:
    cudart.cudaStreamDestroy(stream)
for b in bufferD:
    cudart.cudaFree(b)
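# NOTE: the streams in streamList (and the device buffers in bufferD) are destroyed / freed here,
# yet streamList and contextList are used again by the second inference below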
# second inference
bufferH = []  # a list of buffers for all Context (all OptimizationProfile)
for index in range(nContext):
    stream = streamList[index]
    context = contextList[index]
    context.set_optimization_profile_async(index, stream)
    bindingPad = nIO * index  # skip the bindings occupied by the previous OptimizationProfile
    inputShape = [k * (index + 1) for k in shape]  # we use a different shape per context in this example, not required in a real use case
    context.set_binding_shape(bindingPad + 0, inputShape)
    context.set_binding_shape(bindingPad + 1, inputShape)
    print("Context%d binding all? %s" % (index, "Yes" if context.all_binding_shapes_specified else "No"))
    for i in range(nIO):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i))
    for i in range(nInput):
        bufferH.append(np.arange(np.prod(inputShape)).astype(np.float32).reshape(inputShape))
    for i in range(nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingPad + nInput + i), dtype=trt.nptype(engine.get_binding_dtype(bindingPad + nInput + i))))
bufferD = []
for i in range(len(bufferH)):
    bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
for index in range(nContext):
    print("Use Context %d" % index)
    stream = streamList[index]
    context = contextList[index]
    context.set_optimization_profile_async(index, stream)
    bindingPad = nIO * index
    inputShape = [k * (index + 1) for k in shape]
    context.set_binding_shape(bindingPad + 0, inputShape)
    context.set_binding_shape(bindingPad + 1, inputShape)
    for i in range(nIO * nContext):
        print("[%2d]%s->" % (i, "Input " if i < nInput else "Output"), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput):
        cudart.cudaMemcpyAsync(bufferD[bindingPad + i], bufferH[bindingPad + i].ctypes.data, bufferH[bindingPad + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    # split the binding list into three parts and pass int(0) for the bindings this context does not use
    bufferList = [int(0) for b in bufferD[:bindingPad]] + [int(b) for b in bufferD[bindingPad:(bindingPad + nInput + nOutput)]] + [int(0) for b in bufferD[(bindingPad + nInput + nOutput):]]
    context.execute_async_v2(bufferList, stream)
    for i in range(nOutput):
        cudart.cudaMemcpyAsync(bufferH[bindingPad + nInput + i].ctypes.data, bufferD[bindingPad + nInput + i], bufferH[bindingPad + nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
for index in range(nContext):
    cudart.cudaStreamSynchronize(streamList[index])
for index in range(nContext):
    bindingPad = nIO * index
    print("check result of context %d: %s" % (index, np.all(bufferH[bindingPad + 2] == bufferH[bindingPad + 0] + bufferH[bindingPad + 1])))
for stream in streamList:
    cudart.cudaStreamDestroy(stream)
for b in bufferD:
    cudart.cudaFree(b)
<<<
it raises: Segmentation fault (core dumped)
Does this mean that multiple contexts can only run inference once? If the model is deployed as a server, how can multiple contexts be used to execute multiple requests?
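For clarity, the pattern below is roughly what I am trying to achieve: create the streams and contexts once at server start-up, then reuse them for every incoming request. This is only a sketch; `serve_request` is a hypothetical helper of mine, and it assumes `engine`, `nContext` and `nIO` have been prepared exactly as in the script above, with the same binding-based API:
<<<
import numpy as np
import tensorrt as trt
from cuda import cudart

# created ONCE at server start-up and reused; never destroyed between requests
streamList = [cudart.cudaStreamCreate()[1] for _ in range(nContext)]
contextList = [engine.create_execution_context() for _ in range(nContext)]

def serve_request(index, inputH0, inputH1):
    """Run one request on context `index` (hypothetical helper)."""
    stream = streamList[index]
    context = contextList[index]
    context.set_optimization_profile_async(index, stream)
    bindingPad = nIO * index
    context.set_binding_shape(bindingPad + 0, inputH0.shape)
    context.set_binding_shape(bindingPad + 1, inputH1.shape)
    outputH = np.empty(context.get_binding_shape(bindingPad + 2), dtype=np.float32)  # float32 for this network
    # per-request device buffers for brevity; a real server would allocate them
    # once with the profile's maximum size and reuse them as well
    bufferD = [cudart.cudaMalloc(a.nbytes)[1] for a in (inputH0, inputH1, outputH)]
    # pass int(0) for the bindings of the other OptimizationProfile, as in the script above
    bufferList = [int(0)] * bindingPad + [int(b) for b in bufferD] + [int(0)] * (nIO * nContext - bindingPad - len(bufferD))
    cudart.cudaMemcpyAsync(bufferD[0], inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    cudart.cudaMemcpyAsync(bufferD[1], inputH1.ctypes.data, inputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2(bufferList, stream)
    cudart.cudaMemcpyAsync(outputH.ctypes.data, bufferD[2], outputH.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    for b in bufferD:
        cudart.cudaFree(b)
    return outputH

# many requests served by the same two contexts:
for request in range(4):
    a = np.ones([2, 3, 4, 5], dtype=np.float32)
    print(serve_request(request % nContext, a, a).mean())  # expect 2.0 every time
<<<
Is keeping the streams and contexts alive like this the intended way, or is something else required to run a multi-context engine more than once?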