Segmentation fault (core dumped) when using multiple contexts and multiple streams in TensorRT

Environment: nvcr.io/nvidia/tensorrt:23.10-py3
GPU driver: 535
When I run the code below:
<<<

Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.

Licensed under the Apache License, Version 2.0 (the “License”);

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an “AS IS” BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

import numpy as np
import tensorrt as trt
from cuda import cudart

# Setup for the reproduction script: builds a two-input elementwise-SUM
# network with dynamic shapes, one optimization profile per context, then
# creates one CUDA stream and one execution context per profile.
# NOTE(review): string literals in this listing use typographic quotes
# (“ ”) and loop bodies have lost their indentation — presumably an artifact
# of pasting into the forum; restore ASCII quotes and indentation before
# running.
shape = [2, 3, 4, 5]
nContext = 2 # count of context
np.random.seed(31193)
np.set_printoptions(precision=3, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
# One optimization profile is created for each of the nContext contexts.
profileList = [builder.create_optimization_profile() for _ in range(nContext)]
config = builder.create_builder_config()

# Two float32 inputs with all four dimensions dynamic (-1); their elementwise
# sum is the network's only output.
inputT0 = network.add_input(“inputT0”, trt.float32, [-1, -1, -1, -1])
inputT1 = network.add_input(“inputT1”, trt.float32, [-1, -1, -1, -1])
layer = network.add_elementwise(inputT0, inputT1, trt.ElementWiseOperation.SUM)
network.mark_output(layer.get_output(0))

# Every profile gets the same ranges for both inputs: `shape` for the first
# two shape arguments and `shape` scaled by nContext for the last one
# (presumably min, opt, max — confirm against the set_shape documentation).
for profile in profileList:
profile.set_shape(inputT0.name, shape, shape, [k * nContext for k in shape]) # “* nContext” is just for this example, not required in real use case
profile.set_shape(inputT1.name, shape, shape, [k * nContext for k in shape])
config.add_optimization_profile(profile)

engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
# Engine-wide binding counts are divided by nContext below to obtain the
# per-profile counts used to compute `bindingPad` offsets later on.
nIO = engine.num_bindings
nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
nOutput = nIO - nInput
nIO, nInput, nOutput = nIO // nContext, nInput // nContext, nOutput // nContext

# One stream and one execution context per profile index. Element [1] of the
# cudaStreamCreate() result is kept as the stream handle (element [0] is
# presumably the CUDA error code — it is not checked here).
streamList = [cudart.cudaStreamCreate()[1] for _ in range(nContext)]
contextList = [engine.create_execution_context() for index in range(nContext)]

first inference

# First inference pass: prepare host buffers for every context, allocate
# matching device buffers, enqueue copy-in / execute / copy-out per context on
# its own stream, check the results, then destroy the streams and free the
# device buffers.
# NOTE(review): the right-hand side of the assignment below is missing —
# presumably `[]` was lost in the paste; `.append` is called on bufferH later.
bufferH = # a list of buffers for all Context (all OptimizationProfile)
for index in range(nContext):
stream = streamList[index]
context = contextList[index]
context.set_optimization_profile_async(index, stream)
bindingPad = nIO * index # skip bindings of previous OptimizationProfile occupied
inputShape = [k * (index + 1) for k in shape] # we use different shape for various context in this example, not required in real use case
# Bindings bindingPad + 0 and bindingPad + 1 are the two inputs of this profile.
context.set_binding_shape(bindingPad + 0, inputShape)
context.set_binding_shape(bindingPad + 1, inputShape)
print(“Context%d binding all? %s” % (index, “Yes” if context.all_binding_shapes_specified else “No”))
for i in range(nIO):
print(i, "Input " if engine.binding_is_input(i) else “Output”, engine.get_binding_shape(i), context.get_binding_shape(i))
# Host inputs are filled with 0..N-1 as float32; host outputs are left
# uninitialised (np.empty) with the shape and dtype reported for the binding.
for i in range(nInput):
bufferH.append(np.arange(np.prod(inputShape)).astype(np.float32).reshape(inputShape))
for i in range(nOutput):
bufferH.append(np.empty(context.get_binding_shape(bindingPad + nInput + i), dtype=trt.nptype(engine.get_binding_dtype(bindingPad + nInput + i))))

# NOTE(review): right-hand side missing here as well — presumably `[]`.
# One device allocation is made per host buffer, with the same byte size.
bufferD =
for i in range(len(bufferH)):
bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

for index in range(nContext):
print(“Use Context %d” % index)
stream = streamList[index]
context = contextList[index]
context.set_optimization_profile_async(index, stream)
bindingPad = nIO * index
inputShape = [k * (index + 1) for k in shape]
context.set_binding_shape(bindingPad + 0, inputShape)
context.set_binding_shape(bindingPad + 1, inputShape)
# NOTE(review): the label test `i < nInput` marks only the first nInput
# indices as inputs even though i ranges over the bindings of all profiles.
for i in range(nIO * nContext):
print(“[%2d]%s->” % (i, "Input " if i < nInput else “Output”), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
for i in range(nInput):
cudart.cudaMemcpyAsync(bufferD[bindingPad + i], bufferH[bindingPad + i].ctypes.data, bufferH[bindingPad + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)

bufferList = [int(0) for b in bufferD[:bindingPad]] + [int(b) for b in bufferD[bindingPad:(bindingPad + nInput + nOutput)]] + [int(0) for b in bufferD[(bindingPad + nInput + nOutput):]]
# divide the buffers into three parts, and fill int(0) for the parts beside the buffer of this context uses

context.execute_async_v2(bufferList, stream)

for i in range(nOutput):
    cudart.cudaMemcpyAsync(bufferH[bindingPad + nInput + i].ctypes.data, bufferD[bindingPad + nInput + i], bufferH[bindingPad + nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)

# NOTE(review): `stream` is not rebound inside this loop, so it synchronizes
# whichever stream the previous loop assigned last, nContext times, rather
# than each entry of streamList.
for index in range(nContext):
cudart.cudaStreamSynchronize(stream)

# The check indexes +0 / +1 as the inputs and +2 as the output of each
# profile, matching the two-input, one-output network built above.
for index in range(nContext):
bindingPad = nIO * index
print(“check result of context %d: %s” % (index, np.all(bufferH[bindingPad + 2] == bufferH[bindingPad + 0] + bufferH[bindingPad + 1])))

# NOTE(review): every stream in streamList is destroyed here, but the list is
# not rebuilt and the second inference further down indexes streamList again.
# Using destroyed stream handles is the likely cause of the reported
# segmentation fault — TODO confirm.
for stream in streamList:
cudart.cudaStreamDestroy(stream)

for b in bufferD:
cudart.cudaFree(b)

second inference

# Second inference pass: repeats the same steps as the first pass (host
# buffers, device buffers, per-context enqueue, result check, cleanup) using
# the same contextList and streamList.
# NOTE(review): the right-hand side of the assignment below is missing —
# presumably `[]` was lost in the paste; `.append` is called on bufferH later.
bufferH = # a list of buffers for all Context (all OptimizationProfile)
for index in range(nContext):
# NOTE(review): streamList still holds the handles that were passed to
# cudaStreamDestroy at the end of the first inference; nothing recreates them
# before this use. This is the likely source of the segmentation fault —
# TODO confirm (e.g. by creating fresh streams or deferring the destroy).
stream = streamList[index]
context = contextList[index]
context.set_optimization_profile_async(index, stream)
bindingPad = nIO * index # skip bindings of previous OptimizationProfile occupied
inputShape = [k * (index + 1) for k in shape] # we use different shape for various context in this example, not required in real use case
context.set_binding_shape(bindingPad + 0, inputShape)
context.set_binding_shape(bindingPad + 1, inputShape)
print(“Context%d binding all? %s” % (index, “Yes” if context.all_binding_shapes_specified else “No”))
for i in range(nIO):
print(i, "Input " if engine.binding_is_input(i) else “Output”, engine.get_binding_shape(i), context.get_binding_shape(i))
# Host inputs are filled with 0..N-1 as float32; host outputs are left
# uninitialised (np.empty) with the shape and dtype reported for the binding.
for i in range(nInput):
bufferH.append(np.arange(np.prod(inputShape)).astype(np.float32).reshape(inputShape))
for i in range(nOutput):
bufferH.append(np.empty(context.get_binding_shape(bindingPad + nInput + i), dtype=trt.nptype(engine.get_binding_dtype(bindingPad + nInput + i))))

# NOTE(review): right-hand side missing here as well — presumably `[]`.
# Device buffers are allocated anew, one per host buffer.
bufferD =
for i in range(len(bufferH)):
bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

for index in range(nContext):
print(“Use Context %d” % index)
stream = streamList[index]
context = contextList[index]
context.set_optimization_profile_async(index, stream)
bindingPad = nIO * index
inputShape = [k * (index + 1) for k in shape]
context.set_binding_shape(bindingPad + 0, inputShape)
context.set_binding_shape(bindingPad + 1, inputShape)
for i in range(nIO * nContext):
print(“[%2d]%s->” % (i, "Input " if i < nInput else “Output”), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
for i in range(nInput):
cudart.cudaMemcpyAsync(bufferD[bindingPad + i], bufferH[bindingPad + i].ctypes.data, bufferH[bindingPad + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)

bufferList = [int(0) for b in bufferD[:bindingPad]] + [int(b) for b in bufferD[bindingPad:(bindingPad + nInput + nOutput)]] + [int(0) for b in bufferD[(bindingPad + nInput + nOutput):]]
# divide the buffers into three parts, and fill int(0) for the parts beside the buffer of this context uses

context.execute_async_v2(bufferList, stream)

for i in range(nOutput):
    cudart.cudaMemcpyAsync(bufferH[bindingPad + nInput + i].ctypes.data, bufferD[bindingPad + nInput + i], bufferH[bindingPad + nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)

# NOTE(review): `stream` is not rebound inside this loop, so the same stream
# is synchronized nContext times instead of each entry of streamList.
for index in range(nContext):
cudart.cudaStreamSynchronize(stream)

# The check indexes +0 / +1 as the inputs and +2 as the output of each profile.
for index in range(nContext):
bindingPad = nIO * index
print(“check result of context %d: %s” % (index, np.all(bufferH[bindingPad + 2] == bufferH[bindingPad + 0] + bufferH[bindingPad + 1])))

# Final cleanup: destroy every stream in streamList and free every device
# buffer allocated for this pass.
for stream in streamList:
cudart.cudaStreamDestroy(stream)

for b in bufferD:
cudart.cudaFree(b)
<<<

It raises the error: Segmentation fault (core dumped).
Does this mean that multiple contexts can only run inference once? If the model is deployed as a server, how can I use multiple contexts to execute multiple requests?

Hi @scumierjia ,
Could you please share the ONNX model and supporting files so that we can debug the issue further?

Thanks

Thanks for your reply, but I don’t have an ONNX model. I just use the code below; my goal is to run inference multiple times, but it raises an error.
main.py.zip (1.8 KB)