Why does my inference function output more than 200 results when batch_size is 1, but far fewer as I increase batch_size? With batch_size set to 128 I get only a single result, and notably that single result is identical to the first result produced when batch_size is 1.

This is the inference code:
import numpy as np
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context pycuda needs

def slide_batches_inference(image_array_batches: np.ndarray, img_size) -> torch.Tensor:
    """Run inference on one small batch of sliding-window images.

    Args:
        image_array_batches (np.ndarray): images of shape [Batch, 1, img_size, img_size],
            e.g. (128, 1, 1024, 1024)
        img_size: image size the model infers at

    Returns:
        preds (torch.Tensor): raw model output; for YOLOv8 OBB this is
            [Batch, 20, 21504], which is then passed to NMS post-processing
    """
    preprocessed_image_array_batches = img_batches_preprocess(image_array_batches, normalization=True)
    inputs = preprocessed_image_array_batches  # data for the current batch, (batch_size, 1, 1024, 1024)

    logger = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(logger)
    with open("/JHC/yolov8m_planeship_hbb.plan", "rb") as f:
        model_data = f.read()
    engine = runtime.deserialize_cuda_engine(model_data)

    # Create the execution context
    context = engine.create_execution_context()

    # Define the input and output sizes
    batch_size = image_array_batches.shape[0]
    input_size = [batch_size, 1, 1024, 1024]
    output_size = [batch_size, 19, 21504]

    # Compute the buffer sizes
    input_buffer_size = int(np.prod(input_size) * np.float32().nbytes)
    output_buffer_size = int(np.prod(output_size) * np.float32().nbytes)

    # Allocate the device buffers
    input_buffer = cuda.mem_alloc(input_buffer_size)
    output_buffer = cuda.mem_alloc(output_buffer_size)

    # Get the names of the input and output tensors
    input_name = engine.get_tensor_name(0)   # assumes the input tensor is the first binding
    output_name = engine.get_tensor_name(1)  # assumes the output tensor is the second binding

    # Set the tensor addresses
    context.set_tensor_address(input_name, int(input_buffer))
    context.set_tensor_address(output_name, int(output_buffer))

    # Create a CUDA stream
    stream = cuda.Stream()

    # Asynchronously copy the input data from host to device (cudaMemcpyAsync)
    # input_data = np.random.rand(*input_size).astype(np.float32)
    # input_data = inputs[0].ravel()
    input_data = np.ascontiguousarray(inputs, dtype=np.float32)  # contiguous float32 so the byte count matches the buffer
    cuda.memcpy_htod_async(input_buffer, input_data, stream)

    # Run inference
    context.execute_async_v3(stream_handle=stream.handle)

    # Fetch the output data
    outputs = np.empty(output_size, dtype=np.float32)
    cuda.memcpy_dtoh_async(outputs, output_buffer, stream)
    stream.synchronize()
    preds = torch.from_numpy(outputs)
    return preds
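For context, this is roughly how I drive the function and arrive at the result counts above. It is a simplified sketch: the windows array, the thresholds, and the count_detections helper are illustrative stand-ins for my real sliding-window slicing and NMS post-processing, not the exact code.

import numpy as np
import torch
from torchvision.ops import nms

def count_detections(preds: torch.Tensor, conf_thres: float = 0.25, iou_thres: float = 0.45) -> int:
    # preds: [Batch, 19, 21504]; per anchor: 4 box coords (xywh) + 15 class scores
    # (assuming a plain YOLOv8 detection head for this 19-channel HBB model)
    total = 0
    for p in preds:                      # p: [19, 21504]
        p = p.T                          # [21504, 19]
        boxes_xywh = p[:, :4]
        scores, _ = p[:, 4:].max(dim=1)  # best class score per anchor
        keep = scores > conf_thres
        boxes_xywh, scores = boxes_xywh[keep], scores[keep]
        xy, half_wh = boxes_xywh[:, :2], boxes_xywh[:, 2:] / 2
        boxes_xyxy = torch.cat([xy - half_wh, xy + half_wh], dim=1)  # xywh -> xyxy for NMS
        total += len(nms(boxes_xyxy, scores, iou_thres))
    return total

# windows stands in for the real sliding-window crops, shape [N, 1, 1024, 1024]
windows = np.random.rand(256, 1, 1024, 1024).astype(np.float32)

for batch_size in (1, 8, 128):
    total = 0
    for start in range(0, len(windows), batch_size):
        batch = windows[start:start + batch_size]
        preds = slide_batches_inference(batch, img_size=1024)
        total += count_detections(preds)
    # with batch_size=1 I see 200+ detections in total; with batch_size=128 only 1
    print(batch_size, total)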
Thank you very much!!!!