Description
I tried DLA-based model inference on a Jetson AGX Orin Developer Kit, and it was much slower than non-DLA (GPU-only) inference. I tested both the PointPillars VFE model and the model_bn model from the jetson_dla_tutorial repository.
The steps I followed are shown below.
I also have an additional question: for GPU+DLA model inference, do I need to make any special modifications to the inference code?
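The only DLA-specific step I am aware of on the inference side is selecting the DLA core on the IRuntime before deserializing the engine; I am not sure whether anything else is required, so the snippet below (using the same variables as the inference code further down) is only my assumption:

// Assumption: the DLA core is selected on the runtime before deserialization;
// the core index should match the one the engine was built for (--useDLACore=0).
IRuntime* runtime = createInferRuntime(gLogger);
runtime->setDLACore(0);
ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), engineSize);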
Model conversion:
DLA
/usr/src/tensorrt/bin/trtexec --onnx=model_bn.onnx \
    --shapes=input:8x3x640x640 \
    --saveEngine=model_bn_.engine \
    --exportProfile=model_bn_.json \
    --int8 --useDLACore=0 --allowGPUFallback --useSpinWait --separateProfileRun --verbose > model_bn_.log
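GPU (for comparison; this is my reconstruction of the baseline build command without the DLA flags, and the model_bn_gpu.* file names are placeholders):
/usr/src/tensorrt/bin/trtexec --onnx=model_bn.onnx \
    --shapes=input:8x3x640x640 \
    --saveEngine=model_bn_gpu.engine \
    --exportProfile=model_bn_gpu.json \
    --int8 --useSpinWait --separateProfileRun --verbose > model_bn_gpu.log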
Environment
System environment
Device: AGX Orin Developer Kit
JetPack version: 6.0
CUDA: 12.2
TensorRT: 8.6.2
Model inference code
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvInferPlugin.h>

using namespace nvinfer1;

// Minimal logger required by the TensorRT runtime
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

int main(int argc, char** argv)
{
    // Parameters
    const char* enginePath = "/project/ansy_dla/model_bn_.engine"; // engine file path
    const int batchSize = 8;              // batch size
    const int inputSize = 3 * 640 * 640;  // input size per sample
    const int outputSize = 10;            // output size per sample

    // 1. Load the TensorRT engine
    initLibNvInferPlugins(&gLogger, "");
    std::ifstream engineFile(enginePath, std::ios::binary);
    if (!engineFile.good()) {
        std::cerr << "Failed to open engine file: " << enginePath << std::endl;
        return -1;
    }
    engineFile.seekg(0, std::ios::end);
    size_t engineSize = engineFile.tellg();
    engineFile.seekg(0, std::ios::beg);
    std::vector<char> engineData(engineSize);
    engineFile.read(engineData.data(), engineSize);
    engineFile.close();

    // Create the runtime and deserialize the engine
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), engineSize);
    IExecutionContext* context = engine->createExecutionContext();

    // 2. Set up memory bindings
    const int inputIndex = engine->getBindingIndex("input");
    const int outputIndex = engine->getBindingIndex("output");

    // Allocate GPU memory
    void* deviceBuffers[2];
    cudaMalloc(&deviceBuffers[inputIndex], batchSize * inputSize * sizeof(float));
    cudaMalloc(&deviceBuffers[outputIndex], batchSize * outputSize * sizeof(float));

    // 3. Generate random input data
    std::vector<float> hostInput(batchSize * inputSize);
    std::default_random_engine generator;
    std::normal_distribution<float> distribution(0.0f, 1.0f); // normal distribution
    for (auto& v : hostInput) {
        v = distribution(generator);
    }

    // Copy the input data to the GPU
    cudaMemcpy(deviceBuffers[inputIndex], hostInput.data(),
               batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice);

    // 4. Run inference
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    auto start = std::chrono::high_resolution_clock::now();
    // Asynchronous execution
    context->enqueueV2(deviceBuffers, stream, nullptr);
    cudaStreamSynchronize(stream);
    auto end = std::chrono::high_resolution_clock::now();
    float latency = std::chrono::duration<float, std::milli>(end - start).count();

    // 5. Fetch the output
    std::vector<float> hostOutput(batchSize * outputSize);
    cudaMemcpy(hostOutput.data(), deviceBuffers[outputIndex],
               batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost);

    // Print results
    std::cout << "\nTotal inference time: " << latency << " ms" << std::endl;
    std::cout << "Sample output values: ";
    for (int i = 0; i < 10; ++i) {
        std::cout << hostOutput[i] << " ";
    }
    std::cout << std::endl;

    // 6. Clean up
    cudaStreamDestroy(stream);
    cudaFree(deviceBuffers[inputIndex]);
    cudaFree(deviceBuffers[outputIndex]);
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
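For reference, a steady-state measurement with warm-up iterations would look like the sketch below (the iteration counts of 10 and 100 are arbitrary, and the variables are the same ones as in the code above):

// Sketch: warm-up runs followed by an averaged timing loop (counts are arbitrary).
for (int i = 0; i < 10; ++i) {                 // warm-up, not timed
    context->enqueueV2(deviceBuffers, stream, nullptr);
}
cudaStreamSynchronize(stream);

const int iterations = 100;
auto t0 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i) {
    context->enqueueV2(deviceBuffers, stream, nullptr);
}
cudaStreamSynchronize(stream);
auto t1 = std::chrono::high_resolution_clock::now();
float avgLatency = std::chrono::duration<float, std::milli>(t1 - t0).count() / iterations;
std::cout << "Average latency over " << iterations << " runs: " << avgLatency << " ms" << std::endl;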