Please provide the following info (tick the boxes after creating this topic):
Software Version
DRIVE OS 6.0.8.1
DRIVE OS 6.0.6
DRIVE OS 6.0.5
DRIVE OS 6.0.4 (rev. 1)
DRIVE OS 6.0.4 SDK
other
Target Operating System
Linux
QNX
other
Hardware Platform
DRIVE AGX Orin Developer Kit (940-63710-0010-300)
DRIVE AGX Orin Developer Kit (940-63710-0010-200)
DRIVE AGX Orin Developer Kit (940-63710-0010-100)
DRIVE AGX Orin Developer Kit (940-63710-0010-D00)
DRIVE AGX Orin Developer Kit (940-63710-0010-C00)
DRIVE AGX Orin Developer Kit (not sure of its number)
other
SDK Manager Version
1.9.3.10904
other
Host Machine Version
native Ubuntu Linux 20.04 Host installed with SDK Manager
native Ubuntu Linux 20.04 Host installed with DRIVE OS Docker Containers
native Ubuntu Linux 18.04 Host installed with DRIVE OS Docker Containers
other
Hello, I am trying to run inference using TensorRT 8.6.1 in the DRIVE OS Docker container for the DRIVE AGX Orin available on NGC. The following code snippets include the variable declarations, buffer creation for the model I/O, and inference using enqueueV3.
Initializations
#include <NvInfer.h>
#include <cuda_runtime.h>
#include <iostream>
#include <mutex>
#include <numeric>
#include <vector>
// Engine related variables
nvinfer1::IRuntime *runtime_;
nvinfer1::ICudaEngine *engine_;
nvinfer1::IExecutionContext *contexts_;
std::mutex engineMutex;
cudaStream_t dataStream, inferStream;
// CPU Buffers
float *input_cpu;
std::vector<float *> outputs_cpu;
// GPU Buffers
std::vector<void *> gpu_buffers;
// Other Inits
int numIO, numInputs = 0, numOutputs = 0;
std::vector<int> tensor_sizes_;
std::vector<char const*> tensor_names_;
Memory Buffer Creation
checkCudaErrorCode(cudaStreamCreate(&dataStream));
numIO = engine_->getNbIOTensors();
gpu_buffers.resize(numIO);
float summbsize = 0.0;
std::cout << "\nNumber of inputs and outputs = " << numIO << std::endl;
for (int i = 0; i < numIO; ++i) {
    const auto tensorName = engine_->getIOTensorName(i);
    const auto tensorType = engine_->getTensorIOMode(tensorName);
    const auto tensorShape = engine_->getTensorShape(tensorName);
    const auto tensorDatatype = engine_->getTensorDataType(tensorName);
    // Element count is accumulated as a double, hence the scientific notation in the log below
    const auto tensorSize = std::accumulate(tensorShape.d, tensorShape.d + tensorShape.nbDims, 1.0, std::multiplies<double>());
    tensor_names_.emplace_back(tensorName);
    tensor_sizes_.emplace_back(tensorSize);
    float mbsize = (float)(tensorSize * sizeof(float)) / (1024 * 1024);
    summbsize += mbsize;
    // Summary (assumes 4-D tensors)
    std::cout << "*****************************************" << std::endl;
    std::cout << "[INFO] Tensor[" << i << "] -> (" << dataModeToString(tensorType) << ")" << std::endl;
    std::cout << "[INFO]\t dims\t\t = (" << tensorShape.d[0] << ", " << tensorShape.d[1] << ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << ")" << std::endl;
    std::cout << "[INFO]\t data_type\t = " << dataTypeToString(tensorDatatype) << std::endl;
    std::cout << "[INFO]\t size\t\t = " << tensorSize << " elements" << std::endl;
    std::cout << "[INFO]\t size\t\t = " << mbsize << " Mb" << std::endl;
    std::cout << "[INFO]\t total_size\t = " << summbsize << " Mb" << std::endl;
    if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
        // Allocate GPU memory for the input
        checkCudaErrorCode(cudaMallocAsync(&gpu_buffers[i], tensorSize * sizeof(float), dataStream));
        numInputs += 1;
    } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
        // Allocate GPU memory for the output
        checkCudaErrorCode(cudaMallocAsync(&gpu_buffers[i], tensorSize * sizeof(float), dataStream));
        numOutputs += 1;
    } else {
        // Throw an error for an unknown tensor type
        throw std::runtime_error("Error, IO tensor is neither an input nor an output!");
    }
}
checkCudaErrorCode(cudaStreamSynchronize(dataStream));
// Allocate buffer for input on CPU
input_cpu = new float[tensor_sizes_[0]]; // HARDCODED
// Allocate buffer for outputs on CPU
outputs_cpu.resize(numOutputs);
std::cout << "*****************************************"<< std::endl;
Inference
debugit("hi1");
std::unique_lock<std::mutex> lock(engineMutex);
checkCudaErrorCode(cudaStreamCreate(&inferStream));
debugit("hi2");
// Copy the input to GPU memory
for (int i = 0; i < numInputs; ++i) {
    checkCudaErrorCode(cudaMemcpyAsync(gpu_buffers[i], input_cpu, tensor_sizes_[i] * sizeof(float), cudaMemcpyHostToDevice, inferStream));
}
debugit("hi3");
// Set the addresses of the input and output buffers
for (int i = 0; i < numIO; ++i) {
    bool status = contexts_->setTensorAddress(tensor_names_[i], gpu_buffers[i]);
}
debugit("hi4");
// Run inference
bool status = contexts_->enqueueV3(inferStream);
debugit("hi5");
// Copy the outputs from GPU memory (gpu_buffers[0] holds the input, so outputs start at index 1)
for (int i = 0; i < numOutputs; ++i) {
    checkCudaErrorCode(cudaMemcpyAsync(outputs_cpu[i], gpu_buffers[i + 1], tensor_sizes_[i + 1] * sizeof(float), cudaMemcpyDeviceToHost, inferStream));
}
debugit("hi6");
checkCudaErrorCode(cudaStreamSynchronize(inferStream));
Output
Number of inputs and outputs = 21
*****************************************
[INFO] Tensor[0] -> (kINPUT)
[INFO] dims = (1, 3, 640, 2304)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 4.42368e+06 elements
[INFO] size = 16.875 Mb
[INFO] total_size = 16.875 Mb
*****************************************
[INFO] Tensor[1] -> (kOUTPUT)
[INFO] dims = (1, 8, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 2.94912e+06 elements
[INFO] size = 11.25 Mb
[INFO] total_size = 28.125 Mb
*****************************************
[INFO] Tensor[2] -> (kOUTPUT)
[INFO] dims = (1, 2, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 184320 elements
[INFO] size = 0.703125 Mb
[INFO] total_size = 28.8281 Mb
*****************************************
[INFO] Tensor[3] -> (kOUTPUT)
[INFO] dims = (1, 2, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 737280 elements
[INFO] size = 2.8125 Mb
[INFO] total_size = 31.6406 Mb
*****************************************
[INFO] Tensor[4] -> (kOUTPUT)
[INFO] dims = (1, 2, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 184320 elements
[INFO] size = 0.703125 Mb
[INFO] total_size = 32.3438 Mb
*****************************************
[INFO] Tensor[5] -> (kOUTPUT)
[INFO] dims = (1, 2, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 737280 elements
[INFO] size = 2.8125 Mb
[INFO] total_size = 35.1562 Mb
*****************************************
[INFO] Tensor[6] -> (kOUTPUT)
[INFO] dims = (1, 2, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 737280 elements
[INFO] size = 2.8125 Mb
[INFO] total_size = 37.9688 Mb
*****************************************
[INFO] Tensor[7] -> (kOUTPUT)
[INFO] dims = (1, 1, 320, 1152)
[INFO] data_type = kINT32 (int32)
[INFO] size = 368640 elements
[INFO] size = 1.40625 Mb
[INFO] total_size = 39.375 Mb
*****************************************
[INFO] Tensor[8] -> (kOUTPUT)
[INFO] dims = (1, 5, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 1.8432e+06 elements
[INFO] size = 7.03125 Mb
[INFO] total_size = 46.4062 Mb
*****************************************
[INFO] Tensor[9] -> (kOUTPUT)
[INFO] dims = (1, 1, 320, 1152)
[INFO] data_type = kINT32 (int32)
[INFO] size = 368640 elements
[INFO] size = 1.40625 Mb
[INFO] total_size = 47.8125 Mb
*****************************************
[INFO] Tensor[10] -> (kOUTPUT)
[INFO] dims = (1, 7, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 2.58048e+06 elements
[INFO] size = 9.84375 Mb
[INFO] total_size = 57.6562 Mb
*****************************************
[INFO] Tensor[11] -> (kOUTPUT)
[INFO] dims = (1, 2, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 184320 elements
[INFO] size = 0.703125 Mb
[INFO] total_size = 58.3594 Mb
*****************************************
[INFO] Tensor[12] -> (kOUTPUT)
[INFO] dims = (1, 1, 320, 1152)
[INFO] data_type = kINT32 (int32)
[INFO] size = 368640 elements
[INFO] size = 1.40625 Mb
[INFO] total_size = 59.7656 Mb
*****************************************
[INFO] Tensor[13] -> (kOUTPUT)
[INFO] dims = (1, 4, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 1.47456e+06 elements
[INFO] size = 5.625 Mb
[INFO] total_size = 65.3906 Mb
*****************************************
[INFO] Tensor[14] -> (kOUTPUT)
[INFO] dims = (1, 2, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 737280 elements
[INFO] size = 2.8125 Mb
[INFO] total_size = 68.2031 Mb
*****************************************
[INFO] Tensor[15] -> (kOUTPUT)
[INFO] dims = (1, 19, 320, 1152)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 7.00416e+06 elements
[INFO] size = 26.7188 Mb
[INFO] total_size = 94.9219 Mb
*****************************************
[INFO] Tensor[16] -> (kOUTPUT)
[INFO] dims = (1, 8, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 737280 elements
[INFO] size = 2.8125 Mb
[INFO] total_size = 97.7344 Mb
*****************************************
[INFO] Tensor[17] -> (kOUTPUT)
[INFO] dims = (1, 1, 160, 576)
[INFO] data_type = kINT32 (int32)
[INFO] size = 92160 elements
[INFO] size = 0.351562 Mb
[INFO] total_size = 98.0859 Mb
*****************************************
[INFO] Tensor[18] -> (kOUTPUT)
[INFO] dims = (1, 3, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 276480 elements
[INFO] size = 1.05469 Mb
[INFO] total_size = 99.1406 Mb
*****************************************
[INFO] Tensor[19] -> (kOUTPUT)
[INFO] dims = (1, 2, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 184320 elements
[INFO] size = 0.703125 Mb
[INFO] total_size = 99.8438 Mb
*****************************************
[INFO] Tensor[20] -> (kOUTPUT)
[INFO] dims = (1, 2, 160, 576)
[INFO] data_type = kFLOAT (float32)
[INFO] size = 184320 elements
[INFO] size = 0.703125 Mb
[INFO] total_size = 100.547 Mb
*****************************************
[DEBUG] NOTICE hi1
[DEBUG] NOTICE hi2
[DEBUG] NOTICE hi3
[DEBUG] NOTICE hi4
[DEBUG] NOTICE hi5
CUDA operation failed with code: 1(cudaErrorInvalidValue), with message: invalid argument
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDA operation failed with code: 1(cudaErrorInvalidValue), with message: invalid argument
Aborted (core dumped)
As you can see, there is one input tensor and twenty output tensors of varying sizes. The other functions referenced in the code are:
extern int count = 1;
void debugit(auto a) {
    std::cout << "[DEBUG]\t NOTICE " << a << std::endl;
    count++;
}
void checkCudaErrorCode(cudaError_t code) {
    if (code != cudaSuccess) {
        std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + "(" + cudaGetErrorName(code) + "), with message: " + cudaGetErrorString(code);
        std::cout << errMsg << std::endl;
        throw std::runtime_error(errMsg);
    }
}
From the debug prints, the error occurs in the last cudaMemcpyAsync, where I try to copy the outputs from the device back to CPU memory. Error code 1 (cudaErrorInvalidValue) suggests that one of the arguments is invalid, but I am unable to figure out which one. Please help, thanks!
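To narrow down which argument is invalid, I am thinking of dumping the destination pointer, source pointer and byte count of each copy just before it is issued. A minimal sketch is below; the validateCopyArgs helper is my own (not part of TensorRT or CUDA), and the variable names are assumed to match the snippets above.
#include <cstdio>
#include <vector>
// Hypothetical helper: print the arguments of each device-to-host copy.
// A null dst/src pointer or a nonsensical byte count would make
// cudaMemcpyAsync return cudaErrorInvalidValue.
void validateCopyArgs(const std::vector<float *> &outputs_cpu,
                      const std::vector<void *> &gpu_buffers,
                      const std::vector<int> &tensor_sizes_,
                      int numOutputs) {
    for (int i = 0; i < numOutputs; ++i) {
        std::printf("[CHECK] out[%d]: dst=%p src=%p bytes=%zu\n", i,
                    static_cast<void *>(outputs_cpu[i]), gpu_buffers[i + 1],
                    tensor_sizes_[i + 1] * sizeof(float));
    }
}
My intent is to call this right before the failing loop and see whether any pointer or size looks wrong before the copy is issued.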