TensorRT 8.6.1 on Drive OS Docker - CudaMemcpyAsync Invalid Argument

Please provide the following info (tick the boxes after creating this topic):
Software Version
DRIVE OS 6.0.8.1
DRIVE OS 6.0.6
DRIVE OS 6.0.5
DRIVE OS 6.0.4 (rev. 1)
DRIVE OS 6.0.4 SDK
other

Target Operating System
Linux
QNX
other

Hardware Platform
DRIVE AGX Orin Developer Kit (940-63710-0010-300)
DRIVE AGX Orin Developer Kit (940-63710-0010-200)
DRIVE AGX Orin Developer Kit (940-63710-0010-100)
DRIVE AGX Orin Developer Kit (940-63710-0010-D00)
DRIVE AGX Orin Developer Kit (940-63710-0010-C00)
DRIVE AGX Orin Developer Kit (not sure its number)
other

SDK Manager Version
1.9.3.10904
other

Host Machine Version
native Ubuntu Linux 20.04 Host installed with SDK Manager
native Ubuntu Linux 20.04 Host installed with DRIVE OS Docker Containers
native Ubuntu Linux 18.04 Host installed with DRIVE OS Docker Containers
other

Hello, I am trying to run inference using TensorRT 8.6.1 on the Drive OS Docker Containers for the Drive AGX Orin available on NGC. The following snippets of code include the variable declarations, buffer creation for the model i/o and inference using enqueueV3.

Initializations

// Engine related variables
nvinfer1::IRuntime *runtime_;
nvinfer1::ICudaEngine *engine_;
nvinfer1::IExecutionContext *contexts_;
std::mutex engineMutex;
cudaStream_t dataStream, inferStream;

// CPU Buffers
float *input_cpu;
std::vector<float *> outputs_cpu;

// GPU Buffers
std::vector<void *> gpu_buffers;

// Other Inits
int numIO, numInputs = 0, numOutputs = 0; 
std::vector<int> tensor_sizes_;
std::vector<char const*> tensor_names_;

Memory Buffer Creation

checkCudaErrorCode(cudaStreamCreate(&dataStream));

    numIO = engine_->getNbIOTensors();
    gpu_buffers.resize(numIO);
    float summbsize = 0.0;
    
    std::cout << "\nNumber of inputs and outputs = " << numIO << std::endl;
    for (int i = 0; i < numIO; ++i) {
        
        const auto tensorName = engine_->getIOTensorName(i);      
        const auto tensorType = engine_->getTensorIOMode(tensorName);
        const auto tensorShape = engine_->getTensorShape(tensorName);
        const auto tensorDatatype = engine_->getTensorDataType(tensorName);
        const auto  tensorSize = std::accumulate(tensorShape.d, tensorShape.d + tensorShape.nbDims, 1.0, std::multiplies<double>());
        tensor_names_.emplace_back(tensorName);
        tensor_sizes_.emplace_back(tensorSize);
        
        float mbsize = (float)(tensorSize*sizeof(float))/(1024*1024); 
        summbsize += mbsize;

        // Summary
        std::cout << "*****************************************"<< std::endl;
        std::cout << "[INFO] Tensor[" << i << "] ->  (" << dataModeToString(tensorType) << ")" << std::endl;
        std::cout << "[INFO]\t   dims\t\t = (" << tensorShape.d[0] << ", " << tensorShape.d[1] << ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << ")"  << std::endl;
        std::cout << "[INFO]\t   data_type\t = " << dataTypeToString(tensorDatatype)<< std::endl;
        std::cout << "[INFO]\t   size\t\t = " << tensorSize << " elements"  << std::endl;
        std::cout << "[INFO]\t   size\t\t = " << mbsize << " Mb"  << std::endl;
        std::cout << "[INFO]\t   total_size\t = " << summbsize << " Mb"  << std::endl;

        if (tensorType == nvinfer1::TensorIOMode::kINPUT) {          
            // Allocate memory for the input
            checkCudaErrorCode(cudaMallocAsync(&gpu_buffers[i], tensorSize * sizeof(float), dataStream));
            numInputs += 1;
        } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {          
            // Allocate memory for the output
            checkCudaErrorCode(cudaMallocAsync(&gpu_buffers[i],  tensorSize * sizeof(float), dataStream));
            numOutputs += 1;
        } else {
            // Throw error for tensor type
            throw std::runtime_error("Error, IO Tensor is neither an input or output!");
        }
    }

    checkCudaErrorCode(cudaStreamSynchronize(dataStream));

    // Allocate buffer for input on CPU
    input_cpu = new float[tensor_sizes_[0]]; // HARDCODED

    // Allocate buffer for outputs on CPU
    outputs_cpu.resize(numOutputs);
    std::cout << "*****************************************"<< std::endl;

Inference

debugit("hi1");
std::unique_lock<std::mutex> lock(engineMutex);
checkCudaErrorCode(cudaStreamCreate(&inferStream));

debugit("hi2");
// Copy input to GPU memory
for (int i = 0; i < numInputs; ++i) {
    checkCudaErrorCode(cudaMemcpyAsync(gpu_buffers[i], input_cpu, tensor_sizes_[i] * sizeof(float), cudaMemcpyHostToDevice, inferStream));
}

debugit("hi3");
// Set the address of the input and output buffers
for (int i = 0; i < numIO; ++i) {
    bool status = contexts_->setTensorAddress(tensor_names_[i], gpu_buffers[i]);
}

debugit("hi4");
// Run inference.
bool status = contexts_->enqueueV3(inferStream);

debugit("hi5");
// Copy output from GPU memory
for (int i = 0; i < numOutputs; ++i) {
    checkCudaErrorCode(cudaMemcpyAsync(outputs_cpu[i], gpu_buffers[i+1], tensor_sizes_[i+1] * sizeof(float), cudaMemcpyDeviceToHost, inferStream));
}

debugit("hi6");
checkCudaErrorCode(cudaStreamSynchronize(inferStream));

Output

Number of inputs and outputs = 21
*****************************************
[INFO] Tensor[0] ->  (kINPUT)
[INFO]	   dims		 = (1, 3, 640, 2304)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 4.42368e+06 elements
[INFO]	   size		 = 16.875 Mb
[INFO]	   total_size	 = 16.875 Mb
*****************************************
[INFO] Tensor[1] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 8, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 2.94912e+06 elements
[INFO]	   size		 = 11.25 Mb
[INFO]	   total_size	 = 28.125 Mb
*****************************************
[INFO] Tensor[2] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 184320 elements
[INFO]	   size		 = 0.703125 Mb
[INFO]	   total_size	 = 28.8281 Mb
*****************************************
[INFO] Tensor[3] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 737280 elements
[INFO]	   size		 = 2.8125 Mb
[INFO]	   total_size	 = 31.6406 Mb
*****************************************
[INFO] Tensor[4] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 184320 elements
[INFO]	   size		 = 0.703125 Mb
[INFO]	   total_size	 = 32.3438 Mb
*****************************************
[INFO] Tensor[5] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 737280 elements
[INFO]	   size		 = 2.8125 Mb
[INFO]	   total_size	 = 35.1562 Mb
*****************************************
[INFO] Tensor[6] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 737280 elements
[INFO]	   size		 = 2.8125 Mb
[INFO]	   total_size	 = 37.9688 Mb
*****************************************
[INFO] Tensor[7] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 1, 320, 1152)
[INFO]	   data_type	 = kINT32 (int32)
[INFO]	   size		 = 368640 elements
[INFO]	   size		 = 1.40625 Mb
[INFO]	   total_size	 = 39.375 Mb
*****************************************
[INFO] Tensor[8] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 5, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 1.8432e+06 elements
[INFO]	   size		 = 7.03125 Mb
[INFO]	   total_size	 = 46.4062 Mb
*****************************************
[INFO] Tensor[9] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 1, 320, 1152)
[INFO]	   data_type	 = kINT32 (int32)
[INFO]	   size		 = 368640 elements
[INFO]	   size		 = 1.40625 Mb
[INFO]	   total_size	 = 47.8125 Mb
*****************************************
[INFO] Tensor[10] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 7, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 2.58048e+06 elements
[INFO]	   size		 = 9.84375 Mb
[INFO]	   total_size	 = 57.6562 Mb
*****************************************
[INFO] Tensor[11] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 184320 elements
[INFO]	   size		 = 0.703125 Mb
[INFO]	   total_size	 = 58.3594 Mb
*****************************************
[INFO] Tensor[12] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 1, 320, 1152)
[INFO]	   data_type	 = kINT32 (int32)
[INFO]	   size		 = 368640 elements
[INFO]	   size		 = 1.40625 Mb
[INFO]	   total_size	 = 59.7656 Mb
*****************************************
[INFO] Tensor[13] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 4, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 1.47456e+06 elements
[INFO]	   size		 = 5.625 Mb
[INFO]	   total_size	 = 65.3906 Mb
*****************************************
[INFO] Tensor[14] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 737280 elements
[INFO]	   size		 = 2.8125 Mb
[INFO]	   total_size	 = 68.2031 Mb
*****************************************
[INFO] Tensor[15] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 19, 320, 1152)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 7.00416e+06 elements
[INFO]	   size		 = 26.7188 Mb
[INFO]	   total_size	 = 94.9219 Mb
*****************************************
[INFO] Tensor[16] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 8, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 737280 elements
[INFO]	   size		 = 2.8125 Mb
[INFO]	   total_size	 = 97.7344 Mb
*****************************************
[INFO] Tensor[17] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 1, 160, 576)
[INFO]	   data_type	 = kINT32 (int32)
[INFO]	   size		 = 92160 elements
[INFO]	   size		 = 0.351562 Mb
[INFO]	   total_size	 = 98.0859 Mb
*****************************************
[INFO] Tensor[18] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 3, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 276480 elements
[INFO]	   size		 = 1.05469 Mb
[INFO]	   total_size	 = 99.1406 Mb
*****************************************
[INFO] Tensor[19] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 184320 elements
[INFO]	   size		 = 0.703125 Mb
[INFO]	   total_size	 = 99.8438 Mb
*****************************************
[INFO] Tensor[20] ->  (kOUTPUT)
[INFO]	   dims		 = (1, 2, 160, 576)
[INFO]	   data_type	 = kFLOAT (float32)
[INFO]	   size		 = 184320 elements
[INFO]	   size		 = 0.703125 Mb
[INFO]	   total_size	 = 100.547 Mb
*****************************************
[DEBUG]	 NOTICE hi1
[DEBUG]	 NOTICE hi2
[DEBUG]	 NOTICE hi3
[DEBUG]	 NOTICE hi4
[DEBUG]	 NOTICE hi5
CUDA operation failed with code: 1(cudaErrorInvalidValue), with message: invalid argument
terminate called after throwing an instance of 'std::runtime_error'
  what():  CUDA operation failed with code: 1(cudaErrorInvalidValue), with message: invalid argument
Aborted (core dumped)

As you can see, there is one input tensor and twenty output tensors of varying sizes. Other functions referred to in the code are -

extern int count = 1;
void debugit(auto a){
  std::cout << "[DEBUG]\t NOTICE " << a << std::endl;
  count++;
}

void checkCudaErrorCode(cudaError_t code) {
  if (code != 0) {
      std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + "(" + cudaGetErrorName(code) + "), with message: " + cudaGetErrorString(code);
      std::cout << errMsg << std::endl;
      throw std::runtime_error(errMsg);
  }
}

From the debug functions, it is observed that the error occurs in the last cudaMemcpyAsync where I try to copy the outputs from the device to the CPU memory. The error code 1 - suggests an argument is invalid, but I am unable to figure out which one. Please help, thanks!

Dear @gopalan_iyengar,
Could you double check the used buffer pointers are valid?

Also check usingcudaGetLastError(CUDA Runtime API :: CUDA Toolkit Documentation) before cudaMemcpyAsync for loop to make sure error is not propagated from above .

Hello Siva,
Thanks for your response.
I solved this error - I only allocated memory for the output buffer array. However, I allocated memory to every element of the output pointer array and the code ran successfully.
This was the fix -

    // Allocate buffer for outputs on CPU
    outputs_cpu.resize(numOutputs);
    for (int i = 0; i < numOutputs; i++) {
      outputs_cpu[i] = new float[tensor_sizes_[i+1]]; 
    }

Thanks a lot for your help!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.