Please provide the following info (tick the boxes after creating this topic):
Software Version
DRIVE OS 7.0.3
other
Target Operating System
Linux
QNX
other
Hardware Platform
DRIVE AGX Thor Developer Kit (940-63960-0010-000)
DRIVE AGX Thor Developer Kit (940-63960-0012-000)
other
Host Machine Version
native Ubuntu Linux 24.04 Host installed with DRIVE OS Docker Containers
other Ubuntu Linux 22.04 Host installed with DRIVE OS Docker Containers
Issue Description
I would like to use the cuda Python module (import cuda) with Python 3 on the DRIVE Thor target. Could you please provide instructions on how to install it?
Error String
Logs
Hi,
I would like to use the following script, which relies on import cuda.
If there are any alternative solutions or recommended approaches on the DRIVE Thor target, could you please let me know?
import ctypes
from typing import Optional, List, Union

import numpy as np
import tensorrt as trt
from cuda import cuda, cudart


def check_cuda_err(err):
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res


class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""

    def __init__(self, size: int, dtype: Optional[np.dtype] = None):
        dtype = dtype or np.dtype(np.uint8)
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, data: Union[np.ndarray, bytes]):
        if isinstance(data, np.ndarray):
            if data.size > self.host.size:
                raise ValueError(
                    f"Tried to fit an array of size {data.size} into host memory of size {self.host.size}"
                )
            np.copyto(self.host[:data.size], data.flat, casting='safe')
        else:
            assert self.host.dtype == np.uint8
            self.host[:self.nbytes] = np.frombuffer(data, dtype=np.uint8)

    @property
    def device(self) -> int:
        return self._device

    @property
    def nbytes(self) -> int:
        return self._nbytes

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If the engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine, profile_idx: Optional[int] = None):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda_call(cudart.cudaStreamCreate())
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for binding in tensor_names:
        # get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape).
        # Pick out the max shape to allocate enough memory for the binding.
        shape = engine.get_tensor_shape(binding) if profile_idx is None else engine.get_tensor_profile_shape(binding, profile_idx)[-1]
        shape_valid = np.all([s >= 0 for s in shape])
        if not shape_valid and profile_idx is None:
            raise ValueError(f"Binding {binding} has dynamic shape, "
                             "but no profile was specified.")
        size = trt.volume(shape)
        trt_type = engine.get_tensor_dtype(binding)

        # Allocate host and device buffers.
        try:
            dtype = np.dtype(trt.nptype(trt_type))
            bindingMemory = HostDeviceMem(size, dtype)
        except TypeError:  # no numpy support: create a byte array instead (BF16, FP8, INT4)
            size = int(size * trt_type.itemsize)
            bindingMemory = HostDeviceMem(size)

        # Append the device buffer address to the device bindings.
        bindings.append(int(bindingMemory.device))

        # Append to the appropriate list.
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(bindingMemory)
        else:
            outputs.append(bindingMemory)
    return inputs, outputs, bindings, stream
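For reference, here is a minimal sketch (not part of the original script) of how the buffers returned by allocate_buffers are typically driven, following the pattern of the TensorRT Python samples; engine, context, and the populated input host buffers are assumed to exist already:

# Minimal sketch, following the TensorRT sample pattern: copy inputs to the
# device, run the context on the stream, copy outputs back, and synchronize.
def do_inference(context, engine, inputs, outputs, bindings, stream):
    # Tell the execution context which device address backs each I/O tensor.
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])

    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    for mem in inputs:
        cuda_call(cudart.cudaMemcpyAsync(mem.device, mem.host, mem.nbytes, kind, stream))

    context.execute_async_v3(stream_handle=stream)

    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    for mem in outputs:
        cuda_call(cudart.cudaMemcpyAsync(mem.host, mem.device, mem.nbytes, kind, stream))

    cuda_call(cudart.cudaStreamSynchronize(stream))
    return [out.host for out in outputs]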
Dear @wencheng-lin ,
Do you need to handle GPU memory buffers as in the TensorRT samples?
Dear @SivaRamaKrishnaNV
Yes. Currently I just need to handle host and device memory buffers without using the Python cuda module, in case it cannot be installed.
Thanks in advance.
Dear @SivaRamaKrishnaNV
Can I use cudart.cudaMalloc as in the script below to allocate buffers and transfer data from host to device?
cuda = ctypes.CDLL('/usr/lib/libcuda.so')
cudart = ctypes.CDLL('/usr/local/cuda/lib64/libcudart.so')

# cudaMemcpyKind enum values from driver_types.h
cudaMemcpyHostToDevice = 1
cudaMemcpyDeviceToHost = 2

for i in range(num_io_tensors):
    print("Allocating device buffer for:", io_tensor_names[i])
    print("bufferH in bytes:", bufferH[i].nbytes)
    # cudaMalloc returns an error code; the device pointer is written into the c_void_p argument.
    d_ptr = ctypes.c_void_p()
    cudart.cudaMalloc(ctypes.byref(d_ptr), ctypes.c_size_t(bufferH[i].nbytes))
    bufferD.append(d_ptr.value)
    print(bufferD[i])

print("Copying data to device")
for i in range(num_input_io_tensors):
    print("bufferD[", i, "] =", bufferD[i])
    cudart.cudaMemcpy(ctypes.c_void_p(bufferD[i]), ctypes.c_void_p(bufferH[i].ctypes.data),
                      ctypes.c_size_t(bufferH[i].nbytes), cudaMemcpyHostToDevice)

for i in range(num_io_tensors):
    context.set_tensor_address(io_tensor_names[i], int(bufferD[i]))

print("Starting inference")
context.execute_async_v3(0)

for i in range(num_input_io_tensors, num_io_tensors):
    cudart.cudaMemcpy(ctypes.c_void_p(bufferH[i].ctypes.data), ctypes.c_void_p(bufferD[i]),
                      ctypes.c_size_t(bufferH[i].nbytes), cudaMemcpyDeviceToHost)
cudart.cudaStreamSynchronize(ctypes.c_void_p(0))
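One thing the snippet above does not do is check return codes. A small helper like the following (hypothetical, not part of the original snippet) makes failures visible, since every libcudart entry point returns a cudaError_t and 0 means cudaSuccess:

# Hypothetical helper: raise if a cudart call made through ctypes fails.
def check(err: int):
    if err != 0:  # 0 corresponds to cudaSuccess
        raise RuntimeError(f"cudart call failed with cudaError_t = {err}")

# Example usage with a ctypes device-pointer handle:
d_ptr = ctypes.c_void_p()
check(cudart.cudaMalloc(ctypes.byref(d_ptr), ctypes.c_size_t(1024)))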
Yes, you can. Please see the allocate_buffers method used in the TensorRT Python samples.
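For completeness, a short sketch (an assumption based on the same samples, not quoted from them) of how to release what allocate_buffers created once inference is finished:

# Sketch: free the pinned host memory, the device buffers, and the stream.
def free_buffers(inputs, outputs, stream):
    for mem in inputs + outputs:
        mem.free()  # calls cudaFree and cudaFreeHost (see HostDeviceMem.free above)
    cuda_call(cudart.cudaStreamDestroy(stream))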
Dear @wencheng-lin ,
Do you have any further questions, or can we close the topic?
Dear @SivaRamaKrishnaNV
Since the issue has been resolved, we can close this topic.
Thank you for your support.