Hi,
I am trying to create a GStreamer plugin using the Python bindings that uses PyCUDA to accelerate processing, but the PyCUDA calls fail inside some of the virtual methods. This is weird because I have no problems calling GPU-accelerated PyTorch.
My setup:
- Jetson Nano with JetPack 4.6
- On top of a clean JetPack I installed:
sudo apt install \
python-gi-dev \
python-gst-1.0 \
gstreamer1.0-python3-plugin-loader \
gstreamer1.0-plugins-base \
libgstreamer1.0-dev \
python3-pip
pip3 install cython
pip3 install numpy pycuda
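As a sanity check, PyCUDA itself works fine outside of GStreamer. For example, a quick standalone test along these lines (illustrative) runs without errors on my setup:

# Standalone PyCUDA sanity check, no GStreamer involved
import pycuda.autoinit   # creates a CUDA context on the importing thread
import pycuda.driver as drv

print(drv.Device(0).name())                 # prints the Tegra GPU name
print(drv.Device(0).compute_capability())   # e.g. (5, 3) on the Nano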
Here is a minimal example that reproduces the issue:
import gi
gi.require_version('Gst', '1.0')
gi.require_version('GstBase', '1.0')
from gi.repository import Gst, GObject, GstBase
from pycuda.compiler import SourceModule
import numpy
import pycuda.autoinit
import pycuda.driver as drv

Gst.init(None)


class GstCUDAAdd(GstBase.BaseTransform):
    """
    Class that inherits from BaseTransform
    """
    __gstmetadata__ = (
        'CUDAAdd',
        'Generic',
        'Add using PyCUDA',
        'Miguel Taylor <miguel.taylor@ridgerun.com>')

    __gsttemplates__ = (
        Gst.PadTemplate.new(
            "src",
            Gst.PadDirection.SRC,
            Gst.PadPresence.ALWAYS,
            Gst.Caps.new_any()),
        Gst.PadTemplate.new(
            "sink",
            Gst.PadDirection.SINK,
            Gst.PadPresence.ALWAYS,
            Gst.Caps.new_any()))

    __gproperties__ = {}

    def __init__(self):
        """
        Initialize a new GstCUDAAdd
        """
        self.add()
        GstBase.BaseTransform.__init__(self)
        GstBase.BaseTransform.set_in_place(self, False)

    def do_transform(self, inbuf, outbuf):
        """
        Implementation of GstBaseTransform class 'transform'

        Parameters
        ----------
        inbuf: Gst.Buffer
            Input buffer (read only)
        outbuf: Gst.Buffer
            Output buffer (allocated)

        Returns
        -------
        Gst.FlowReturn
            The result of passing data to a pad.
        """
        self.add()
        return Gst.FlowReturn.OK

    def add(self):
        # Compile the CUDA kernel and get a handle to it
        mod = SourceModule("""
        __global__ void add_them(float *dest, float *a, float *b)
        {
            const int i = threadIdx.x;
            dest[i] = a[i] + b[i];
        }
        """)
        add_them = mod.get_function("add_them")
        # Make 2 sets of 400 random floats
        a = numpy.random.randn(400).astype(numpy.float32)
        b = numpy.random.randn(400).astype(numpy.float32)
        # Create a set of 0s
        dest = numpy.zeros_like(a)
        # Replace the 0s with the results of a + b
        add_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1), grid=(1, 1))
        # Should print a block of 0s -> (a+b) - (a+b)
        print(dest - (a + b))


GObject.type_register(GstCUDAAdd)
__gstelementfactory__ = ("cudaadd", Gst.Rank.NONE, GstCUDAAdd)
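For completeness, any pipeline that pushes buffers through the element triggers do_transform and hits the error. An illustrative test script, assuming the plugin file is saved in a python/ directory that is on GST_PLUGIN_PATH so the gst-python loader registers the cudaadd element:

# Illustrative: run a short pipeline through the element
import gi
gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)
pipeline = Gst.parse_launch("videotestsrc num-buffers=10 ! cudaadd ! fakesink")
pipeline.set_state(Gst.State.PLAYING)
bus = pipeline.get_bus()
# Wait for EOS or an error; the LogicError traceback appears once
# do_transform runs for the first buffer
bus.timed_pop_filtered(Gst.CLOCK_TIME_NONE,
                       Gst.MessageType.EOS | Gst.MessageType.ERROR)
pipeline.set_state(Gst.State.NULL)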
If I call self.add() in the class __init__ it works correctly, but inside other methods like do_transform or do_prepare_output_buffer it fails with one of the following errors:
File "/home/nvidia/.local/lib/python3.6/site-packages/pycuda/compiler.py", line 363, in __init__
self.module = module_from_buffer(cubin)
pycuda._driver.LogicError
File "/home/nvidia/.local/lib/python3.6/site-packages/pycuda/compiler.py", line 363, in __init__
self.module = module_from_buffer(cubin)
pycuda._driver.LogicError: cuModuleLoadDataEx failed: invalid device context -
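For comparison, a roughly equivalent element-wise add with GPU-accelerated PyTorch gives me no such errors. An illustrative version of add() using PyTorch instead of PyCUDA:

import torch

def add_torch():
    # Same idea as add(), but with PyTorch tensors on the GPU
    a = torch.randn(400, device="cuda")
    b = torch.randn(400, device="cuda")
    dest = a + b
    # Should print a block of 0s, just like the PyCUDA version
    print((dest - (a + b)).cpu().numpy())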
Do you know what could be going on or how to fix this issue?