I have a python program in which I am using PyCUDA to pre-process some data using the GPU before the results are then fed into a CNN implemented using Tensorflow.
The pre-processing function works fine on its own. But when I add the line to load the CNN model…
self.model = load_model(model_path)
… the GPU process fails to execute with the following message:
Traceback (most recent call last):
  File "/home/mantis/Desktop/D3/D3T3/d3t3.py", line 656, in update
    g_cc_data = sp2.gcc_phat_fbanks_gpu3(new_cpsd, self.fbwn_gpu, self.steer_gpu, self.ccfb_gpu, self.zoom, self.eps)
  File "/home/mantis/Desktop/D3/D3T3/sigpack_v2.py", line 323, in gcc_phat_fbanks_gpu3
    block=(BLOCK_X, BLOCK_Y, 1), grid=(GRID_X, GRID_Y, 1)
  File "/usr/local/lib/python3.6/dist-packages/pycuda-2019.1.2-py3.6-linux-aarch64.egg/pycuda/driver.py", line 436, in function_call
    func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
For reference, the offending GPU function is as follows:
# Compiled CUDA module holding the `cc_gpu` kernel.
#
# One thread is launched per (delay, bank) cell of the launch grid. Threads
# inside the valid WINDOW x BANKS region accumulate the real part of
# fbwn[bank, bin] * cpsd_phat[bin] * steer[bin, delay] over all bins; every
# thread (including those outside the valid region) then stores its result,
# so `cc` must hold at least (blockDim.y * gridDim.y) rows of
# (blockDim.x * gridDim.x) floats.
mod = SourceModule("""
#include <pycuda-complex.hpp>
#include <stdio.h>

typedef pycuda::complex<float> cmplx;

__global__ void cc_gpu(float *fbwn, cmplx *cpsd_phat, cmplx *steer, float *cc,
                       int BINS, int WINDOW, int BANKS)
{
    // fbwn      : input, array shape (40, 4096), of type float32
    // cpsd_phat : input, vector shape (4096,), of type complex64
    // steer     : input, array shape (4096, 121), of type complex64
    // cc        : output, array shape (40, 121), of type float32
    //             (stored with a row stride of blockDim.x * gridDim.x)

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int WIDTH = blockDim.x * gridDim.x;
    int HEIGHT = blockDim.y * gridDim.y;
    int XM = (x + WIDTH) % WIDTH;
    int YM = (y + HEIGHT) % HEIGHT;

    // CUDA uses pseudo-multidimensional array indexing (i.e. it's a 1D flat array)
    int INDEX = XM + YM * WIDTH;

    int bank = y;
    int delay = x;
    float Pvalue = 0.f;

    if (x < WINDOW && y < BANKS) {
        for (int bin = 0; bin < BINS; bin++) {
            // Bins with a non-positive filter weight contribute nothing.
            if (fbwn[bank * BINS + bin] > 0) {
                // Python equivalent code for reference:
                // Pvalue += fbwn[y,f] * (cpsd_phat[f] * steer[f,x]).real();
                Pvalue += (fbwn[bank * BINS + bin]
                           * (cpsd_phat[bin] * steer[bin * WINDOW + delay]).real());
            }
        }
    }

    // Written unconditionally: out-of-range threads store 0 into the padding.
    cc[INDEX] = Pvalue;
}
""")
def gcc_phat_fbanks_gpu3(cpsd, fbwn_gpu, steer_gpu, ccfb_gpu, zoom, eps):
    """Compute GCC-PHAT on filter banks with the ``cc_gpu`` CUDA kernel.

    Args:
        cpsd: 1D array of length 'nbins' of Cross Power Spectral Density,
            i.e. a measure of both the power shared by a given frequency
            for two signals, and the phase shift between the two signals at
            that frequency. Typical shape is (4096,) of complex numbers if
            the FFT has 4096 bins. It is PHAT-normalised and converted to
            complex64 here before being uploaded to the GPU.
        fbwn_gpu: Device array of normalised filter bank weights, indexed by
            (filter bank, frequency bin). The sum of values for each filter
            bank = 1.0, although their frequency distribution can vary
            between banks. Typical shape is (40, 4096); the kernel reads it
            as float32.
        steer_gpu: Device array of steering factors, indexed by
            (frequency bin, sample delay). Typical shape is (4096, 121) for
            +/- 60 sample delays and 4096 frequency bins; the kernel reads
            it as complex64.
        ccfb_gpu: Device output array, overwritten by the kernel. The kernel
            writes one float per launched thread, so this presumably has to
            be a 2D float32 array padded to the launch grid, i.e.
            (GRID_Y * 32, GRID_X * 32) -- TODO confirm against the caller.
        zoom: Constant used to dimension the sample delay axis. E.g.
            zoom = 60 gives a range of +/- 60 sample delays.
        eps: Small constant added to the CPSD_PHAT denominator for numerical
            stability, as well as to suppress low energy bins. No default
            is supplied by this function.

    Returns:
        2D array of GCC-PHAT on filter banks, indexed by
        (filter bank, sample delay). Typical shape = (40, 121), normalised
        to between -1.0 and 1.0. The dtype is that of ``ccfb_gpu``.
    """
    window = np.int32((2 * zoom) + 1)
    # The row count is read from the shape rather than by slicing a column.
    banks = np.int32(fbwn_gpu.shape[0])
    bins = np.int32(len(cpsd))

    # PHAT weighting: keep only the phase of each bin. Array shape (4096,)
    cpsd_phat = (cpsd / (np.abs(cpsd) + eps)).astype(np.complex64)
    cpsd_phat_gpu = gpuarray.to_gpu(cpsd_phat)

    block_x = 32  # Number of threads in block X direction (X * Y must be < 1025)
    block_y = 32  # Number of threads in block Y direction (X * Y must be < 1025)
    grid_x = math.ceil(window / block_x)  # Required number of blocks in X direction of grid
    grid_y = math.ceil(banks / block_y)  # Required number of blocks in Y direction of grid

    # NOTE(review): this launch needs the CUDA context in which `mod` was
    # compiled to be current on the calling thread. A LogicError such as
    # "cuFuncSetBlockShape failed: invalid resource handle" here presumably
    # means another library or thread has made a different context current;
    # if so, push the PyCUDA context before this call and pop it afterwards
    # -- TODO confirm where the context is created.
    kernel = mod.get_function("cc_gpu")
    kernel(
        fbwn_gpu, cpsd_phat_gpu, steer_gpu, ccfb_gpu,
        bins, window, banks,
        block=(block_x, block_y, 1),
        grid=(grid_x, grid_y, 1),
    )

    # Drop the padding rows/columns written by out-of-range threads.
    return ccfb_gpu.get()[:banks, :window]
Could anybody advise how to fix this error?