PyCUDA 'invalid resource handle' when Tensorflow model is loaded

I have a Python program in which I use PyCUDA to pre-process some data on the GPU; the results are then fed into a CNN implemented in TensorFlow.

The pre-processing function works fine on its own. But when I add the line to load the CNN model…

self.model = load_model(model_path)

… the GPU process fails to execute with the following message:

Traceback (most recent call last):
File “/home/mantis/Desktop/D3/D3T3/d3t3.py”, line 656, in update
g_cc_data = sp2.gcc_phat_fbanks_gpu3(new_cpsd, self.fbwn_gpu, self.steer_gpu, self.ccfb_gpu, self.zoom, self.eps)
File “/home/mantis/Desktop/D3/D3T3/sigpack_v2.py”, line 323, in gcc_phat_fbanks_gpu3
block=(BLOCK_X, BLOCK_Y, 1), grid=(GRID_X, GRID_Y, 1)
File “/usr/local/lib/python3.6/dist-packages/pycuda-2019.1.2-py3.6-linux-aarch64.egg/pycuda/driver.py”, line 436, in function_call
func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle

For reference, the offending GPU function is as follows:

# Compiled CUDA module holding the `cc_gpu` kernel used by gcc_phat_fbanks_gpu3.
#
# Kernel contract (as written in the source string below):
#   * Thread (x, y) handles sample delay `x` and filter bank `y`.
#   * For threads with x < WINDOW and y < BANKS it accumulates, over every
#     frequency bin whose weight fbwn[bank, bin] is > 0,
#         fbwn[bank, bin] * Re(cpsd_phat[bin] * steer[bin, delay]).
#     Threads outside that range keep the initial value 0.
#   * EVERY thread (in range or not) writes its result to
#         cc[x + y * WIDTH],  WIDTH = blockDim.x * gridDim.x
#     so the output row stride is the full launch width, not WINDOW, and `cc`
#     must hold at least (blockDim.y * gridDim.y) * (blockDim.x * gridDim.x)
#     floats.
#   * XM / YM are x / y reduced modulo the launch width / height; since x and y
#     are already non-negative and below those bounds, they equal x and y.
#   * fbwn is indexed row-major with row length BINS, steer row-major with row
#     length WINDOW.
mod = SourceModule("""
#include <pycuda-complex.hpp>
#include <stdio.h>

typedef pycuda::complex<float> cmplx;
__global__ void cc_gpu(float *fbwn, cmplx *cpsd_phat, cmplx *steer, float *cc, int BINS, int WINDOW, int BANKS)
{

    // fbwn      : input, array shape (40, 4096), of type float32
    // cpsd_phat : input, vector shape (4096,), of type complex64
    // steer     : input, array shape (4096, 121), of type complex64
    // cc        : output, array shape (40, 121), of type float32

    int x = threadIdx.x + blockIdx.x * blockDim.x;     
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    int WIDTH = blockDim.x * gridDim.x;
    int HEIGHT = blockDim.y * gridDim.y;

    int XM = (x + WIDTH) % WIDTH;
    int YM = (y + HEIGHT) % HEIGHT;

    int INDEX = XM + YM * WIDTH; // CUDA uses pseudo-mutimensional array indexing (i.e. it's a 1D flat array)

    int bank = y;
    int delay = x;

    float Pvalue = 0.f; 

    if(x < WINDOW && y < BANKS)
    {  
        for (int bin = 0; bin < BINS; bin++)
        {
            if (fbwn[bank * BINS + bin] > 0)
            {
                //Pvalue += fbwn[y,f] * (cpsd_phat[f] * steer[f,x]).real(); // Python equivalent code for reference                                        
                Pvalue += (fbwn[bank * BINS + bin] * (cpsd_phat[bin] * steer[bin * WINDOW + delay]).real());
            }
        }
    }  
    cc[INDEX] = Pvalue; 
}
""")

def gcc_phat_fbanks_gpu3(cpsd, fbwn_gpu, steer_gpu, ccfb_gpu, zoom, eps):
    """Compute GCC-PHAT on filter banks with the ``cc_gpu`` CUDA kernel.

    The CPSD is PHAT-normalised on the host, uploaded to the device, and the
    kernel then accumulates, for every (filter bank, sample delay) pair, the
    weighted real part of ``cpsd_phat * steer`` over all frequency bins.

    Args:
        cpsd: 1D host array of length 'nbins' of Cross Power Spectral
            Density, i.e. a measure of both the power shared by a given
            frequency for two signals, and the phase shift between the two
            signals at that frequency. Complex-valued; it is cast to
            complex64 before upload. Typical shape is (4096,) if the FFT has
            4096 bins.
        fbwn_gpu: 2D device array of normalised filter bank weights, indexed
            by (filter bank, frequency bin). The kernel reads it as float32
            with row length ``len(cpsd)``. Typical shape is (40, 4096).
        steer_gpu: 2D device array of steering factors, indexed by
            (frequency bin, sample delay). The kernel reads it as complex64
            with row length ``2 * zoom + 1``. Typical shape is (4096, 121)
            for +/- 60 sample delays and 4096 frequency bins.
        ccfb_gpu: 2D float32 device array that receives the kernel output.
            The kernel writes one value per launched thread using the full
            launch width as row stride, so this buffer must have
            ``GRID_Y * BLOCK_Y`` rows of exactly ``GRID_X * BLOCK_X``
            elements (e.g. (64, 128) for 40 banks and 121 delays).
        zoom: constant used to dimension the sample delay axis. E.g.
            zoom = 60 gives a range of +/- 60 sample delays.
        eps: small constant added to the CPSD_PHAT denominator for numerical
            stability, as well as to suppress low energy bins.

    Returns:
        2D host array of GCC-PHAT on filter banks, indexed by
        (filter bank, sample delay), i.e. ``ccfb_gpu`` copied back and
        cropped to ``(BANKS, WINDOW)``. Typical shape is (40, 121).
    """
    # Kernel scalar arguments must be explicitly sized (int32) for PyCUDA.
    WINDOW = np.int32((2 * zoom) + 1)
    # Read the bank count from the array shape rather than slicing a column
    # view of the device array just to take its length.
    BANKS = np.int32(fbwn_gpu.shape[0])
    BINS = np.int32(len(cpsd))

    # PHAT weighting: keep only the phase of each bin. Array shape (4096,)
    cpsd_phat = (cpsd / (np.abs(cpsd) + eps)).astype(np.complex64)
    cpsd_phat_gpu = gpuarray.to_gpu(cpsd_phat)

    BLOCK_X = 32  # Number of threads in block X direction (X * Y must be < 1025)
    BLOCK_Y = 32  # Number of threads in block Y direction (X * Y must be < 1025)

    GRID_X = math.ceil(WINDOW / BLOCK_X)  # Required number of blocks in X direction of grid
    GRID_Y = math.ceil(BANKS / BLOCK_Y)   # Required number of blocks in Y direction of grid

    # NOTE(review): the reported "cuFuncSetBlockShape failed: invalid resource
    # handle" is raised by this launch. It presumably means the CUDA context
    # in which `mod` was compiled is no longer the current context on the
    # calling thread (e.g. after TensorFlow has set up its own) -- confirm,
    # and if so make the PyCUDA context current around this call
    # (context.push() before, context.pop() after).
    func = mod.get_function("cc_gpu")
    func(
        fbwn_gpu, cpsd_phat_gpu, steer_gpu, ccfb_gpu, BINS, WINDOW, BANKS,
        block=(BLOCK_X, BLOCK_Y, 1), grid=(GRID_X, GRID_Y, 1),
    )

    # Drop the padding rows/columns written by out-of-range threads.
    return ccfb_gpu.get()[:BANKS, :WINDOW]

Could anybody advise how to fix this error?