PyCUDA WARNING: a clean-up operation failed

I’m a designer with no coding skills. I recently discovered that using CUDA is more than 10 times faster than using the CPU for image synthesis. However, when I run the image synthesis script made with PyCUDA using GPT, I encounter an error:

PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
Traceback (most recent call last):
  File "mapHE4b.py", line 102, in merge_diamond_images_pycuda
    image_gpu = cuda.mem_alloc(image.nbytes)
pycuda._driver.LogicError: cuMemAlloc failed: an illegal memory access was encountered

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "mapHE4b.py", line 153, in <module>
    merge_diamond_images_pycuda(map_id)
  File "mapHE4b.py", line 135, in merge_diamond_images_pycuda
    except cuda.CudaError as e:
AttributeError: module 'pycuda.driver' has no attribute 'CudaError'
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: an illegal memory access was encountered

The error occurs only when synthesizing more than 3? images; it works fine with fewer images, and the hardware resources are far from being fully utilized.

Here is the script content:

import subprocess
import sys
import time
import os
import numpy as np
import cv2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

def check_and_install_package(package_name, import_name=None):
    if import_name is None:
        import_name = package_name
    try:
        __import__(import_name)
    except ImportError:
        print(f"{package_name} not found, installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

# Ensure necessary packages are installed
check_and_install_package('setuptools')
check_and_install_package('pycuda')
check_and_install_package('opencv-python')

# CUDA kernel for image merging
cuda_code = """
__global__ void merge_images(unsigned char* canvas, const unsigned char* image, 
                             int canvas_width, int canvas_height, 
                             int image_width, int image_height,
                             int x_offset, int y_offset) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (x < image_width && y < image_height) {
        int canvas_x = x + x_offset;
        int canvas_y = y + y_offset;
        
        if (canvas_x >= 0 && canvas_x < canvas_width && canvas_y >= 0 && canvas_y < canvas_height) {
            int canvas_idx = (canvas_y * canvas_width + canvas_x) * 4;
            int image_idx = (y * image_width + x) * 4;
            
            float alpha = image[image_idx + 3] / 255.0f;
            for (int c = 0; c < 3; c++) {
                canvas[canvas_idx + c] = (unsigned char)(
                    alpha * image[image_idx + c] + 
                    (1 - alpha) * canvas[canvas_idx + c]
                );
            }
            canvas[canvas_idx + 3] = max(canvas[canvas_idx + 3], image[image_idx + 3]);
        }
    }
}
"""

# Compile the CUDA kernel
mod = SourceModule(cuda_code)
merge_images = mod.get_function("merge_images")

def merge_diamond_images_pycuda(map_id):
    start_time = time.time()
    
    # Get all png files in the current directory
    png_files = [f for f in os.listdir('.') if f.endswith('.png') and f.startswith(str(map_id))]
    
    # Filter out files that do not match the format
    valid_files = [f for f in png_files if len(f.split('_')) == 3]
    
    # Extract unique N and M values
    n_values = sorted(set(int(f.split('_')[1]) for f in valid_files))
    m_values = sorted(set(int(f.split('_')[2].split('.')[0]) for f in valid_files))
    
    sN = len(n_values)
    sM = len(m_values)
    
    # Calculate overall canvas size
    canvas_width = int((sN + sM) / 2 * 6144)
    canvas_height = int((sN + sM) / 2 * 3072)
    
    # If there are no valid image files, skip processing
    if not valid_files:
        print(f"No valid image files found, skipping processing")
        return
    
    try:
        # Create canvas
        canvas = cuda.mem_alloc(canvas_width * canvas_height * 4)
        cuda.memset_d8(canvas, 0, canvas_width * canvas_height * 4)
        
        # Draw diamond images
        for n in reversed(n_values):
            for m in reversed(m_values):
                filename = f"{map_id}_{n}_{m}.png"
                if filename in valid_files:
                    image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
                    if image.shape[2] == 3:
                        image = cv2.cvtColor(image, cv2.COLOR_BGR2BGRA)
                    
                    x = (n_values.index(n) + m_values.index(m)) * 3072 - 768
                    y = (sN - 1 - n_values.index(n) + m_values.index(m)) * 1536 - 792
                    
                    # Allocate memory on GPU for the image
                    image_gpu = cuda.mem_alloc(image.nbytes)
                    cuda.memcpy_htod(image_gpu, image)
                    
                    # Set up grid and block dimensions
                    block_dim = (32, 32, 1)
                    grid_dim = ((image.shape[1] + block_dim[0] - 1) // block_dim[0],
                                (image.shape[0] + block_dim[1] - 1) // block_dim[1])
                    
                    # Call CUDA kernel
                    merge_images(canvas, image_gpu, np.int32(canvas_width), np.int32(canvas_height),
                                 np.int32(image.shape[1]), np.int32(image.shape[0]),
                                 np.int32(x), np.int32(y),
                                 block=block_dim, grid=grid_dim)
                    
                    # Free GPU memory for the image
                    image_gpu.free()
        
        # Copy result back to CPU
        result = np.zeros((canvas_height, canvas_width, 4), dtype=np.uint8)
        cuda.memcpy_dtoh(result, canvas)
        
        # Save the result
        output_filename = f"{map_id}_all.png"
        cv2.imwrite(output_filename, result)
        
        end_time = time.time()
        elapsed_time = end_time - start_time
        
        print(f"Merge complete, saved as {output_filename}")
        print(f"Processing time: {elapsed_time:.2f} seconds")
    
    except cuda.MemoryError:
        print("Insufficient GPU memory. Try using smaller images or increasing GPU memory.")
    except cuda.CudaError as e:
        print(f"CUDA error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Free GPU memory
        if 'canvas' in locals():
            canvas.free()

# Get all unique map_ids
map_ids = sorted(set(f.split('_')[0] for f in os.listdir('.') if f.endswith('.png')))

# Process all map_ids or a specified map_id
map_id = None  # Process images for all map_ids
if map_id:
    merge_diamond_images_pycuda(map_id)
else:
    for map_id in map_ids:
        merge_diamond_images_pycuda(map_id)

I had GPT review all relevant error resolution methods, but it did not return the correct result.
Could someone please help me out?