Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES

Can you help me solve this [701] error? I am performing block image processing for four cycles. The first cycle runs normally, but the second cycle reports an error?

import sys

import numpy as np
from numba import cuda
import time as t
import cv2


def read_unsigned_shorts(file_path):
    with open(file_path, 'rb') as file:
        # 读取所有数据并转换为numpy数组,dtype=np.uint16表示每个像素值是8位无符号整数
        raw_data = np.fromfile(file, dtype=np.uint16)
        return raw_data


# 新增函数:读取8位三通道RAW图像
def read_8bit_rgb_raw(file_path, width, height):
    """
    读取8位三通道RAW格式的RGB图像。

    参数:
        file_path (str): 文件路径。
        width (int): 图像宽度。
        height (int): 图像高度。

    返回:
        numpy.ndarray: 形状为(height, width, 3)的RGB图像数组。
    """
    with open(file_path, 'rb') as file:
        # 读取所有数据并转换为numpy数组,dtype=np.uint8表示每个像素值是8位无符号整数
        raw_data = np.fromfile(file, dtype=np.uint8)

        # 确保我们读取的数据与指定尺寸匹配
        if len(raw_data) != width * height * 3:
            raise ValueError("文件大小与指定的维度不匹配")

        return raw_data


def rgb_to_hsl_cuda(rgb, height, width):
    """
    使用CUDA加速将RGB图像转换为HSL图像。

    参数:
        rgb (numpy.ndarray): 形状为(height, width, 3)的RGB图像数组,类型为np.uint8。

    返回:
        numpy.ndarray: 形状为(height, width, 3)的HSL图像数组,类型为np.uint16。
    """

    # 将输入数据复制到设备
    rgb_d = cuda.to_device(rgb)
    h_d = cuda.device_array((height, width), dtype=np.float32)
    s_d = cuda.device_array((height, width), dtype=np.float32)
    l_d = cuda.device_array((height, width), dtype=np.float32)

    # 启动核函数
    _rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)

    # 等待GPU
    cuda.synchronize()

    # 释放显存

    return h_d, s_d, l_d  # 返回显存数组句柄


@cuda.jit
def _rgb_to_hsl_kernel(rgb, h, s, l, width, height):
    row = cuda.grid(1)  # 行数
    # 每个函数处理一行
    if row < height:
        # 每行处理逻辑
        for col in range(width):
            # 提取每个像素的rgb值
            r = rgb[row, col, 0]
            g = rgb[row, col, 1]
            b = rgb[row, col, 2]
            # Step 1: 计算极值
            C_max = max(r, g, b)
            C_min = min(r, g, b)
            delta = C_max - C_min

            # Step 2: 亮度L (0-255)
            L = (C_max + C_min) / 2

            # Step 3: 饱和度S (0-255)
            if delta == 0:
                S = 0
            elif L <= 127:  # L≤0.5的等效条件
                denominator = C_max + C_min
                S = (delta * 255) / denominator if denominator > 0 else 0
            else:
                denominator = 510 - C_max - C_min
                S = (delta * 255) / denominator if denominator > 0 else 0

            # Step 4: 色相H (0-255)
            if delta == 0:
                H = 0
            else:
                if C_max == r:
                    H_temp = ((g - b) * 60) / delta
                elif C_max == g:
                    H_temp = ((b - r) * 60) / delta + 120
                else:  # C_max == b
                    H_temp = ((r - g) * 60) / delta + 240

                H_temp = H_temp % 360 if H_temp >= 0 else H_temp + 360
                H = (H_temp * 255) / 360
            # 将结果存储到数组中
            h[row, col] = H
            s[row, col] = S
            l[row, col] = L


def hsl_to_rgb_cuda(h_d, s_d, l_d, height, width):
    """
    使用CUDA加速将HSL图像转换为RGB图像。

    参数:
        h (numpy.ndarray): 形状为(height* width)的H通道数组,类型为np.float32。
        s (numpy.ndarray): 形状为(height* width)的S通道数组,类型为np.float32。
        l (numpy.ndarray): 形状为(height* width)的L通道数组,类型为np.float32。
        height (int): 图像高度。
        width (int): 图像宽度。

    返回:
        numpy.ndarray: 形状为(height* width* 3)的RGB图像数组,类型为np.uint8。
    """

    # 在设备上开辟RGB数组
    rgb_d = cuda.device_array((height, width, 3), dtype=np.uint8)

    # 启动核函数
    _hsl_to_rgb_kernel[128, 74](h_d, s_d, l_d, rgb_d, width, height)

    # 等待GPU
    cuda.synchronize()

    # 将结果从设备拷贝回主机
    rgb = rgb_d.copy_to_host()

    # 释放显存
    cuda.close()

    return rgb


@cuda.jit
def _hsl_to_rgb_kernel(h, s, l, rgb, width, height):
    row = cuda.grid(1)
    if row < height:
        for col in range(width):
            # 计算索引

            h_val = h[row, col] * 360.0 / 255.0  # 反归一化H值
            s_val = s[row, col] / 255.0  # 归一化S值
            l_val = l[row, col] / 255.0  # 归一化L值

            # HSL到RGB转换算法
            c = (1 - abs(2 * l_val - 1)) * s_val
            x = c * (1 - abs((h_val / 60.0) % 2 - 1))
            m = l_val - c / 2

            # 初始化RGB临时值
            r_temp, g_temp, b_temp = 0, 0, 0

            # 根据H值区间确定颜色分量
            if h_val < 60:
                r_temp, g_temp, b_temp = c, x, 0
            elif h_val < 120:
                r_temp, g_temp, b_temp = x, c, 0
            elif h_val < 180:
                r_temp, g_temp, b_temp = 0, c, x
            elif h_val < 240:
                r_temp, g_temp, b_temp = 0, x, c
            elif h_val < 300:
                r_temp, g_temp, b_temp = x, 0, c
            else:
                r_temp, g_temp, b_temp = c, 0, x

            # 计算最终RGB值并转换为uint8
            rgb[row, col, 0] = int(round((r_temp + m) * 255))
            rgb[row, col, 1] = int(round((g_temp + m) * 255))
            rgb[row, col, 2] = int(round((b_temp + m) * 255))


def upcy_cuda(s, h, height, width):
    # 新建内插后数组
    s2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float32)
    h2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float32)
    # 启动核函数
    _upcy_kernel[128, 80](s, h, s2_d, h2_d, 2 * height, 2 * width)
    #  等待GPU
    cuda.synchronize()
    # 返回内插后的句柄
    return s2_d, h2_d


@cuda.jit
def _upcy_kernel(s, h, s2, h2, height, width):
    # 大图像的行数
    row2 = cuda.grid(1)
    if row2 < height:
        for col2 in range(width):  # 大图像的列数
            row, col = int(row2 / 2), int(col2 / 2)  # 原始图像索引,并且取了整
            if row2 % 2 == 0 and col2 % 2 == 0:  # 偶数行偶数列
                s2[row2, col2] = s[row, col]
                h2[row2, col2] = h[row, col]  # 直接赋值
            else:
                s2[row2, col2] = (s[row, col] + s[row, col + 1] + s[row + 1, col] + s[row + 1, col + 1]) / 4  # 内插步骤
                h2[row2, col2] = (h[row, col] + h[row, col + 1] + h[row + 1, col] + h[row + 1, col + 1]) / 4


if __name__ == "__main__":
    file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\RGB.raw"
    width = 10992
    height = 10992
    rgb_image = read_8bit_rgb_raw(file_path, width, height)
    rgb_image = rgb_image.reshape(height, width, 3)
    # 读取高分辨率灰度图像
    file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\band_CH02.raw"
    gray_image = np.array(read_unsigned_shorts(file_path))
    gray_image = gray_image.reshape(height * 2, width * 2)
    # 将灰度图像转为8位存储
    gray_image = gray_image.astype(np.uint8)

    # 划分子块
    # 定义子块尺寸
    block_width = width // 2
    block_height = height // 2

    # 划分四个子块
    rgb_blocks = [
        rgb_image[:block_height, :block_width, :],  # 左上
        rgb_image[:block_height, block_width:, :],  # 右上
        rgb_image[block_height:, :block_width, :],  # 左下
        rgb_image[block_height:, block_width:, :]]
    gray_blocks = [
        gray_image[:2 * block_height, :2 * block_width],  # 左上
        gray_image[:2 * block_height, 2 * block_width:],  # 右上
        gray_image[2 * block_height:, :2 * block_width],  # 左下
        gray_image[2 * block_height:, 2 * block_width:]]
    del rgb_image, gray_image
    sys.stdout.flush()
    dawn = t.time()
    # 定义结果数组
    result = []
    # 分块处理
    for i in range(4):
        #  将RGB图像转换为HSL图像,并确保内存连续
        rgb_image = np.ascontiguousarray(rgb_blocks[i])
        gray_image = np.ascontiguousarray(gray_blocks[i])

        # HSL正变换
        h_d, s_d, l_d = rgb_to_hsl_cuda(rgb_image, height // 2, width // 2)
        # 内插S、H
        s2_d, h2_d = upcy_cuda(s_d, h_d, height // 2, width // 2)
        # 将S,H拷贝回内存
        s2 = s2_d.copy_to_host()
        h2 = h2_d.copy_to_host()
        # 清除上下文
        #cuda.close()
        # 将灰度图作为h2_d直接拷贝到设备,再把别的两个也拷贝过去
        h2_d = cuda.to_device(h2)
        s2_d = cuda.to_device(s2)
        l2_d = cuda.to_device(gray_image)
        del h_d, s_d, l_d
        rgb_image1 = hsl_to_rgb_cuda(h2_d, s2_d, l2_d, height, width)
        result.append(rgb_image1)
        #cuda.close()
        del s2_d, h2_d, l2_d
        print('处理子块 {}'.format(i))
    # 合并子块
    top = np.hstack((result[0], result[1]))
    bottom = np.hstack((result[2], result[3]))
    final_image = np.vstack((top, bottom))
    dusk = t.time()
    print('\t(done in {:.2f}s)'.format(dusk - dawn))

The error message is as follows:

已连接到 pydev 调试器(内部版本号 232.9559.58)处理子块 0
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2199, in <module>
    main()
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2181, in main
    globals = debugger.run(setup['file'], None, None, is_module)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1493, in run
    return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1500, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:\Users\24929\PycharmProjects\exams\图像融合.py", line 256, in <module>
    h_d, s_d, l_d = rgb_to_hsl_cuda(rgb_image, height // 2, width // 2)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\24929\PycharmProjects\exams\图像融合.py", line 58, in rgb_to_hsl_cuda
    _rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 539, in __call__
    return self.dispatcher.call(args, self.griddim, self.blockdim,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 683, in call
    kernel.launch(args, griddim, blockdim, stream, sharedmem)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 327, in launch
    driver.launch_kernel(cufunc.handle,
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 2563, in launch_kernel
    driver.cuLaunchKernel(cufunc_handle,
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 327, in safe_cuda_api_call
    self._check_ctypes_error(fname, retcode)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 395, in _check_ctypes_error
    raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [701] Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES

I am a beginner in CUDA programming and cannot find a solution online, so I am here for help. Thank you for your assistance

This is often a registers-per-thread issue, so I would start down that avenue. This post covers it in more detail, including the items linked from that post.

as a first step, try changing this:

@cuda.jit
def _rgb_to_hsl_kernel(rgb, h, s, l, width, height):

to this:

@cuda.jit(max_registers=64)
def _rgb_to_hsl_kernel(rgb, h, s, l, width, height):

if that does not resolve the issue, then there is some other problem. (I’m a little skeptical about my own advice because the launch config does not look problematic even with high register usage:

_rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)
                  ^^^^^^^^^

but as a diagnostic, I would still try that first.)

I tried this change, but the program still has the same error

It seems that there may be situations where CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES may be used to signal a mismatch between the supplied arguments and the parameters (number, kind, shape) that the kernel was compiled with. It’s not obvious to me that is happening here, but I’ve not really carefully studied this extraction:

which impact the kernel invocation here:

I guess as a diagnostic, it might be interesting to print the shape of rgb_d on each loop pass. I don’t think changes in sizes of dimensions should matter, but changes in number of dimensions might. It’s not obvious to me the extraction would cause that, but I’d be interested in the diagnostic, anyway.

I have made a new discovery: in the loop of my main function, there is no problem calling the kernel function for the first time. However, when the loop reaches the second time, the kernel function bursts CUDA-ERROR_LAUNCH_OUT_SOF-RESOURCES. I copied and pasted the code of the kernel function completely and renamed it. In the second loop, I called this copied and pasted function - at the same position, there will be no error - which means that the kernel function can only be started once, and an error will be thrown on the second start? What is the reason for this? The code I modified is as follows:

import sys

import numpy as np
from numba import cuda
import time as t
import cv2


def read_unsigned_shorts(file_path):
    with open(file_path, 'rb') as file:
        # 读取所有数据并转换为numpy数组,dtype=np.uint16表示每个像素值是8位无符号整数
        raw_data = np.fromfile(file, dtype=np.uint16)
        return raw_data


# 新增函数:读取8位三通道RAW图像
def read_8bit_rgb_raw(file_path, width, height):
    """
    读取8位三通道RAW格式的RGB图像。

    参数:
        file_path (str): 文件路径。
        width (int): 图像宽度。
        height (int): 图像高度。

    返回:
        numpy.ndarray: 形状为(height, width, 3)的RGB图像数组。
    """
    with open(file_path, 'rb') as file:
        # 读取所有数据并转换为numpy数组,dtype=np.uint8表示每个像素值是8位无符号整数
        raw_data = np.fromfile(file, dtype=np.uint8)

        # 确保我们读取的数据与指定尺寸匹配
        if len(raw_data) != width * height * 3:
            raise ValueError("文件大小与指定的维度不匹配")

        return raw_data


def rgb_to_hsl_cuda(rgb, height, width):
    """
    使用CUDA加速将RGB图像转换为HSL图像。

    参数:
        rgb (numpy.ndarray): 形状为(height, width, 3)的RGB图像数组,类型为np.uint8。

    返回:
        numpy.ndarray: 形状为(height, width, 3)的HSL图像数组,类型为np.uint16。
    """

    # 将输入数据复制到设备
    rgb_d = cuda.to_device(rgb)
    h_d = cuda.device_array((height, width), dtype=np.float16)
    s_d = cuda.device_array((height, width), dtype=np.float16)
    l_d = cuda.device_array((height, width), dtype=np.float16)

    # 启动核函数
    _rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)

    # 等待GPU
    cuda.synchronize()

    # 释放显存

    return h_d, s_d, l_d  # 返回显存数组句柄


@cuda.jit
def _rgb_to_hsl_kernel(rgb, h, s, l, width, height):
    row = cuda.grid(1)  # 行数
    # 每个函数处理一行
    if row < height:
        # 每行处理逻辑
        for col in range(width):
            # 提取每个像素的rgb值
            r = rgb[row, col, 0]
            g = rgb[row, col, 1]
            b = rgb[row, col, 2]
            # Step 1: 计算极值
            C_max = max(r, g, b)
            C_min = min(r, g, b)
            delta = C_max - C_min

            # Step 2: 亮度L (0-255)
            L = (C_max + C_min) / 2

            # Step 3: 饱和度S (0-255)
            if delta == 0:
                S = 0
            elif L <= 127:  # L≤0.5的等效条件
                denominator = C_max + C_min
                S = (delta * 255) / denominator if denominator > 0 else 0
            else:
                denominator = 510 - C_max - C_min
                S = (delta * 255) / denominator if denominator > 0 else 0

            # Step 4: 色相H (0-255)
            if delta == 0:
                H = 0
            else:
                if C_max == r:
                    H_temp = ((g - b) * 60) / delta
                elif C_max == g:
                    H_temp = ((b - r) * 60) / delta + 120
                else:  # C_max == b
                    H_temp = ((r - g) * 60) / delta + 240

                H_temp = H_temp % 360 if H_temp >= 0 else H_temp + 360
                H = (H_temp * 255) / 360
            # 将结果存储到数组中
            h[row, col] = H
            s[row, col] = S
            l[row, col] = L

def rgb_to_hsl_cuda1(rgb, height, width):
    """
    使用CUDA加速将RGB图像转换为HSL图像。

    参数:
        rgb (numpy.ndarray): 形状为(height, width, 3)的RGB图像数组,类型为np.uint8。

    返回:
        numpy.ndarray: 形状为(height, width, 3)的HSL图像数组,类型为np.uint16。
    """

    # 将输入数据复制到设备
    rgb_d = cuda.to_device(rgb)
    h_d = cuda.device_array((height, width), dtype=np.float16)
    s_d = cuda.device_array((height, width), dtype=np.float16)
    l_d = cuda.device_array((height, width), dtype=np.float16)

    # 启动核函数
    _rgb_to_hsl_kernel1[128, 74](rgb_d, h_d, s_d, l_d, height, width)

    # 等待GPU
    cuda.synchronize()

    # 释放显存

    return h_d, s_d, l_d  # 返回显存数组句柄


@cuda.jit
def _rgb_to_hsl_kernel1(rgb, h, s, l, width, height):
    row = cuda.grid(1)  # 行数
    # 每个函数处理一行
    if row < height:
        # 每行处理逻辑
        for col in range(width):
            # 提取每个像素的rgb值
            r = rgb[row, col, 0]
            g = rgb[row, col, 1]
            b = rgb[row, col, 2]
            # Step 1: 计算极值
            C_max = max(r, g, b)
            C_min = min(r, g, b)
            delta = C_max - C_min

            # Step 2: 亮度L (0-255)
            L = (C_max + C_min) / 2

            # Step 3: 饱和度S (0-255)
            if delta == 0:
                S = 0
            elif L <= 127:  # L≤0.5的等效条件
                denominator = C_max + C_min
                S = (delta * 255) / denominator if denominator > 0 else 0
            else:
                denominator = 510 - C_max - C_min
                S = (delta * 255) / denominator if denominator > 0 else 0

            # Step 4: 色相H (0-255)
            if delta == 0:
                H = 0
            else:
                if C_max == r:
                    H_temp = ((g - b) * 60) / delta
                elif C_max == g:
                    H_temp = ((b - r) * 60) / delta + 120
                else:  # C_max == b
                    H_temp = ((r - g) * 60) / delta + 240

                H_temp = H_temp % 360 if H_temp >= 0 else H_temp + 360
                H = (H_temp * 255) / 360
            # 将结果存储到数组中
            h[row, col] = H
            s[row, col] = S
            l[row, col] = L

def hsl_to_rgb_cuda(h_d, s_d, l_d, height, width):
    """
    使用CUDA加速将HSL图像转换为RGB图像。

    参数:
        h (numpy.ndarray): 形状为(height* width)的H通道数组,类型为np.float16。
        s (numpy.ndarray): 形状为(height* width)的S通道数组,类型为np.float16。
        l (numpy.ndarray): 形状为(height* width)的L通道数组,类型为np.float16。
        height (int): 图像高度。
        width (int): 图像宽度。

    返回:
        numpy.ndarray: 形状为(height* width* 3)的RGB图像数组,类型为np.uint8。
    """

    # 在设备上开辟RGB数组
    rgb_d = cuda.device_array((height, width, 3), dtype=np.uint8)

    # 启动核函数
    _hsl_to_rgb_kernel[128, 74](h_d, s_d, l_d, rgb_d, width, height)

    # 等待GPU
    cuda.synchronize()

    # 将结果从设备拷贝回主机
    rgb = rgb_d.copy_to_host()

    # 释放显存
    cuda.close()

    return rgb


@cuda.jit
def _hsl_to_rgb_kernel(h, s, l, rgb, width, height):
    row = cuda.grid(1)
    if row < height:
        for col in range(width):
            # 计算索引

            h_val = h[row, col] * 360.0 / 255.0  # 反归一化H值
            s_val = s[row, col] / 255.0  # 归一化S值
            l_val = l[row, col] / 255.0  # 归一化L值

            # HSL到RGB转换算法
            c = (1 - abs(2 * l_val - 1)) * s_val
            x = c * (1 - abs((h_val / 60.0) % 2 - 1))
            m = l_val - c / 2

            # 初始化RGB临时值
            r_temp, g_temp, b_temp = 0, 0, 0

            # 根据H值区间确定颜色分量
            if h_val < 60:
                r_temp, g_temp, b_temp = c, x, 0
            elif h_val < 120:
                r_temp, g_temp, b_temp = x, c, 0
            elif h_val < 180:
                r_temp, g_temp, b_temp = 0, c, x
            elif h_val < 240:
                r_temp, g_temp, b_temp = 0, x, c
            elif h_val < 300:
                r_temp, g_temp, b_temp = x, 0, c
            else:
                r_temp, g_temp, b_temp = c, 0, x

            # 计算最终RGB值并转换为uint8
            rgb[row, col, 0] = int(round((r_temp + m) * 255))
            rgb[row, col, 1] = int(round((g_temp + m) * 255))
            rgb[row, col, 2] = int(round((b_temp + m) * 255))


def upcy_cuda(s, h, height, width):
    # 新建内插后数组
    s2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float16)
    h2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float16)
    # 启动核函数
    _upcy_kernel[128, 80](s, h, s2_d, h2_d, 2 * height, 2 * width)
    #  等待GPU
    cuda.synchronize()
    # 返回内插后的句柄
    return s2_d, h2_d


@cuda.jit
def _upcy_kernel(s, h, s2, h2, height, width):
    # 大图像的行数
    row2 = cuda.grid(1)
    if row2 < height:
        for col2 in range(width):  # 大图像的列数
            row, col = int(row2 / 2), int(col2 / 2)  # 原始图像索引,并且取了整
            if row2 % 2 == 0 and col2 % 2 == 0:  # 偶数行偶数列
                s2[row2, col2] = s[row, col]
                h2[row2, col2] = h[row, col]  # 直接赋值
            else:
                s2[row2, col2] = (s[row, col] + s[row, col + 1] + s[row + 1, col] + s[row + 1, col + 1]) / 4  # 内插步骤
                h2[row2, col2] = (h[row, col] + h[row, col + 1] + h[row + 1, col] + h[row + 1, col + 1]) / 4


if __name__ == "__main__":
    file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\RGB.raw"
    width = 10992
    height = 10992
    rgb_image = read_8bit_rgb_raw(file_path, width, height)
    rgb_image = rgb_image.reshape(height, width, 3)
    # 读取高分辨率灰度图像
    file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\band_CH02.raw"
    gray_image = np.array(read_unsigned_shorts(file_path))
    gray_image = gray_image.reshape(height * 2, width * 2)
    # 将灰度图像转为8位存储
    gray_image = gray_image.astype(np.uint8)

    # 划分子块
    # 定义子块尺寸
    block_width = width // 2
    block_height = height // 2

    # 划分四个子块
    rgb_blocks = [
        rgb_image[:block_height, :block_width, :],  # 左上
        rgb_image[:block_height, block_width:, :],  # 右上
        rgb_image[block_height:, :block_width, :],  # 左下
        rgb_image[block_height:, block_width:, :]]
    gray_blocks = [
        gray_image[:2 * block_height, :2 * block_width],  # 左上
        gray_image[:2 * block_height, 2 * block_width:],  # 右上
        gray_image[2 * block_height:, :2 * block_width],  # 左下
        gray_image[2 * block_height:, 2 * block_width:]]
    del rgb_image, gray_image
    sys.stdout.flush()
    dawn = t.time()
    # 定义结果数组
    result = []
    # 分块处理
    for i in range(4):
        #  将RGB图像转换为HSL图像,并确保内存连续
        rgb_image = np.ascontiguousarray(rgb_blocks[i])
        gray_image = np.ascontiguousarray(gray_blocks[i])

        # HSL正变换
        if i!=1:
            h_d, s_d, l_d = rgb_to_hsl_cuda(rgb_image, height // 2, width // 2)
        else:
             h_d, s_d, l_d = rgb_to_hsl_cuda1(rgb_image, height // 2, width // 2)
        # 内插S、H
        s2_d, h2_d = upcy_cuda(s_d, h_d, height // 2, width // 2)
        # 将S,H拷贝回内存
        s2 = s2_d.copy_to_host()
        h2 = h2_d.copy_to_host()
        # 清除上下文
        cuda.close()

        del s2_d, h2_d, l_d
        # 将灰度图作为h2_d直接拷贝到设备,再把别的两个也拷贝过去
        h2_d = cuda.to_device(h2)
        s2_d = cuda.to_device(s2)
        l2_d = cuda.to_device(gray_image)

        rgb_image1 = hsl_to_rgb_cuda(h2_d, s2_d, l2_d, height, width)
        result.append(rgb_image1)
        cuda.close()
        del s2_d, h2_d, l2_d
        print('处理子块 {}'.format(i))
    # 合并子块
    top = np.hstack((result[0], result[1]))
    bottom = np.hstack((result[2], result[3]))
    final_image = np.vstack((top, bottom))
    dusk = t.time()
    print('\t(done in {:.2f}s)'.format(dusk - dawn))

The output is as follows:

已连接到 pydev 调试器(内部版本号 232.9559.58)处理子块 0
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2199, in <module>
    main()
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2181, in main
    globals = debugger.run(setup['file'], None, None, is_module)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1493, in run
    return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1500, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:\Users\24929\PycharmProjects\exams\图像融合(分块).py", line 335, in <module>
    s2_d, h2_d = upcy_cuda(s_d, h_d, height // 2, width // 2)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\24929\PycharmProjects\exams\图像融合(分块).py", line 267, in upcy_cuda
    _upcy_kernel[128, 80](s, h, s2_d, h2_d, 2 * height, 2 * width)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 539, in __call__
    return self.dispatcher.call(args, self.griddim, self.blockdim,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 683, in call
    kernel.launch(args, griddim, blockdim, stream, sharedmem)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 327, in launch
    driver.launch_kernel(cufunc.handle,
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 2563, in launch_kernel
    driver.cuLaunchKernel(cufunc_handle,
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 327, in safe_cuda_api_call
    self._check_ctypes_error(fname, retcode)
  File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 395, in _check_ctypes_error
    raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [701] Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES

Some Python functions and frameworks have a bit of an unpredictable way of handling dimensions with a size of 1. Sometimes they are treated as a full dimension, sometimes they are ignored.

But as the latest post shows, seems to not have been the reason in this case.

My guess is what I have suggested already: one of the shapes of the input arguments is changing in a meaningful way. To investigate that, for me I would want to see the shape info I already suggested: