Can you help me solve this [701] error? I am performing block image processing for four cycles. The first cycle runs normally, but the second cycle reports an error?
import sys
import numpy as np
from numba import cuda
import time as t
import cv2
def read_unsigned_shorts(file_path):
with open(file_path, 'rb') as file:
# 读取所有数据并转换为numpy数组,dtype=np.uint16表示每个像素值是8位无符号整数
raw_data = np.fromfile(file, dtype=np.uint16)
return raw_data
# 新增函数:读取8位三通道RAW图像
def read_8bit_rgb_raw(file_path, width, height):
"""
读取8位三通道RAW格式的RGB图像。
参数:
file_path (str): 文件路径。
width (int): 图像宽度。
height (int): 图像高度。
返回:
numpy.ndarray: 形状为(height, width, 3)的RGB图像数组。
"""
with open(file_path, 'rb') as file:
# 读取所有数据并转换为numpy数组,dtype=np.uint8表示每个像素值是8位无符号整数
raw_data = np.fromfile(file, dtype=np.uint8)
# 确保我们读取的数据与指定尺寸匹配
if len(raw_data) != width * height * 3:
raise ValueError("文件大小与指定的维度不匹配")
return raw_data
def rgb_to_hsl_cuda(rgb, height, width):
"""
使用CUDA加速将RGB图像转换为HSL图像。
参数:
rgb (numpy.ndarray): 形状为(height, width, 3)的RGB图像数组,类型为np.uint8。
返回:
numpy.ndarray: 形状为(height, width, 3)的HSL图像数组,类型为np.uint16。
"""
# 将输入数据复制到设备
rgb_d = cuda.to_device(rgb)
h_d = cuda.device_array((height, width), dtype=np.float32)
s_d = cuda.device_array((height, width), dtype=np.float32)
l_d = cuda.device_array((height, width), dtype=np.float32)
# 启动核函数
_rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)
# 等待GPU
cuda.synchronize()
# 释放显存
return h_d, s_d, l_d # 返回显存数组句柄
@cuda.jit
def _rgb_to_hsl_kernel(rgb, h, s, l, width, height):
row = cuda.grid(1) # 行数
# 每个函数处理一行
if row < height:
# 每行处理逻辑
for col in range(width):
# 提取每个像素的rgb值
r = rgb[row, col, 0]
g = rgb[row, col, 1]
b = rgb[row, col, 2]
# Step 1: 计算极值
C_max = max(r, g, b)
C_min = min(r, g, b)
delta = C_max - C_min
# Step 2: 亮度L (0-255)
L = (C_max + C_min) / 2
# Step 3: 饱和度S (0-255)
if delta == 0:
S = 0
elif L <= 127: # L≤0.5的等效条件
denominator = C_max + C_min
S = (delta * 255) / denominator if denominator > 0 else 0
else:
denominator = 510 - C_max - C_min
S = (delta * 255) / denominator if denominator > 0 else 0
# Step 4: 色相H (0-255)
if delta == 0:
H = 0
else:
if C_max == r:
H_temp = ((g - b) * 60) / delta
elif C_max == g:
H_temp = ((b - r) * 60) / delta + 120
else: # C_max == b
H_temp = ((r - g) * 60) / delta + 240
H_temp = H_temp % 360 if H_temp >= 0 else H_temp + 360
H = (H_temp * 255) / 360
# 将结果存储到数组中
h[row, col] = H
s[row, col] = S
l[row, col] = L
def hsl_to_rgb_cuda(h_d, s_d, l_d, height, width):
"""
使用CUDA加速将HSL图像转换为RGB图像。
参数:
h (numpy.ndarray): 形状为(height* width)的H通道数组,类型为np.float32。
s (numpy.ndarray): 形状为(height* width)的S通道数组,类型为np.float32。
l (numpy.ndarray): 形状为(height* width)的L通道数组,类型为np.float32。
height (int): 图像高度。
width (int): 图像宽度。
返回:
numpy.ndarray: 形状为(height* width* 3)的RGB图像数组,类型为np.uint8。
"""
# 在设备上开辟RGB数组
rgb_d = cuda.device_array((height, width, 3), dtype=np.uint8)
# 启动核函数
_hsl_to_rgb_kernel[128, 74](h_d, s_d, l_d, rgb_d, width, height)
# 等待GPU
cuda.synchronize()
# 将结果从设备拷贝回主机
rgb = rgb_d.copy_to_host()
# 释放显存
cuda.close()
return rgb
@cuda.jit
def _hsl_to_rgb_kernel(h, s, l, rgb, width, height):
row = cuda.grid(1)
if row < height:
for col in range(width):
# 计算索引
h_val = h[row, col] * 360.0 / 255.0 # 反归一化H值
s_val = s[row, col] / 255.0 # 归一化S值
l_val = l[row, col] / 255.0 # 归一化L值
# HSL到RGB转换算法
c = (1 - abs(2 * l_val - 1)) * s_val
x = c * (1 - abs((h_val / 60.0) % 2 - 1))
m = l_val - c / 2
# 初始化RGB临时值
r_temp, g_temp, b_temp = 0, 0, 0
# 根据H值区间确定颜色分量
if h_val < 60:
r_temp, g_temp, b_temp = c, x, 0
elif h_val < 120:
r_temp, g_temp, b_temp = x, c, 0
elif h_val < 180:
r_temp, g_temp, b_temp = 0, c, x
elif h_val < 240:
r_temp, g_temp, b_temp = 0, x, c
elif h_val < 300:
r_temp, g_temp, b_temp = x, 0, c
else:
r_temp, g_temp, b_temp = c, 0, x
# 计算最终RGB值并转换为uint8
rgb[row, col, 0] = int(round((r_temp + m) * 255))
rgb[row, col, 1] = int(round((g_temp + m) * 255))
rgb[row, col, 2] = int(round((b_temp + m) * 255))
def upcy_cuda(s, h, height, width):
# 新建内插后数组
s2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float32)
h2_d = cuda.device_array((2 * height, 2 * width), dtype=np.float32)
# 启动核函数
_upcy_kernel[128, 80](s, h, s2_d, h2_d, 2 * height, 2 * width)
# 等待GPU
cuda.synchronize()
# 返回内插后的句柄
return s2_d, h2_d
@cuda.jit
def _upcy_kernel(s, h, s2, h2, height, width):
# 大图像的行数
row2 = cuda.grid(1)
if row2 < height:
for col2 in range(width): # 大图像的列数
row, col = int(row2 / 2), int(col2 / 2) # 原始图像索引,并且取了整
if row2 % 2 == 0 and col2 % 2 == 0: # 偶数行偶数列
s2[row2, col2] = s[row, col]
h2[row2, col2] = h[row, col] # 直接赋值
else:
s2[row2, col2] = (s[row, col] + s[row, col + 1] + s[row + 1, col] + s[row + 1, col + 1]) / 4 # 内插步骤
h2[row2, col2] = (h[row, col] + h[row, col + 1] + h[row + 1, col] + h[row + 1, col + 1]) / 4
if __name__ == "__main__":
file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\RGB.raw"
width = 10992
height = 10992
rgb_image = read_8bit_rgb_raw(file_path, width, height)
rgb_image = rgb_image.reshape(height, width, 3)
# 读取高分辨率灰度图像
file_path = "D:\\学习资料\\数字图像处理\\彩色变换+实践\\实践三\\band_CH02.raw"
gray_image = np.array(read_unsigned_shorts(file_path))
gray_image = gray_image.reshape(height * 2, width * 2)
# 将灰度图像转为8位存储
gray_image = gray_image.astype(np.uint8)
# 划分子块
# 定义子块尺寸
block_width = width // 2
block_height = height // 2
# 划分四个子块
rgb_blocks = [
rgb_image[:block_height, :block_width, :], # 左上
rgb_image[:block_height, block_width:, :], # 右上
rgb_image[block_height:, :block_width, :], # 左下
rgb_image[block_height:, block_width:, :]]
gray_blocks = [
gray_image[:2 * block_height, :2 * block_width], # 左上
gray_image[:2 * block_height, 2 * block_width:], # 右上
gray_image[2 * block_height:, :2 * block_width], # 左下
gray_image[2 * block_height:, 2 * block_width:]]
del rgb_image, gray_image
sys.stdout.flush()
dawn = t.time()
# 定义结果数组
result = []
# 分块处理
for i in range(4):
# 将RGB图像转换为HSL图像,并确保内存连续
rgb_image = np.ascontiguousarray(rgb_blocks[i])
gray_image = np.ascontiguousarray(gray_blocks[i])
# HSL正变换
h_d, s_d, l_d = rgb_to_hsl_cuda(rgb_image, height // 2, width // 2)
# 内插S、H
s2_d, h2_d = upcy_cuda(s_d, h_d, height // 2, width // 2)
# 将S,H拷贝回内存
s2 = s2_d.copy_to_host()
h2 = h2_d.copy_to_host()
# 清除上下文
#cuda.close()
# 将灰度图作为h2_d直接拷贝到设备,再把别的两个也拷贝过去
h2_d = cuda.to_device(h2)
s2_d = cuda.to_device(s2)
l2_d = cuda.to_device(gray_image)
del h_d, s_d, l_d
rgb_image1 = hsl_to_rgb_cuda(h2_d, s2_d, l2_d, height, width)
result.append(rgb_image1)
#cuda.close()
del s2_d, h2_d, l2_d
print('处理子块 {}'.format(i))
# 合并子块
top = np.hstack((result[0], result[1]))
bottom = np.hstack((result[2], result[3]))
final_image = np.vstack((top, bottom))
dusk = t.time()
print('\t(done in {:.2f}s)'.format(dusk - dawn))
The error message is as follows:
已连接到 pydev 调试器(内部版本号 232.9559.58)处理子块 0
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2199, in <module>
main()
File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 2181, in main
globals = debugger.run(setup['file'], None, None, is_module)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1493, in run
return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\pydevd.py", line 1500, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Program Files\JetBrains\PyCharm Community Edition 2023.2.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:\Users\24929\PycharmProjects\exams\图像融合.py", line 256, in <module>
h_d, s_d, l_d = rgb_to_hsl_cuda(rgb_image, height // 2, width // 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\24929\PycharmProjects\exams\图像融合.py", line 58, in rgb_to_hsl_cuda
_rgb_to_hsl_kernel[128, 74](rgb_d, h_d, s_d, l_d, height, width)
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 539, in __call__
return self.dispatcher.call(args, self.griddim, self.blockdim,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 683, in call
kernel.launch(args, griddim, blockdim, stream, sharedmem)
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\dispatcher.py", line 327, in launch
driver.launch_kernel(cufunc.handle,
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 2563, in launch_kernel
driver.cuLaunchKernel(cufunc_handle,
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 327, in safe_cuda_api_call
self._check_ctypes_error(fname, retcode)
File "D:\programefiles\ana\Lib\site-packages\numba\cuda\cudadrv\driver.py", line 395, in _check_ctypes_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [701] Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
I am a beginner in CUDA programming and cannot find a solution online, so I am here for help. Thank you for your assistance