PyCUDA fills np.array too slow

With this code I want to draw filled triangles:

import cv2
import numpy as np
import os
import time
import math

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

executions_per_frame = 10
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, int* canvas_shape, float *z_buffer, int x, int y, float z, unsigned char *color) {
    int index = y * canvas_shape[1] + x;
    if (z > z_buffer[index]) {
        z_buffer[index] = z;
        for (int i = 0; i < canvas_shape[2]; ++i) {
            canvas[index * canvas_shape[2] + i] = color[i];
        }
    }
}

// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, int *canvas_shape, float *z_buffer, float *ll, float *rl, float *lu, float *ru, unsigned char *color, int height, int min_x, int min_y) {
    int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;

    float k1 = (float)global_thread_y / height;
    int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
    int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
    float left_z = ll[2] + (lu[2] - ll[2]) * k1;
    float right_z = rl[2] + (ru[2] - rl[2]) * k1;
    int actual_x = min_x + global_thread_x;
    if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
        int actual_y = min_y + global_thread_y;
        float k2 = (float)(global_thread_x - (left_x - min_x)) / (right_x - left_x);
        float actual_z = left_z + (right_z - left_z) * k2;
        set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
    }
}
"""

if __name__ == '__main__':
    if (os.system("cl.exe")):
        os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64"
    if (os.system("cl.exe")):
        raise RuntimeError("cl.exe still not found")

    pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
    pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")

    time_start, frames_count, fps = time.time(), 0, 0
    while True:
        key = cv2.waitKeyEx(1)
        if key == 27:
            break

        canvas_width, canvas_height = 1000, 800
        canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
        z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
        fragment_width, fragment_height = 400, 300
        color = [0, 0, 200]

        block_side = 32
        block_dim = (block_side, block_side, 1)
        grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))

        param_canvas = cuda.InOut(canvas)  # unsigned char *canvas
        param_canvas_shape = cuda.In(np.array(canvas.shape, dtype=np.int32))  # int *canvas_shape
        param_z_buffer = cuda.InOut(z_buffer)  # float *z_buffer
        param_ll = cuda.In(np.array([100, 200, frames_count], dtype=np.float32))  # float *ll
        param_rl = cuda.In(np.array([500, 200, frames_count], dtype=np.float32))  # float *rl
        param_lu = cuda.In(np.array([400, 500, frames_count], dtype=np.float32))  # float *lu
        param_ru = cuda.In(np.array([400, 500, frames_count], dtype=np.float32))  # float *ru
        param_color = cuda.In(np.array(color, dtype=np.uint8))  # unsigned char *color
        param_height = np.int32(fragment_height)  # int height
        param_min_x = np.int32(100)  # int min_x
        param_min_y = np.int32(200)  # int min_y
        for i in range(executions_per_frame):
            pycuda_draw_triangle(param_canvas, param_canvas_shape,
                param_z_buffer, param_ll, param_rl, param_lu, param_ru,
                param_color, param_height, param_min_x, param_min_y,
                block=block_dim, grid=grid_dim)

        frames_count += 1
        fps = frames_count / (time.time() - time_start)
        cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
        cv2.imshow('Scene', canvas)
    cv2.destroyAllWindows()

With executions_per_frame=1 (for 1 iteration C function will be called 1 time) I got ~100 fps, with executions_per_frame=10 - ~30 fps. It doesn’t look as productive as it could be. What did I miss?

Also, does this have benefit in that particular task?

block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
pycuda_draw_triangle(..., block=block_dim, grid=grid_dim)

Or it can be just

pycuda_draw_triangle(..., block=(1, 1, 1), grid=(fragment_width, fragment_height))

Python 3.6.9, CUDA 10.0, RTX 2060

https://stackoverflow.com/questions/59146759/pycuda-fills-np-array-too-slow