How to use PyCUDA to run multiple streams concurrently

I’m new to CUDA and want to use PyCUDA for concurrent execution across multiple streams, but I don’t know how to do it. Below are my PyCUDA program and the Nsight Systems report; I hope to get your help.
report36.qdrep (652.3 KB)

import time

import numpy
import pycuda.autoinit  # creates and activates a context on the first device
import pycuda.compiler
import pycuda.driver as drv
from pycuda.compiler import SourceModule


# Note: --default-stream=per-thread only changes where *default-stream* work
# goes; it has no effect below, because every copy and kernel launch names an
# explicit stream.
pycuda.compiler.DEFAULT_NVCC_FLAGS = ["--default-stream=per-thread"]
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x+blockIdx.x*blockDim.x;
  dest[i] = a[i] * b[i];
  //printf("%d,",blockDim.x);
}
""", options=["--default-stream=per-thread"])
multiply_them = mod.get_function("multiply_them")


stream1 = drv.Stream()
stream2 = drv.Stream()
stream3 = drv.Stream()
stream4 = drv.Stream()
stream5 = drv.Stream()
stream6 = drv.Stream()


a = numpy.random.randn(40000).astype(numpy.float32)
b = numpy.random.randn(40000).astype(numpy.float32)
a1 = numpy.random.randn(40000).astype(numpy.float32)
b1 = numpy.random.randn(40000).astype(numpy.float32)
a2 = numpy.random.randn(40000).astype(numpy.float32)
b2 = numpy.random.randn(40000).astype(numpy.float32)
a3 = numpy.random.randn(40000).astype(numpy.float32)
b3 = numpy.random.randn(40000).astype(numpy.float32)
a4 = numpy.random.randn(40000).astype(numpy.float32)
b4 = numpy.random.randn(40000).astype(numpy.float32)
a5 = numpy.random.randn(40000).astype(numpy.float32)
b5 = numpy.random.randn(40000).astype(numpy.float32)

# mem_alloc takes a size in bytes; .nbytes already includes the itemsize,
# so multiplying by itemsize again would over-allocate 4x.
a_gpu = drv.mem_alloc(a.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
a1_gpu = drv.mem_alloc(a1.nbytes)
b1_gpu = drv.mem_alloc(b1.nbytes)
a2_gpu = drv.mem_alloc(a2.nbytes)
b2_gpu = drv.mem_alloc(b2.nbytes)
a3_gpu = drv.mem_alloc(a3.nbytes)
b3_gpu = drv.mem_alloc(b3.nbytes)
a4_gpu = drv.mem_alloc(a4.nbytes)
b4_gpu = drv.mem_alloc(b4.nbytes)
a5_gpu = drv.mem_alloc(a5.nbytes)
b5_gpu = drv.mem_alloc(b5.nbytes)
# zeros_like already preserves the float32 dtype; the extra astype copy is unnecessary.
dest = numpy.zeros_like(a)
dest1 = numpy.zeros_like(a1)
dest2 = numpy.zeros_like(a2)
dest3 = numpy.zeros_like(a3)
dest4 = numpy.zeros_like(a4)
dest5 = numpy.zeros_like(a5)
dest_gpu = drv.mem_alloc(dest.nbytes)
dest1_gpu = drv.mem_alloc(dest1.nbytes)
dest2_gpu = drv.mem_alloc(dest2.nbytes)
dest3_gpu = drv.mem_alloc(dest3.nbytes)
dest4_gpu = drv.mem_alloc(dest4.nbytes)
dest5_gpu = drv.mem_alloc(dest5.nbytes)

T1 = time.perf_counter()
# Caveat: memcpy_htod_async is only truly asynchronous when the host array is
# page-locked (e.g. allocated with drv.pagelocked_empty); with ordinary numpy
# arrays like these, the copies fall back to being effectively synchronous.
drv.memcpy_htod_async(a_gpu, a, stream=stream1)
drv.memcpy_htod_async(b_gpu, b, stream=stream1)
drv.memcpy_htod_async(a1_gpu, a1, stream=stream2)
drv.memcpy_htod_async(b1_gpu, b1, stream=stream2)
drv.memcpy_htod_async(a2_gpu, a2, stream=stream3)
drv.memcpy_htod_async(b2_gpu, b2, stream=stream3)
drv.memcpy_htod_async(a3_gpu, a3, stream=stream4)
drv.memcpy_htod_async(b3_gpu, b3, stream=stream4)
drv.memcpy_htod_async(a4_gpu, a4, stream=stream5)
drv.memcpy_htod_async(b4_gpu, b4, stream=stream5)
drv.memcpy_htod_async(a5_gpu, a5, stream=stream6)
drv.memcpy_htod_async(b5_gpu, b5, stream=stream6)


multiply_them(dest_gpu, a_gpu, b_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream1)
multiply_them(dest1_gpu, a1_gpu, b1_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream2)
multiply_them(dest2_gpu, a2_gpu, b2_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream3)
multiply_them(dest3_gpu, a3_gpu, b3_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream4)
multiply_them(dest4_gpu, a4_gpu, b4_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream5)
multiply_them(dest5_gpu, a5_gpu, b5_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream6)


# The same page-locked caveat applies to the device-to-host copies.
drv.memcpy_dtoh_async(dest, dest_gpu, stream=stream1)
drv.memcpy_dtoh_async(dest1, dest1_gpu, stream=stream2)
drv.memcpy_dtoh_async(dest2, dest2_gpu, stream=stream3)
drv.memcpy_dtoh_async(dest3, dest3_gpu, stream=stream4)
drv.memcpy_dtoh_async(dest4, dest4_gpu, stream=stream5)
drv.memcpy_dtoh_async(dest5, dest5_gpu, stream=stream6)

# Wait for all queued work to finish before reading the clock; the async calls
# above return immediately, so timing without a synchronize would only measure
# launch overhead, not GPU execution time.
drv.Context.synchronize()
T2 = time.perf_counter()
print('program run time: %.3f ms' % ((T2 - T1) * 1000))

I want to get the following result:

[screenshot not included: expected profiler timeline, kernels on different streams overlapping]

but mine is this:

[screenshot not included: my profiler timeline, kernels running one after another with no overlap]
Using multiple streams does not guarantee concurrent execution. If the kernel on one stream fully occupies the GPU, a kernel on a different stream cannot execute at the same time. Each of your launches puts 100 blocks of 400 threads (40,000 threads) on the device, which can be enough to fill every SM on smaller GPUs, so the six kernels end up running back to back.
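
As a starting point, here is a minimal sketch of the same experiment restructured to give overlap a chance; the sizes, names, and stream count are illustrative assumptions, not a prescription. It uses page-locked host buffers so the async copies can actually overlap, issues the whole copy/kernel/copy pipeline per stream, and synchronizes before timing:

import time

import numpy
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)                      // bounds check lets any grid size work
        dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

N = 40000            # elements per stream (illustrative)
NUM_STREAMS = 6

streams = [drv.Stream() for _ in range(NUM_STREAMS)]

# Page-locked (pinned) host buffers: required for memcpy_*_async to overlap
# with kernels and with copies on other streams.
a_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
b_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
c_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
for a, b in zip(a_host, b_host):
    a[:] = numpy.random.randn(N).astype(numpy.float32)
    b[:] = numpy.random.randn(N).astype(numpy.float32)

a_dev = [drv.mem_alloc(buf.nbytes) for buf in a_host]
b_dev = [drv.mem_alloc(buf.nbytes) for buf in b_host]
c_dev = [drv.mem_alloc(buf.nbytes) for buf in c_host]

block = (256, 1, 1)
grid = ((N + block[0] - 1) // block[0], 1)

T1 = time.perf_counter()
# Issue the full copy/kernel/copy pipeline on each stream; the driver is
# then free to overlap work from different streams where resources allow.
for s in range(NUM_STREAMS):
    drv.memcpy_htod_async(a_dev[s], a_host[s], stream=streams[s])
    drv.memcpy_htod_async(b_dev[s], b_host[s], stream=streams[s])
    multiply_them(c_dev[s], a_dev[s], b_dev[s], numpy.int32(N),
                  block=block, grid=grid, stream=streams[s])
    drv.memcpy_dtoh_async(c_host[s], c_dev[s], stream=streams[s])
drv.Context.synchronize()
T2 = time.perf_counter()

print('program run time: %.3f ms' % ((T2 - T1) * 1000))
assert numpy.allclose(c_host[0], a_host[0] * b_host[0])

Even then, whether the profiler shows overlap depends on the hardware: an elementwise multiply over 40,000 floats runs for only microseconds, so the kernels may still appear back to back in the timeline, and the number of copy engines on the device limits how many transfers can be in flight at once.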