How to use PyCUDA to run multiple streams concurrently

I’m new to CUDA and want to use PyCUDA for concurrent execution across multiple streams, but I don’t know how to do it. Below are my PyCUDA program and the Nsight Systems report; I hope to get your help.
report36.qdrep (652.3 KB)

import time

import numpy
import pycuda.autoinit  # creates and activates a context on the first device
import pycuda.compiler
import pycuda.driver as drv
from pycuda.compiler import SourceModule


# Note: --default-stream=per-thread only changes where *default-stream* work
# goes; it has no effect below, because every copy and kernel launch names an
# explicit stream.
pycuda.compiler.DEFAULT_NVCC_FLAGS = ["--default-stream=per-thread"]
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x+blockIdx.x*blockDim.x;
  dest[i] = a[i] * b[i];
  //printf("%d,",blockDim.x);
}
""", options=["--default-stream=per-thread"])
multiply_them = mod.get_function("multiply_them")


stream1 = drv.Stream()
stream2 = drv.Stream()
stream3 = drv.Stream()
stream4 = drv.Stream()
stream5 = drv.Stream()
stream6 = drv.Stream()


a = numpy.random.randn(40000).astype(numpy.float32)
b = numpy.random.randn(40000).astype(numpy.float32)
a1 = numpy.random.randn(40000).astype(numpy.float32)
b1 = numpy.random.randn(40000).astype(numpy.float32)
a2 = numpy.random.randn(40000).astype(numpy.float32)
b2 = numpy.random.randn(40000).astype(numpy.float32)
a3 = numpy.random.randn(40000).astype(numpy.float32)
b3 = numpy.random.randn(40000).astype(numpy.float32)
a4 = numpy.random.randn(40000).astype(numpy.float32)
b4 = numpy.random.randn(40000).astype(numpy.float32)
a5 = numpy.random.randn(40000).astype(numpy.float32)
b5 = numpy.random.randn(40000).astype(numpy.float32)

# mem_alloc takes a size in bytes; .nbytes already includes the itemsize,
# so multiplying by itemsize again would over-allocate 4x.
a_gpu = drv.mem_alloc(a.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
a1_gpu = drv.mem_alloc(a1.nbytes)
b1_gpu = drv.mem_alloc(b1.nbytes)
a2_gpu = drv.mem_alloc(a2.nbytes)
b2_gpu = drv.mem_alloc(b2.nbytes)
a3_gpu = drv.mem_alloc(a3.nbytes)
b3_gpu = drv.mem_alloc(b3.nbytes)
a4_gpu = drv.mem_alloc(a4.nbytes)
b4_gpu = drv.mem_alloc(b4.nbytes)
a5_gpu = drv.mem_alloc(a5.nbytes)
b5_gpu = drv.mem_alloc(b5.nbytes)
# zeros_like already preserves the float32 dtype; the extra astype copy is unnecessary.
dest = numpy.zeros_like(a)
dest1 = numpy.zeros_like(a1)
dest2 = numpy.zeros_like(a2)
dest3 = numpy.zeros_like(a3)
dest4 = numpy.zeros_like(a4)
dest5 = numpy.zeros_like(a5)
dest_gpu = drv.mem_alloc(dest.nbytes)
dest1_gpu = drv.mem_alloc(dest1.nbytes)
dest2_gpu = drv.mem_alloc(dest2.nbytes)
dest3_gpu = drv.mem_alloc(dest3.nbytes)
dest4_gpu = drv.mem_alloc(dest4.nbytes)
dest5_gpu = drv.mem_alloc(dest5.nbytes)

T1 = time.perf_counter()
# Caveat: memcpy_htod_async is only truly asynchronous when the host array is
# page-locked (e.g. allocated with drv.pagelocked_empty); with ordinary numpy
# arrays like these, the copies fall back to being effectively synchronous.
drv.memcpy_htod_async(a_gpu, a, stream=stream1)
drv.memcpy_htod_async(b_gpu, b, stream=stream1)
drv.memcpy_htod_async(a1_gpu, a1, stream=stream2)
drv.memcpy_htod_async(b1_gpu, b1, stream=stream2)
drv.memcpy_htod_async(a2_gpu, a2, stream=stream3)
drv.memcpy_htod_async(b2_gpu, b2, stream=stream3)
drv.memcpy_htod_async(a3_gpu, a3, stream=stream4)
drv.memcpy_htod_async(b3_gpu, b3, stream=stream4)
drv.memcpy_htod_async(a4_gpu, a4, stream=stream5)
drv.memcpy_htod_async(b4_gpu, b4, stream=stream5)
drv.memcpy_htod_async(a5_gpu, a5, stream=stream6)
drv.memcpy_htod_async(b5_gpu, b5, stream=stream6)


multiply_them(dest_gpu, a_gpu, b_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream1)
multiply_them(dest1_gpu, a1_gpu, b1_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream2)
multiply_them(dest2_gpu, a2_gpu, b2_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream3)
multiply_them(dest3_gpu, a3_gpu, b3_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream4)
multiply_them(dest4_gpu, a4_gpu, b4_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream5)
multiply_them(dest5_gpu, a5_gpu, b5_gpu, block=(400, 1, 1), grid=(100, 1), stream=stream6)


# The same page-locked caveat applies to the device-to-host copies.
drv.memcpy_dtoh_async(dest, dest_gpu, stream=stream1)
drv.memcpy_dtoh_async(dest1, dest1_gpu, stream=stream2)
drv.memcpy_dtoh_async(dest2, dest2_gpu, stream=stream3)
drv.memcpy_dtoh_async(dest3, dest3_gpu, stream=stream4)
drv.memcpy_dtoh_async(dest4, dest4_gpu, stream=stream5)
drv.memcpy_dtoh_async(dest5, dest5_gpu, stream=stream6)

# Wait for all queued work to finish before reading the clock; the async calls
# above return immediately, so timing without a synchronize would only measure
# launch overhead, not GPU execution time.
drv.Context.synchronize()
T2 = time.perf_counter()
print('program run time: %.3f ms' % ((T2 - T1) * 1000))

I want to get the following result:

[screenshot not included: expected profiler timeline, kernels on different streams overlapping]

but mine is this:

[screenshot not included: my profiler timeline, kernels running one after another with no overlap]
Using multiple streams does not guarantee concurrent execution. If the kernel on one stream fully occupies the GPU, a kernel on a different stream cannot execute at the same time. Each of your launches puts 100 blocks of 400 threads (40,000 threads) on the device, which can be enough to fill every SM on smaller GPUs, so the six kernels end up running back to back.
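
As a starting point, here is a minimal sketch of the same experiment restructured to give overlap a chance; the sizes, names, and stream count are illustrative assumptions, not a prescription. It uses page-locked host buffers so the async copies can actually overlap, issues the whole copy/kernel/copy pipeline per stream, and synchronizes before timing:

import time

import numpy
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)                      // bounds check lets any grid size work
        dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

N = 40000            # elements per stream (illustrative)
NUM_STREAMS = 6

streams = [drv.Stream() for _ in range(NUM_STREAMS)]

# Page-locked (pinned) host buffers: required for memcpy_*_async to overlap
# with kernels and with copies on other streams.
a_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
b_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
c_host = [drv.pagelocked_empty(N, numpy.float32) for _ in range(NUM_STREAMS)]
for a, b in zip(a_host, b_host):
    a[:] = numpy.random.randn(N).astype(numpy.float32)
    b[:] = numpy.random.randn(N).astype(numpy.float32)

a_dev = [drv.mem_alloc(buf.nbytes) for buf in a_host]
b_dev = [drv.mem_alloc(buf.nbytes) for buf in b_host]
c_dev = [drv.mem_alloc(buf.nbytes) for buf in c_host]

block = (256, 1, 1)
grid = ((N + block[0] - 1) // block[0], 1)

T1 = time.perf_counter()
# Issue the full copy/kernel/copy pipeline on each stream; the driver is
# then free to overlap work from different streams where resources allow.
for s in range(NUM_STREAMS):
    drv.memcpy_htod_async(a_dev[s], a_host[s], stream=streams[s])
    drv.memcpy_htod_async(b_dev[s], b_host[s], stream=streams[s])
    multiply_them(c_dev[s], a_dev[s], b_dev[s], numpy.int32(N),
                  block=block, grid=grid, stream=streams[s])
    drv.memcpy_dtoh_async(c_host[s], c_dev[s], stream=streams[s])
drv.Context.synchronize()
T2 = time.perf_counter()

print('program run time: %.3f ms' % ((T2 - T1) * 1000))
assert numpy.allclose(c_host[0], a_host[0] * b_host[0])

Even then, whether the profiler shows overlap depends on the hardware: an elementwise multiply over 40,000 floats runs for only microseconds, so the kernels may still appear back to back in the timeline, and the number of copy engines on the device limits how many transfers can be in flight at once.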