I’m new to cuda and want to use pycuda for multi-stream concurrent execution, but I don’t know how to do it. The following is my pycuda program and report, I hope to get your help.
report36.qdrep (652.3 KB)
import pycuda.autoinit
import pycuda.driver as drv
import numpy
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import time
pycuda.compiler.DEFAULT_NVCC_FLAGS = ["--default-stream=per-thread"]
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x+blockIdx.x*blockDim.x;
dest[i] = a[i] * b[i];
//printf("%d,",blockDim.x);
}
""", options=["--default-stream=per-thread"])
multiply_them = mod.get_function("multiply_them")
stream1 = drv.Stream()
stream2 = drv.Stream()
stream3 = drv.Stream()
stream4 = drv.Stream()
stream5 = drv.Stream()
stream6 = drv.Stream()
stream7 = cuda.Stream()
a = numpy.random.randn(40000).astype(numpy.float32)
b = numpy.random.randn(40000).astype(numpy.float32)
a1 = numpy.random.randn(40000).astype(numpy.float32)
b1 = numpy.random.randn(40000).astype(numpy.float32)
a2 = numpy.random.randn(40000).astype(numpy.float32)
b2 = numpy.random.randn(40000).astype(numpy.float32)
a3 = numpy.random.randn(40000).astype(numpy.float32)
b3 = numpy.random.randn(40000).astype(numpy.float32)
a4 = numpy.random.randn(40000).astype(numpy.float32)
b4 = numpy.random.randn(40000).astype(numpy.float32)
a5 = numpy.random.randn(40000).astype(numpy.float32)
b5 = numpy.random.randn(40000).astype(numpy.float32)
a_gpu = drv.mem_alloc(a.nbytes * a.dtype.itemsize)
b_gpu = drv.mem_alloc(b.nbytes * b.dtype.itemsize)
a1_gpu = drv.mem_alloc(a1.nbytes * a.dtype.itemsize)
b1_gpu = drv.mem_alloc(b1.nbytes * b.dtype.itemsize)
a2_gpu = drv.mem_alloc(a2.nbytes * a.dtype.itemsize)
b2_gpu = drv.mem_alloc(b2.nbytes * b.dtype.itemsize)
a3_gpu = drv.mem_alloc(a3.nbytes * a.dtype.itemsize)
b3_gpu = drv.mem_alloc(b3.nbytes * b.dtype.itemsize)
a4_gpu = drv.mem_alloc(a4.nbytes * a.dtype.itemsize)
b4_gpu = drv.mem_alloc(b4.nbytes * b.dtype.itemsize)
a5_gpu = drv.mem_alloc(a5.nbytes * a.dtype.itemsize)
b5_gpu = drv.mem_alloc(b5.nbytes * b.dtype.itemsize)
dest = numpy.zeros_like(a).astype(numpy.float32)
dest1 = numpy.zeros_like(a1).astype(numpy.float32)
dest2 = numpy.zeros_like(a2).astype(numpy.float32)
dest3 = numpy.zeros_like(a3).astype(numpy.float32)
dest4 = numpy.zeros_like(a4).astype(numpy.float32)
dest5 = numpy.zeros_like(a5).astype(numpy.float32)
dest_gpu = drv.mem_alloc(dest.nbytes)
dest1_gpu = drv.mem_alloc(dest1.nbytes)
dest2_gpu = drv.mem_alloc(dest2.nbytes)
dest3_gpu = drv.mem_alloc(dest3.nbytes)
dest4_gpu = drv.mem_alloc(dest4.nbytes)
dest5_gpu = drv.mem_alloc(dest5.nbytes)
T1 = time.perf_counter()
drv.memcpy_htod_async(a_gpu, a, stream=stream1)
drv.memcpy_htod_async(b_gpu, b, stream=stream1)
drv.memcpy_htod_async(a1_gpu, a1, stream=stream2)
drv.memcpy_htod_async(b1_gpu, b1, stream=stream2)
drv.memcpy_htod_async(a2_gpu, a2, stream=stream3)
drv.memcpy_htod_async(b2_gpu, b2, stream=stream3)
drv.memcpy_htod_async(a3_gpu, a3, stream=stream4)
drv.memcpy_htod_async(b3_gpu, b3, stream=stream4)
drv.memcpy_htod_async(a4_gpu, a4, stream=stream5)
drv.memcpy_htod_async(b4_gpu, b4, stream=stream5)
drv.memcpy_htod_async(a5_gpu, a5, stream=stream6)
drv.memcpy_htod_async(b5_gpu, b5, stream=stream6)
multiply_them(dest_gpu, a_gpu, b_gpu,block=(400,1,1), grid=(100,1), stream=stream1)
multiply_them(dest1_gpu, a1_gpu, b1_gpu,block=(400,1,1), grid=(100,1), stream=stream2)
multiply_them(dest2_gpu, a2_gpu, b2_gpu,block=(400,1,1), grid=(100,1), stream=stream3)
multiply_them(dest3_gpu, a3_gpu, b3_gpu,block=(400,1,1), grid=(100,1), stream=stream4)
multiply_them(dest4_gpu, a4_gpu, b4_gpu,block=(400,1,1), grid=(100,1), stream=stream5)
multiply_them(dest5_gpu, a5_gpu, b5_gpu,block=(400,1,1), grid=(100,1), stream=stream6)
drv.memcpy_dtoh_async(dest, dest_gpu, stream=stream1)
drv.memcpy_dtoh_async(dest1, dest1_gpu, stream=stream2)
drv.memcpy_dtoh_async(dest2, dest2_gpu, stream=stream3)
drv.memcpy_dtoh_async(dest3, dest3_gpu, stream=stream4)
drv.memcpy_dtoh_async(dest4, dest4_gpu, stream=stream5)
drv.memcpy_dtoh_async(dest5, dest5_gpu, stream=stream6)
T2 = time.perf_counter()
print('program run time:%s ms' % ((T2 - T1) * 1000))