Asynchronous multi streaming: not working...

Hi.
I tried writing code that uses asynchronous multi-streaming.
I used pinned memory and the "cudaMemcpyAsync" API.
But there is no overlap between the kernels…

this is my code.
Please check the "// Asynchronous data transfer and kernel execution" (for) part.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Element-wise product: c[i] = a[i] * b[i] for one chunk of data.
//
// Launch layout: 1D grid of 1D blocks; each thread handles one element.
// The host launches this with size/1024 blocks of 1024 threads, where
// size == CHUNK_ELEMS, so the bounds guard below matches the chunk size
// exactly and also protects against oversized grids.
//
// a and b are read-only here; marking them const __restrict__ lets the
// compiler route the loads through the read-only data cache.
__global__ void mulKernel(int *c, const int * __restrict__ a, const int * __restrict__ b) {
	// Number of elements in one chunk — must stay in sync with the host's
	// per-stream chunk size (5242880 elements).
	const int CHUNK_ELEMS = 5242880;

	int i = threadIdx.x + blockIdx.x * blockDim.x;

	if (i < CHUNK_ELEMS) {
		c[i] = a[i] * b[i];
	}
}

int main(){
	// Total element count; must be an exact multiple of size * NUM_STREAMS
	// for the chunking loop below.
	const int arraySize = 83886080;
	// Elements per chunk; must match the bound hard-coded in mulKernel.
	const int size = 5242880;
	const int NUM_STREAMS = 4;
	const int threadsPerBlock = 1024;

	// All function-scope variables are declared up front so that no
	// `goto Error` jumps over an initialization (ill-formed in C++).
	cudaError_t cudaStatus = cudaSuccess;
	cudaStream_t streams[NUM_STREAMS] = { 0 };
	int *host_a = NULL, *host_b = NULL, *host_c = NULL;
	int *dev_a[NUM_STREAMS] = { NULL };
	int *dev_b[NUM_STREAMS] = { NULL };
	int *dev_c[NUM_STREAMS] = { NULL };
	int i, s, j, k;

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
		goto Error;
	}

	for (s = 0; s < NUM_STREAMS; s++) {
		cudaStatus = cudaStreamCreate(&streams[s]);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaStreamCreate 'stream%d' failed!\n", s + 1);
			goto Error;
		}
	}

	// Pinned host memory is required for cudaMemcpyAsync to actually be
	// asynchronous with respect to the host.
	cudaStatus = cudaHostAlloc((void**)&host_a, arraySize * sizeof(int), cudaHostAllocDefault);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaHostAlloc failed!\n");
		goto Error;
	}
	cudaStatus = cudaHostAlloc((void**)&host_b, arraySize * sizeof(int), cudaHostAllocDefault);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaHostAlloc failed!\n");
		goto Error;
	}
	cudaStatus = cudaHostAlloc((void**)&host_c, arraySize * sizeof(int), cudaHostAllocDefault);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaHostAlloc failed!\n");
		goto Error;
	}

	// Fill inputs with the repeating pattern 1..10000 so results can be
	// verified as c[i] == a[i]*b[i] below.
	j = 1;
	for (i = 0; i < arraySize; i++) {
		if (j == 10001) {
			j = 1;
		}
		host_a[i] = j;
		host_b[i] = j;
		host_c[i] = 0;
		j++;
	}

	// Allocate one chunk-sized set of device buffers per stream.
	for (s = 0; s < NUM_STREAMS; s++) {
		if ((cudaStatus = cudaMalloc((void**)&dev_a[s], size * sizeof(int))) != cudaSuccess ||
		    (cudaStatus = cudaMalloc((void**)&dev_b[s], size * sizeof(int))) != cudaSuccess ||
		    (cudaStatus = cudaMalloc((void**)&dev_c[s], size * sizeof(int))) != cudaSuccess) {
			fprintf(stderr, "cudaMalloc failed!\n");
			goto Error;
		}
	}

	// Asynchronous data transfer and kernel execution.
	//
	// Work is issued BREADTH-FIRST: first all H2D copies across the four
	// streams, then all kernel launches, then all D2H copies.  On devices
	// with a single copy engine, the original depth-first order (copy,
	// copy, kernel, copy per stream, one stream after another) serializes
	// the transfers and prevents copy/compute overlap; breadth-first
	// issue order is the documented pattern for overlapping transfers
	// with kernel execution.
	for (i = 0; i < arraySize; i += size * NUM_STREAMS) {

		for (s = 0; s < NUM_STREAMS; s++) {
			int offset = i + s * size;
			if ((cudaStatus = cudaMemcpyAsync(dev_a[s], host_a + offset, size * sizeof(int), cudaMemcpyHostToDevice, streams[s])) != cudaSuccess ||
			    (cudaStatus = cudaMemcpyAsync(dev_b[s], host_b + offset, size * sizeof(int), cudaMemcpyHostToDevice, streams[s])) != cudaSuccess) {
				fprintf(stderr, "cudaMemcpyAsync H2D 'stream%d' failed!\n", s + 1);
				goto Error;
			}
		}

		for (s = 0; s < NUM_STREAMS; s++) {
			mulKernel << < size / threadsPerBlock, threadsPerBlock, 0, streams[s] >> > (dev_c[s], dev_a[s], dev_b[s]);
			// Kernel launches do not return a status directly; a bad
			// launch configuration is only visible via cudaGetLastError.
			cudaStatus = cudaGetLastError();
			if (cudaStatus != cudaSuccess) {
				fprintf(stderr, "mulKernel launch on 'stream%d' failed: %s\n", s + 1, cudaGetErrorString(cudaStatus));
				goto Error;
			}
		}

		for (s = 0; s < NUM_STREAMS; s++) {
			int offset = i + s * size;
			cudaStatus = cudaMemcpyAsync(host_c + offset, dev_c[s], size * sizeof(int), cudaMemcpyDeviceToHost, streams[s]);
			if (cudaStatus != cudaSuccess) {
				fprintf(stderr, "cudaMemcpyAsync D2H 'stream%d' failed!\n", s + 1);
				goto Error;
			}
		}
	}

	// Wait for all queued work; this is also where asynchronous kernel
	// execution errors surface.
	for (s = 0; s < NUM_STREAMS; s++) {
		cudaStatus = cudaStreamSynchronize(streams[s]);
		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaStreamSynchronize 'stream%d' failed: %s\n", s + 1, cudaGetErrorString(cudaStatus));
			goto Error;
		}
	}

	// Verify: for the pattern above, c[i] must equal k*k.  Print only the
	// mismatches.
	k = 1;
	for (i = 0; i < arraySize; i++) {
		if (k == 10001) {
			k = 1;
		}
		if (host_a[i] == k && host_b[i] == k) {
			if (host_c[i] != k * k) {
				printf("%d: %d %d %d\n", i, host_a[i], host_b[i], host_c[i]);
			}
		}
		k++;
	}

Error:
	// Cleanup runs on both the success and failure paths.  cudaFree /
	// cudaFreeHost accept NULL, so partially-completed allocation is safe.
	for (s = 0; s < NUM_STREAMS; s++) {
		cudaFree(dev_c[s]);
		cudaFree(dev_a[s]);
		cudaFree(dev_b[s]);
	}
	cudaFreeHost(host_a);
	cudaFreeHost(host_b);
	cudaFreeHost(host_c);
	for (s = 0; s < NUM_STREAMS; s++) {
		if (streams[s]) {
			cudaStreamDestroy(streams[s]);
		}
	}

	if (cudaStatus != cudaSuccess) {
		return cudaStatus;
	}

	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	// (In the original code this was unreachable: it sat after `return cudaStatus;`.)
	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!\n");
		return 1;
	}

	return 0;
}

You’re unlikely to see much if any overlap for kernels that launch 5242880/1024 = 5120 blocks. That will fill up any current GPU. You may want to study the concurrentKernels sample code.

Thank you so much.
I modified my code.

Finally, data transfer and kernel execution are overlapped.
But in my code the data transfers do not overlap with each other.

But slide 5 at that URL shows D2H and H2D transfers overlapping.
I also read the information saying that data transfers cannot overlap.