Hi.
I tried writing code for asynchronous execution with multiple streams.
I used pinned memory and the cudaMemcpyAsync API.
But there is no overlap between the kernels...
This is my code.
Please check the "// Asynchronous data transfer and kernel execution" (for) part.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
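// Element-wise multiplication: each thread multiplies one pair of elements of a chunk.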
__global__ void mulKernel(int *c, int *a, int *b) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < 5242880) {   // 5242880 == size, the number of elements per chunk
        c[i] = a[i] * b[i];
    }
}
int main() {
    cudaError_t cudaStatus;
    const int arraySize = 83886080;   // total number of elements
    const int size = 5242880;         // elements per chunk (arraySize / 16)

    // Choose which GPU to run on, change this on a multi-GPU system.
    // Do this before creating the streams so they belong to the intended device.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        return 1;
    }

    cudaStream_t stream1, stream2, stream3, stream4;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaStreamCreate(&stream4);

    // Pinned (page-locked) host buffers, required for cudaMemcpyAsync to be truly asynchronous.
    int *host_a = NULL, *host_b = NULL, *host_c = NULL;

    // One set of device buffers per stream.
    int *dev_a1 = NULL, *dev_b1 = NULL, *dev_c1 = NULL;
    int *dev_a2 = NULL, *dev_b2 = NULL, *dev_c2 = NULL;
    int *dev_a3 = NULL, *dev_b3 = NULL, *dev_c3 = NULL;
    int *dev_a4 = NULL, *dev_b4 = NULL, *dev_c4 = NULL;

    cudaStatus = cudaHostAlloc((void**)&host_a, arraySize * sizeof(int), cudaHostAllocDefault);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaHostAlloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaHostAlloc((void**)&host_b, arraySize * sizeof(int), cudaHostAllocDefault);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaHostAlloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaHostAlloc((void**)&host_c, arraySize * sizeof(int), cudaHostAllocDefault);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaHostAlloc failed!\n");
        goto Error;
    }

    // Fill the inputs with the repeating pattern 1..10000 and clear the output.
    for (int i = 0, j = 1; i < arraySize; i++, j++) {
        if (j == 10001) {
            j = 1;
        }
        host_a[i] = j;
        host_b[i] = j;
        host_c[i] = 0;
    }
    // Allocate GPU buffers
    cudaStatus = cudaMalloc((void**)&dev_a1, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b1, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c1, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a2, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b2, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c2, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a3, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b3, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c3, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a4, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b4, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c4, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    // Asynchronous data transfer and kernel execution.
    // Each iteration queues four chunks of 'size' elements, one per stream:
    // H2D copies of a and b, the kernel launch, and the D2H copy of c.
    for (int i = 0; i < arraySize; i += size * 4) {
        // stream1
        cudaStatus = cudaMemcpyAsync(dev_a1, host_a + i, size * sizeof(int), cudaMemcpyHostToDevice, stream1);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "1cudaMemcpyAsync 'stream1' failed!\n");
            goto Error;
        }
        cudaStatus = cudaMemcpyAsync(dev_b1, host_b + i, size * sizeof(int), cudaMemcpyHostToDevice, stream1);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "2cudaMemcpyAsync 'stream1' failed!\n");
            goto Error;
        }
        mulKernel<<<size / 1024, 1024, 0, stream1>>>(dev_c1, dev_a1, dev_b1);
        cudaStatus = cudaMemcpyAsync(host_c + i, dev_c1, size * sizeof(int), cudaMemcpyDeviceToHost, stream1);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "3cudaMemcpyAsync 'stream1' failed!\n");
            goto Error;
        }
        // stream2
        cudaStatus = cudaMemcpyAsync(dev_a2, host_a + i + size, size * sizeof(int), cudaMemcpyHostToDevice, stream2);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "1cudaMemcpyAsync 'stream2' failed!\n");
            goto Error;
        }
        cudaStatus = cudaMemcpyAsync(dev_b2, host_b + i + size, size * sizeof(int), cudaMemcpyHostToDevice, stream2);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "2cudaMemcpyAsync 'stream2' failed!\n");
            goto Error;
        }
        mulKernel<<<size / 1024, 1024, 0, stream2>>>(dev_c2, dev_a2, dev_b2);
        cudaStatus = cudaMemcpyAsync(host_c + i + size, dev_c2, size * sizeof(int), cudaMemcpyDeviceToHost, stream2);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "3cudaMemcpyAsync 'stream2' failed!\n");
            goto Error;
        }
        // stream3
        cudaStatus = cudaMemcpyAsync(dev_a3, host_a + i + (size * 2), size * sizeof(int), cudaMemcpyHostToDevice, stream3);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "1cudaMemcpyAsync 'stream3' failed!\n");
            goto Error;
        }
        cudaStatus = cudaMemcpyAsync(dev_b3, host_b + i + (size * 2), size * sizeof(int), cudaMemcpyHostToDevice, stream3);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "2cudaMemcpyAsync 'stream3' failed!\n");
            goto Error;
        }
        mulKernel<<<size / 1024, 1024, 0, stream3>>>(dev_c3, dev_a3, dev_b3);
        cudaStatus = cudaMemcpyAsync(host_c + i + (size * 2), dev_c3, size * sizeof(int), cudaMemcpyDeviceToHost, stream3);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "3cudaMemcpyAsync 'stream3' failed!\n");
            goto Error;
        }
        // stream4
        cudaStatus = cudaMemcpyAsync(dev_a4, host_a + i + (size * 3), size * sizeof(int), cudaMemcpyHostToDevice, stream4);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "1cudaMemcpyAsync 'stream4' failed!\n");
            goto Error;
        }
        cudaStatus = cudaMemcpyAsync(dev_b4, host_b + i + (size * 3), size * sizeof(int), cudaMemcpyHostToDevice, stream4);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "2cudaMemcpyAsync 'stream4' failed!\n");
            goto Error;
        }
        mulKernel<<<size / 1024, 1024, 0, stream4>>>(dev_c4, dev_a4, dev_b4);
        cudaStatus = cudaMemcpyAsync(host_c + i + (size * 3), dev_c4, size * sizeof(int), cudaMemcpyDeviceToHost, stream4);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "3cudaMemcpyAsync 'stream4' failed!\n");
            goto Error;
        }
    }
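    // Wait for every stream to finish its queued copies and kernels before the CPU reads host_c.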
    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);
    cudaStreamSynchronize(stream3);
    cudaStreamSynchronize(stream4);
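    // Verify the results: host_c[i] should equal host_a[i] * host_b[i] (= k * k).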
    for (int i = 0, k = 1; i < arraySize; i++, k++) {
        if (k == 10001) {
            k = 1;
        }
        if (host_a[i] == k && host_b[i] == k) {
            if (host_c[i] != k * k) {
                printf("%d: %d %d %d\n", i, host_a[i], host_b[i], host_c[i]);
            }
        }
    }
Error:
    cudaFree(dev_c1);
    cudaFree(dev_a1);
    cudaFree(dev_b1);
    cudaFree(dev_c2);
    cudaFree(dev_a2);
    cudaFree(dev_b2);
    cudaFree(dev_c3);
    cudaFree(dev_a3);
    cudaFree(dev_b3);
    cudaFree(dev_c4);
    cudaFree(dev_a4);
    cudaFree(dev_b4);
    cudaFreeHost(host_a);
    cudaFreeHost(host_b);
    cudaFreeHost(host_c);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);
    cudaStreamDestroy(stream4);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return cudaStatus;
}