I use CUDA for image-sequence processing and want to overlap host-to-device data copies with kernel execution.
However, when I call cudaEventSynchronize on an event in one stream, it appears to block the work queued in the other stream as well.
What’s the reason?
System: Windows 10 x64
Graphics card: GeForce GTX 1050 Ti
CUDA Toolkit: 11.6
The code is as follows:
#include <stdio.h>
/*
 * Dummy workload: every thread writes the values 0..49 into the first 50
 * bytes of its own 100-byte slot of `in`.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Precondition: `in` points to at least gridDim.x * blockDim.x * 100 bytes of
 * device-accessible memory; the kernel takes no size argument, so it cannot
 * bounds-check on its own.
 */
void __global__ kernel(char* in)
{
    // Global thread index, computed in size_t so the later multiplication by
    // the slot size cannot wrap in 32-bit arithmetic.
    const size_t tid = (size_t)blockDim.x * blockIdx.x + threadIdx.x;

    // The whole global index is scaled by the 100-byte slot size. Scaling only
    // threadIdx.x makes slots of different blocks overlap (racing writes).
    char* pIn = in + tid * 100;

    for (int i = 0; i < 50; i++)
        pIn[i] = (char)i;
}
// Print a diagnostic and leave the enclosing function with exit code 1 when a
// CUDA runtime call fails. Only usable inside a function returning int.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Two-stream ping-pong benchmark.
 *
 * Iteration i (0..5) enqueues "H2D copy -> kernel" into stream i % 2,
 * bracketed by three events, and iteration i (1..6) waits for the batch
 * enqueued one iteration earlier and prints its copy and kernel times.
 * The host therefore always has one batch queued in the other stream while
 * it blocks on an event.
 *
 * Returns 0 on success, 1 if any CUDA runtime call reports an error.
 */
int main(void)
{
    const size_t buffer_size = (size_t)1024 * 1024 * 100;
    char* pCpuBuffer = NULL;
    char* pCudaBuffer[2] = { NULL, NULL };
    cudaStream_t s[2];
    cudaEvent_t e[2][3];

    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 3; j++)
            CUDA_CHECK(cudaEventCreate(&e[i][j]));
        CUDA_CHECK(cudaStreamCreate(&s[i]));
        CUDA_CHECK(cudaMalloc(&pCudaBuffer[i], buffer_size));
    }
    // Pinned host memory, so cudaMemcpyAsync can run asynchronously.
    CUDA_CHECK(cudaMallocHost(&pCpuBuffer, buffer_size));

    for (int i = 0; i < 7; i++)
    {
        if (i < 6)
        {
            const int ii = i % 2;
            CUDA_CHECK(cudaEventRecord(e[ii][0], s[ii]));
            CUDA_CHECK(cudaMemcpyAsync(pCudaBuffer[ii], pCpuBuffer, buffer_size,
                                       cudaMemcpyHostToDevice, s[ii]));
            CUDA_CHECK(cudaEventRecord(e[ii][1], s[ii]));
            kernel<<<1024, 1024, 0, s[ii]>>>(pCudaBuffer[ii]);
            CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors
            CUDA_CHECK(cudaEventRecord(e[ii][2], s[ii]));

            // NOTE(review): on Windows WDDM drivers the runtime may hold queued
            // commands in a software batch until something forces submission,
            // so this batch might not start while the host blocks on the other
            // stream's event below. A non-blocking stream query is the usual
            // way to flush that batch; it is harmless on other platforms.
            // Confirm the effect with a profiler on the target system.
            const cudaError_t flush = cudaStreamQuery(s[ii]);
            if (flush == cudaErrorNotReady)
                (void)cudaGetLastError();  // "not ready" is expected; clear it
            else
                CUDA_CHECK(flush);
        }
        if (i > 0)
        {
            const int ii = (i - 1) % 2;
            float t1, t2;
            // Wait only for the batch enqueued in the previous iteration.
            CUDA_CHECK(cudaEventSynchronize(e[ii][2]));
            CUDA_CHECK(cudaEventElapsedTime(&t1, e[ii][0], e[ii][1]));
            CUDA_CHECK(cudaEventElapsedTime(&t2, e[ii][1], e[ii][2]));
            printf("%d: copy in %4d ms, kernel %4d ms\n", ii, (int)t1, (int)t2);
        }
    }

    // Every batch has been waited on above, so the resources can be released.
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 3; j++)
            CUDA_CHECK(cudaEventDestroy(e[i][j]));
        CUDA_CHECK(cudaStreamDestroy(s[i]));
        CUDA_CHECK(cudaFree(pCudaBuffer[i]));
    }
    CUDA_CHECK(cudaFreeHost(pCpuBuffer));
    return 0;
}
result: