Why does cudaEventSynchronize block other streams?

I use cuda for image sequence processing, hoping to achieve overlap between data copy and kernel .

But when I call cudaEventSynchronize it blocks other streams.
What’s the reason?

System: windows 10 x64
Graphics card: 1050 Ti
11.6 sdk


The code is as follows:
#include <stdio.h>

void __global__ kernel(char* in)
{
	char* pIn = in + blockDim.x * blockIdx.x + threadIdx.x * 100;
	for (int i = 0; i < 50; i++)
		pIn[i] = i;
}

int main(void)
{
	int buffer_size = 1024 * 1024 * 100;
	char* pCpuBuffer;
	char* pCudaBuffer[2];
	cudaStream_t s[2];
	cudaEvent_t e[2][3];
	for (int i = 0; i < 2; i++)
	{
		for (int j = 0; j < 3; j++)
			cudaEventCreate(&e[i][j]);
		cudaStreamCreate(&s[i]);
		cudaMalloc(&pCudaBuffer[i], buffer_size);
	}
	cudaMallocHost(&pCpuBuffer, buffer_size);

	for (int i = 0; i < 7; i++)
	{
		if (i < 6)
		{
			int ii = i % 2;
			cudaEventRecord(e[ii][0], s[ii]);
			cudaMemcpyAsync(pCudaBuffer[ii], pCpuBuffer, buffer_size, cudaMemcpyHostToDevice, s[ii]);
			cudaEventRecord(e[ii][1], s[ii]);
			kernel << <1024, 1024, 0, s[ii] >> > (pCudaBuffer[ii]);
			cudaEventRecord(e[ii][2], s[ii]);
		}
		if (i > 0)
		{
			int ii = (i - 1) % 2;
			float t1, t2;
			cudaEventSynchronize(e[ii][2]);
			cudaEventElapsedTime(&t1, e[ii][0], e[ii][1]);
			cudaEventElapsedTime(&t2, e[ii][1], e[ii][2]);
			printf("%d: copy in %4d ms, kernel %4d ms\n", ii, (int)t1, (int)t2);

		}
	}
	return 0;
}

result:

  • The following code also gets another wrong result

void __global__ kernel(char* in)
{
	char* pIn = in + blockDim.x * blockIdx.x + threadIdx.x * 100;
	for (int i = 0; i < 50; i++)
		pIn[i] = i;
}

int main(void)
{
	int buffer_size = 1024 * 1024 * 100;
	char* pCpuBuffer;
	char* pCudaBuffer[2];
	cudaStream_t s[2];
	for (int i = 0; i < 2; i++)
	{
		cudaStreamCreate(&s[i]);
		cudaMalloc(&pCudaBuffer[i], buffer_size);
	}
	cudaMallocHost(&pCpuBuffer, buffer_size);

	for (int i = 0; i < 7; i++)
	{
		if (i < 6)
		{
			cudaMemcpyAsync(pCudaBuffer[i % 2], pCpuBuffer, buffer_size, cudaMemcpyHostToDevice, s[i % 2]);
			kernel << <1024, 1024, 0, s[i % 2] >> > (pCudaBuffer[i % 2]);
		}
		if (i > 0) cudaStreamSynchronize(s[(i - 1) % 2]);
	}
	return 0;
}