How to yield CPU When I use cudaEventSynchronize?

Following is my code:

#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>

global void compute()
{
char a[100][100];
char b[100][100];
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 100; j++) {
for (int k = 0; k < 100; k++) {
a[i][j] = a[i][k] * b[k][j];
}
}
}
}

int main(void)
{
void *h_buff = malloc(1024 * 1024 * 10);
void *d_buff = NULL;
cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceScheduleYield);
cudaMalloc((void **)&d_buff, 1024 * 1024 * 10);
cudaEvent_t event;
cudaEventCreateWithFlags(&event, cudaEventBlockingSync /cudaEventDefault/);
cudaStream_t pStream;
cudaStreamCreate(&pStream);
int i = 0;
for (i = 0; i < 1024 * 1024; i++) {
// compute<<<1, 1, 0, pStream>>>();
cudaMemcpyAsync(h_buff, d_buff, 1024 * 1024 * 10, cudaMemcpyDeviceToHost, pStream);
}
cudaEventRecord(event, pStream);
cudaEventSynchronize(event);
return 0;
}

when it is running, the result of top showed that cpu is 100% busy…

So, I want to know how to yield CPU? Is there anyone can help me with it?

Thanks!