Asynchronous problem with cudaMalloc

Hi,
I read the post https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/#disqus_thread and tried the following example from it:

const int N = 1 << 20;

__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

int main()
{
    const int num_streams = 8;

    cudaStream_t streams[num_streams];
    float *data[num_streams];

    for (int i = 0; i < num_streams; i++) {
        cudaStreamCreate(&streams[i]);

        cudaMalloc(&data[i], N * sizeof(float));

        // launch one worker kernel per stream
        kernel<<<1, 64, 0, streams[i]>>>(data[i], N);

        // launch a dummy kernel on the default stream
        kernel<<<1, 1>>>(0, 0);
    }

    cudaDeviceReset();

    return 0;
}

I have a question about cudaMalloc. The CUDA C Programming Guide says that device memory allocation is synchronous, yet in this example the cudaMalloc calls inside the loop do not seem to hurt the asynchronous execution of the kernels across streams. However, if I add a single cudaFree inside the loop, it breaks the asynchronous execution and makes the program much slower.
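To make the question concrete, this is roughly the variant I mean, with one cudaFree added inside the per-stream loop (a minimal sketch, not my exact test code):

// Same loop as above, but freeing each buffer at the end of the iteration.
for (int i = 0; i < num_streams; i++) {
    cudaStreamCreate(&streams[i]);

    cudaMalloc(&data[i], N * sizeof(float));

    // launch one worker kernel per stream
    kernel<<<1, 64, 0, streams[i]>>>(data[i], N);

    // launch a dummy kernel on the default stream
    kernel<<<1, 1>>>(0, 0);

    // with this call added, the kernels no longer run concurrently
    // and the whole program becomes much slower
    cudaFree(data[i]);
}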

Can anyone help me understand why cudaMalloc and cudaFree behave so differently here?