GPU Broadcast in CUDA 4.0

I am running CUDA 4.0 on Linux using CUDA’s SDK as development framework.

I am running on 4 devices split between 2 GTX 590s.

I wanted to try to improve the bandwidth of a Host to Device broadcast using CUDA 4.0. For the following code I get a measured bandwidth of 1.45GB/s and no verification errors (it works fine!):

...

    cutilSafeCall(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    printf("Synchronous Broadcast:");

    __QC_START_TIMER //timer macro

    for(int i=0; i<trials; i++){

        //copy to g0 on device0

        cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

        //copy to g1 on device1

        cutilSafeCall(cudaMemcpy(g1, h0, buf_size, cudaMemcpyDefault));

        //copy to g2 on device2

        cutilSafeCall(cudaMemcpy(g2, h0, buf_size, cudaMemcpyDefault));

        //copy to g3 on device3

        cutilSafeCall(cudaMemcpy(g3, h0, buf_size, cudaMemcpyDefault));

    }

    __QC_STOP_TIMER //timer macro

    printf(" %.2fGB/s\n",(1.0f / (__QC_ELAPSED_TIME / 1000.0f)) * (float(trials * buf_size)) / 1000.0f / 1000.0f / 1000.0f);

    VERIFY_DATA(); //verification macro

...

However, I thought I could speed up the bandwidth a bit by using cudaMemcpyAsync(); I get 4.32GB/s bandwidth, but the verification fails on g1, g2 and g3 for this code:

...

    //streams were created on their respective devices

    printf("Asynchronous Broadcast:");

    trials = 1;

    __QC_START_TIMER

    for(int i=0; i<trials; i++){

        cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));

        cutilSafeCall(cudaMemcpyAsync(g0, h0, buf_size, cudaMemcpyDefault, stream[0]));

        cutilSafeCall(cudaSetDevice(gpuid_tesla[1]));

        cutilSafeCall(cudaMemcpyAsync(g1, h0, buf_size, cudaMemcpyDefault, stream[1]));

        cutilSafeCall(cudaSetDevice(gpuid_tesla[2]));

        cutilSafeCall(cudaMemcpyAsync(g2, h0, buf_size, cudaMemcpyDefault, stream[2]));

        cutilSafeCall(cudaSetDevice(gpuid_tesla[3]));

        cutilSafeCall(cudaMemcpyAsync(g3, h0, buf_size, cudaMemcpyDefault, stream[3]));

	

        cutilSafeCall(cudaStreamSynchronize(stream[0]));

        cutilSafeCall(cudaStreamSynchronize(stream[1]));

        cutilSafeCall(cudaStreamSynchronize(stream[2]));

        cutilSafeCall(cudaStreamSynchronize(stream[3]));

        cutilSafeCall(cudaDeviceSynchronize());

    }

	__QC_STOP_TIMER

	printf(" %.2fGB/s\n",(1.0f / (__QC_ELAPSED_TIME / 1000.0f)) * (float(trials * buf_size)) / 1000.0f / 1000.0f / 1000.0f);

	VERIFY_DATA();

...

I seems like I am only recording the time it takes for the first copy to complete (from h0 to g0).

Am I missing something? Do the other copies fail because they are reading from the same pinned host memory (h0)?