bi-directional PCI-E transfer overlap

I have a GTX280 with PCI-E bus.
I intended to hide the bi-directional data transfer time between host and device using two streams.
Here is the test code:

#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>

#define N (4096*4096)
int main (void) {

float2 *in, *out, *i_d, *o_d;
struct timeval tim;
double t1,t2;

cudaHostAlloc( (void **)&in, sizeof(float2)*N, cudaHostAllocDefault);
cudaHostAlloc( (void **)&out, sizeof(float2)*N, cudaHostAllocDefault);

cudaMalloc((void**)&i_d, sizeof(float2)*N);
cudaMalloc((void**)&o_d, sizeof(float2)*N);

cudaStream_t stream1,stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);

gettimeofday(&tim, NULL);
t1=tim.tv_sec+(tim.tv_usec/1000000.0);

cudaMemcpyAsync(i_d, in, sizeof(float2)*N, cudaMemcpyHostToDevice,stream1);
cudaMemcpyAsync(out, o_d, sizeof(float2)*N, cudaMemcpyDeviceToHost,stream2);

cudaThreadSynchronize();
gettimeofday(&tim, NULL);
t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf(" Run time=%f\n",(t2-t1));

cudaFreeHost(in);
cudaFreeHost(out);
cudaFree(i_d);
cudaFree(o_d);

return 0;
}

Stream1 alone takes 40 ms; stream2 alone takes 39 ms.
Together they take 79 ms, so there is no overlap!

PCI-E supports full-duplex communication.
Can anybody explain why this is? Is there a way to overlap at all?
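
(A minimal sketch of timing each copy with CUDA events instead of gettimeofday, reusing the buffers and streams from the code above; it slots in between the stream creation and the cleanup:)

// Assumes in/out/i_d/o_d, stream1/stream2 and N exactly as in the code above.
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);

cudaEventRecord(start, 0);
cudaMemcpyAsync(i_d, in,  sizeof(float2)*N, cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(out, o_d, sizeof(float2)*N, cudaMemcpyDeviceToHost, stream2);
cudaEventRecord(stop1, stream1);            // marks completion of the HtoD copy
cudaEventRecord(stop2, stream2);            // marks completion of the DtoH copy
cudaEventSynchronize(stop1);
cudaEventSynchronize(stop2);

float ms1, ms2;
cudaEventElapsedTime(&ms1, start, stop1);   // HtoD time in milliseconds
cudaEventElapsedTime(&ms2, start, stop2);   // DtoH time in milliseconds
printf("HtoD=%f ms  DtoH=%f ms\n", ms1, ms2);

cudaEventDestroy(start); cudaEventDestroy(stop1); cudaEventDestroy(stop2);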

You can’t overlap both directions simultaneously using cudaMemcpyAsync until Fermi.

Thanks a lot!

If you write your own function that interleaves reads and writes from system RAM, I bet it will overlap.

Just allocate with cudaHostAllocMapped and write your own 2 way memcpy function.
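
A minimal sketch of that host-side setup, assuming a buffer size N as in the code above (the variable names here are just illustrative):

// Host-side setup for zero-copy (mapped) pinned memory.
cudaSetDeviceFlags(cudaDeviceMapHost);          // must be called before the context is created

float2 *in_h, *out_h;                           // host pointers (pinned + mapped)
cudaHostAlloc((void**)&in_h,  sizeof(float2)*N, cudaHostAllocMapped);
cudaHostAlloc((void**)&out_h, sizeof(float2)*N, cudaHostAllocMapped);

float2 *in_dev, *out_dev;                       // device-side aliases of the same memory
cudaHostGetDevicePointer((void**)&in_dev,  (void*)in_h,  0);
cudaHostGetDevicePointer((void**)&out_dev, (void*)out_h, 0);
// A kernel can now read in_dev / write out_dev; each access goes over PCIe.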

I have never used mapped memory.

But what do you mean by a 2-way memcpy function? You should be able to access the memory addresses directly in a kernel if the memory is mapped.

Isn't it the case that mapped memory should only be used on integrated GPUs? Otherwise it is much slower.

Also, is it possible to overlap two-way transfers using two independent pthreads?

I mean this:

__global__ void BiDirectionalMemcpy(uint *dst0, uint *src0, uint *dst1, uint *src1, uint n)
{
  for (uint i = threadIdx.x; i < n; i += blockDim.x)
  {
    dst0[i] = src0[i];
    dst1[i] = src1[i];
  }
}

int main()
{
  cudaSetDeviceFlags(cudaDeviceMapHost);
  BiDirectionalMemcpy<<<1, 512>>>(in_d, in_mapped, out_mapped, out_d, N);	// should use more than 1 block, but this is just an example
}

Mapped memory will be slower than device memory, but obviously it won't be any slower than using cudaMemcpy() to/from host RAM.

No; there's only one GPU, and if the GPU doesn't allow it, the number of host threads won't make a difference.

I tried the following code:

[codebox]#include <cuda.h>
#include <stdio.h>
#include <sys/time.h>

#define N 1024 //(4096*4096)
#define Threads 32 //512
#define Blocks 1 //512
#define Lines (N/(Threads*Blocks))
#define PCI_IN
#define PCI_OUT

__global__ void BiDirectionalMemcpy(float2 *in_d, float2 *in_map,
                                    float2 *out_d, float2 *out_map)
{
  int base = blockDim.x*Lines*Threads + threadIdx.x;
  for (int j = 0; j < Lines; j++)
  {
    int i = base + j*Lines;
#ifdef PCI_IN
    in_d[i].x = in_map[i].x;
    in_d[i].y = in_map[i].y;
#endif
#ifdef PCI_OUT
    out_map[i].x = out_d[i].x;
    out_map[i].y = out_d[i].y;
#endif
  }
}

int main (void) {

float2 *in, *out, *in_map, *out_map, *in_d, *out_d;
struct timeval tim;
double t1,t2;

cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceMapHost);

/* Pinned Mapped Memory */
cudaHostAlloc( (void **)&in, sizeof(float2)*N, cudaHostAllocMapped);
cudaHostAlloc( (void **)&out, sizeof(float2)*N, cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&in_map, (void*)in,0);
cudaHostGetDevicePointer((void**)&out_map, (void*)out,0);

cudaMalloc((void**)&in_d, sizeof(float2)*N);
cudaMalloc((void**)&out_d, sizeof(float2)*N);

/* time stamp1 */
cudaThreadSynchronize();
gettimeofday(&tim, NULL);
t1=tim.tv_sec+(tim.tv_usec/1000000.0);

BiDirectionalMemcpy<<<Blocks, Threads>>>(in_d, in_map, out_d, out_map);

/* time stamp2 */
cudaThreadSynchronize();
gettimeofday(&tim, NULL);
t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf(" Run time=%f\n",(t2-t1));

cudaFreeHost(in);
cudaFreeHost(out);
cudaFree(in_d);
cudaFree(out_d);

return 0;
}
[/codebox]

I am not sure whether there is overlap or not, but it is very slow if I did it correctly.

The PCI_OUT part is probably removed by nvcc, yet the PCI_IN part, for 1024 float2 elements, takes 12 ms.

cudaMemcpy of 16M float2 elements takes only 40 ms.

I tried your program and got these results for N = 4096 * 1024:

total transfer size = 64 MiB

BiDirectionalMemcpy: Tmin = 9.1ms Tmax = 24ms Tavg = 9.2ms

2 cudaMemcpyAsync: Tmin = 11.7ms Tmax = 24.7ms Tavg = 11.9 ms

Theoretical unidirectional PCIe x16 bandwidth =

5 GHz * (8 payload bits / 10 carrier bits) * (1 byte / 8 bits) * 16 lanes * 0.5 = 3815 MiB/s

Theoretical transfer time: 16.8 ms without overlap, 8.39 ms with complete overlap.

Update: this is a factor of 2 off (see below)

So both cudaMemcpyAsync() & BiDirectionalMemcpy() overlap the transfers, despite what Tim said. I'm using a Tesla C1060. Here, PCIe is much more of a bottleneck than memory bandwidth, but in general a memory bandwidth bottleneck will prevent complete overlap.

I found I needed multiple trials in order for the cudaMemcpy speed to ramp up, probably because the first call to cudaMemcpy does extra initialization. BTW, your BiDirectionalMemcpy() is incorrect. Also, you should check for errors in CUDA code (easiest is with CUDA_SAFE_CALL), especially when you're allocating such big pinned arrays!
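
(A minimal error-checking sketch in the spirit of CUDA_SAFE_CALL; the macro name CUDA_CHECK is mine, not from the SDK:)

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Wrap every runtime call so failures (e.g. a pinned allocation running out
// of lockable memory) are reported instead of silently ignored.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err = (call);                                             \
    if (err != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                        \
              cudaGetErrorString(err), __FILE__, __LINE__);               \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Usage: CUDA_CHECK(cudaHostAlloc((void**)&in, sizeof(float2)*N, cudaHostAllocMapped));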

#define N (4096*1024)
#define Threads 512
#define Blocks 120
#define PCI_IN
#define PCI_OUT

__global__ void BiDirectionalMemcpy(float2 *in_d, float2 *in_map,
                                    float2 *out_d, float2 *out_map)
{
  int totalThreads = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += totalThreads)
  {
#ifdef PCI_IN
    in_d[i].x = in_map[i].x;
    in_d[i].y = in_map[i].y;
#endif
#ifdef PCI_OUT
    out_map[i].x = out_d[i].x;
    out_map[i].y = out_d[i].y;
#endif
  }
}

Don't worry if you don't get overlap. I've asked Tim and he insists current hardware doesn't support it. Earlier you reported that cudaMemcpyAsync didn't overlap, which probably means the hardware doesn't support it. Maybe NVIDIA thinks overlapped transfers are for high-end applications and people who can afford $2000 Tesla boards.

I’ll test it on my GTX 260 core216 at home and see if that’s the case.

It has nothing to do with Tesla; GT200 just can't do it, while GF100 (and not just the Tesla variants) can.

Thanks for the help.

My BiDirectionalMemcpy function was a mess…

Now I basically get the same result as using cudaMemcpyAsync:

10 ms for PCI_IN, 9.9 ms for PCI_OUT, and 20 ms for both. There is no overlap.

I thought it was PCI-E 1.0, but the GTX280 and the Tesla use PCI-E 2.0, which has 8 GB/s unidirectional bandwidth.

Therefore all results are within this limit.

Well Tim, according to yourself here,

This is exactly what I stated here and you ignored.

Please READ THE DETAILS when answering a non-trivial question and avoid misleading replies. This isn't the first time you have totally missed my question. I earlier asked here if transferring directly from a PCIe device to GPU RAM is possible and you said no, period. Later, I found out it is clearly doable on Quadro boards.

Can GT200 overlap cuMemcpyDtoHAsync and cuMemcpyHtoDAsync in different streams? No. Your calculations are way off, because the maximum measured PCIe speed (for Gen2) is roughly 6 GB/s. At that point, the fastest your transfer could run is 10.67 ms. Which is to say, hey, no overlap!

Can GF100 overlap cuMemcpyDtoHAsync and cuMemcpyHtoDAsync in different streams? Yes, that’s a major architectural improvement.

Can GT200 overlap DtoH and HtoD transfers via zero-copy? Yes, it can.

Can GF100 do the same? Yes.

2 men say they’re Jesus. 1 has to be wrong.

Wikipedia - PCI Express

[b]16-lane slot:
* v1.x: 4 GB/s
* v2.0: 8 GB/s
* v3.0: 16 GB/s[/b]

I didn’t say maximum theoretical, I said maximum measured of Gen2 PCIe. Wikipedia’s stats are per direction, by the way, not aggregate.

[codebox]c:\Users\All Users\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\bin\win64\Debug>bandwidthTest.exe --memory=pinned
[bandwidthTest]
bandwidthTest.exe Starting…

Running on…
Device 0: Tesla C1060
Quick Mode

affinity mask: 1
affinity mask: 1
Host to Device Bandwidth, 1 Device(s), Pinned memory, Write-Combined Memory Enabled
Transfer Size (Bytes)   Bandwidth(MB/s)
33554432                5751.5

affinity mask: 1
affinity mask: 1
Device to Host Bandwidth, 1 Device(s), Pinned memory, Write-Combined Memory Enabled
Transfer Size (Bytes)   Bandwidth(MB/s)
33554432                5292.6

affinity mask: 1
affinity mask: 1
Device to Device Bandwidth, 1 Device(s)
Transfer Size (Bytes)   Bandwidth(MB/s)
33554432                74010.5

[bandwidthTest] - Test results:
PASSED

Press to Quit…
[/codebox]

I know exactly how the hardware is implemented, how the driver is implemented, and have looked at PCIe effects in plenty of systems. I even released a bidirectional bandwidth test a couple of weeks ago.

Whoops, my mistake. Apparently cudaMemcpyAsync() doesn't overlap. The amount of overlap for BiDirectionalMemcpy() is much smaller than I found earlier. The mistake was that the code had a 2-way memory bank conflict, probably making the results less meaningful. After rewriting it, here are the times:

32 MiB transfer per direction

Tavg_host2device = 6.9 ms => 4600 MiB/s
Tavg_both = 9.5 ms => 6700 MiB/s (aggregate)

Previously, I thought incorrectly that the PCIe 2.0 unidirectional speed was 250 MB/s per lane (about 3815 MiB/s for x16), since bandwidthTest reports 3800 MiB/s for non-pinned memory copies, which I took to be the maximum PCIe x16 speed.

You lost me.

What is the bank conflict you are talking about?

I meant non-coalesced accesses, not bank conflict.

in_d[i].x=in_map[i].x clearly doesn’t access contiguous locations across threads. I just changed float2 to float.
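
(For concreteness, a sketch of what that float version looks like; the kernel name and the n parameter are mine, and n would be 2*N when the float2 buffers are reinterpreted as plain floats:)

// Grid-stride copy over plain floats: consecutive threads touch consecutive
// 4-byte words, so both loads and stores are fully coalesced on CC 1.x.
__global__ void BiDirectionalMemcpyF(float *in_d, float *in_map,
                                     float *out_d, float *out_map, int n)
{
  int totalThreads = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += totalThreads)
  {
    in_d[i]    = in_map[i];     // host -> device over PCIe
    out_map[i] = out_d[i];      // device -> host over PCIe
  }
}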

Actually, I think your following code using float2 is coalesced for compute capability 1.2:

[codebox]
__global__ void BiDirectionalMemcpy(float2 *in_d, float2 *in_map,
                                    float2 *out_d, float2 *out_map)
{
  int totalThreads = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += totalThreads)
  {
#ifdef PCI_IN
    in_d[i].x = in_map[i].x;
    in_d[i].y = in_map[i].y;
#endif
  }
}
[/codebox]

This should result in one 128-byte read/write for each half-warp, shouldn't it?

BTW, even using float, I see no overlap anyway.

The NVIDIA driver version is 190.18 and the CUDA toolkit version is 2.3.

Tim, can you explain why the GTX280 and the Tesla give different results?

Can anybody tell whether this is PCI-E 1.0 or 2.0 from the lspci output:

lspci
00:00.0 Host bridge: nVidia Corporation C55 Host Bridge (rev a2)
00:00.1 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:00.2 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:00.3 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:00.4 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:00.5 RAM memory: nVidia Corporation C55 Memory Controller (rev a2)
00:00.6 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:00.7 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.0 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.1 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.2 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.3 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.4 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.5 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:01.6 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:02.0 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:02.1 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:02.2 RAM memory: nVidia Corporation C55 Memory Controller (rev a1)
00:03.0 PCI bridge: nVidia Corporation C55 PCI Express bridge (rev a1)
00:09.0 RAM memory: nVidia Corporation MCP55 Memory Controller (rev a1)
00:0a.0 ISA bridge: nVidia Corporation MCP55 LPC Bridge (rev a2)
00:0a.1 SMBus: nVidia Corporation MCP55 SMBus (rev a2)
00:0a.2 RAM memory: nVidia Corporation MCP55 Memory Controller (rev a2)
00:0b.0 USB Controller: nVidia Corporation MCP55 USB Controller (rev a1)
00:0b.1 USB Controller: nVidia Corporation MCP55 USB Controller (rev a2)
00:0d.0 IDE interface: nVidia Corporation MCP55 IDE (rev a1)
00:0e.0 IDE interface: nVidia Corporation MCP55 SATA Controller (rev a2)
00:0e.1 IDE interface: nVidia Corporation MCP55 SATA Controller (rev a2)
00:0e.2 IDE interface: nVidia Corporation MCP55 SATA Controller (rev a2)
00:0f.0 PCI bridge: nVidia Corporation MCP55 PCI bridge (rev a2)
00:0f.1 Audio device: nVidia Corporation MCP55 High Definition Audio (rev a2)
00:11.0 Bridge: nVidia Corporation MCP55 Ethernet (rev a2)
00:12.0 Bridge: nVidia Corporation MCP55 Ethernet (rev a2)
01:00.0 VGA compatible controller: nVidia Corporation Device 05e1 (rev a1)
02:07.0 FireWire (IEEE 1394): Texas Instruments TSB43AB22/A IEEE-1394a-2000 Controller (PHY/Link)

Yes, it uses one 128-byte transaction, but half of the data in each transaction would be unused, hence wasted bandwidth.

Now that you mention it, an easier way to avoid the problem is to have each thread issue one 8-byte load instead of two 4-byte ones, as sketched below.
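
A minimal sketch of that variant under the same setup as the float2 code above (N is the same #define); assigning whole float2 values lets the compiler issue one 8-byte load and one 8-byte store per element:

// Copy each float2 as a single 8-byte value instead of two 4-byte halves,
// so each thread's access is one naturally aligned 8-byte transaction.
__global__ void BiDirectionalMemcpy(float2 *in_d, float2 *in_map,
                                    float2 *out_d, float2 *out_map)
{
  int totalThreads = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += totalThreads)
  {
    in_d[i]    = in_map[i];     // one 8-byte load + store (host -> device)
    out_map[i] = out_d[i];      // one 8-byte load + store (device -> host)
  }
}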