Concurrent bandwidth test

I noticed that when multi gpu is disabled, cuda reports only 1 device for GTX295, why???

GTX295 + 9800GTX, 790i, quite unimpressive

[font=“Courier New”]Device 0 took 3056.922363 ms

Device 1 took 3056.153564 ms

Device 2 took 2621.760254 ms

Average HtoD bandwidth in MB/s: 6628.852578

Device 0 took 2920.098633 ms

Device 1 took 3001.642822 ms

Device 2 took 2021.621704 ms

Average DtoH bandwidth in MB/s: 7489.647805[/font]

Windows code (I hope that normally NVIDIA code does not look like this)

[codebox]/*

  • Copyright 1993-2009 NVIDIA Corporation. All rights reserved.

  • NOTICE TO USER:

  • This source code is subject to NVIDIA ownership rights under U.S. and

  • international Copyright laws. Users and possessors of this source code

  • are hereby granted a nonexclusive, royalty-free license to use this code

  • in individual and commercial software.

  • NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

  • CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR

  • IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH

  • REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

  • MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

  • IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

  • OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

  • OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

  • OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE

  • OR PERFORMANCE OF THIS SOURCE CODE.

  • U.S. Government End Users. This source code is a “commercial item” as

  • that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of

  • “commercial computer software” and "commercial computer software

  • documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)

  • and is provided to the U.S. Government only as a commercial end item.

  • Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

  • 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

  • source code with only those rights set forth herein.

  • Any use of this source code in individual and commercial software must

  • include, in the user documentation and internal comments to the code,

  • the above Disclaimer and U.S. Government End Users Notice.

*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <windows.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES];

unsigned int numDevices;

HANDLE condvar;

HANDLE devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

HANDLE backEvents[MAX_DEVICES];

typedef union data_t

{

float f;

unsigned int ui;

} PackedType;

PackedType vars[MAX_DEVICES];

// Thread entry: measures host-to-device copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. The thread signals backEvents[devID] once it is set up and
// then blocks on the shared manual-reset 'condvar' so that all device
// threads begin their timed copies at the same moment.
DWORD WINAPI testBandwidthHtoD(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
void* loc1;        // pinned host source buffer
CUdeviceptr loc2;  // device destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
  printf("Creating a context with devID %u failed, aborting\n", devID);
  ExitThread(1);
}
if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAllocHost failed, aborting\n");
  cuCtxDestroy(ctx);
  ExitThread(1);
}
if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAlloc failed, aborting\n");
  cuMemFreeHost(loc1);
  cuCtxDestroy(ctx);
  ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
  if (cuMemcpyHtoD(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
  {
    printf("cuMemcpyHtoD failed!\n");
  }
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// Release everything we acquired (the original leaked the buffers, the
// events, and the context).
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc2);
cuMemFreeHost(loc1);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures device-to-host copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Uses the backEvents/condvar rendezvous so all device
// threads start their timed copies together.
DWORD WINAPI testBandwidthDtoH(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUdeviceptr loc1;  // device source buffer
void* loc2;        // pinned host destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
  printf("Creating a context with devID %u failed, aborting\n", devID);
  ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAllocHost failed, aborting\n");
  cuCtxDestroy(ctx);
  ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAlloc failed, aborting\n");
  cuMemFreeHost(loc2);
  cuCtxDestroy(ctx);
  ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
  if (cuMemcpyDtoH(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
  {
    printf("cuMemcpyDtoH failed!\n");
  }
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// Release everything we acquired (the original leaked the buffers, the
// events, and the context).
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc1);
cuMemFreeHost(loc2);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

int main (int argc, char** argv)

{

if (argc == 1)

{

  printf("usage: %s deviceID deviceID...\n", argv[0]);

  exit(1);

}

if (cuInit(0) != CUDA_SUCCESS)

{

  printf("cuInit failed, aborting...\n");

  exit(1);

}

for (int i = 0; i < argc - 1; i++)

{

  int dev = atoi(argv[i+1]);

  CUdevice device;

  if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)

{

  printf("Could not get device %d, aborting\n", dev);

  exit(1);

}

  devices[i] = dev;

}

numDevices = argc - 1;

condvar = CreateEvent(NULL, true, false, NULL);

for (int i = 0; i < numDevices; i++)

{

  backEvents[devices[i]] = CreateEvent(NULL, false, false, NULL);

  vars[devices[i]].ui = devices[i];

  devThreads[i] = CreateThread(NULL, 0, testBandwidthHtoD, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

void* returnVal = 0;

float maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

PackedType d = vars[devices[i]];

  printf("Device %u took %f ms\n", devices[i], d.f);

  elapsedTimes[i] = d.f;

  if (d.f > maxElapsedTime)

maxElapsedTime = d.f;

}

double bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

  bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

}

printf(“Average HtoD bandwidth in MB/s: %f\n”, bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++)

{

  vars[devices[i]].ui = devices[i];

  devThreads[i] = CreateThread(NULL, 0, testBandwidthDtoH, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

  WaitForSingleObject(devThreads[i], INFINITE);

  PackedType d = vars[devices[i]];

  printf("Device %u took %f ms\n", devices[i], d.f);

  elapsedTimes[i] = d.f;

  if (d.f > maxElapsedTime)

maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

}

printf(“Average DtoH bandwidth in MB/s: %f\n”, bandwidthInMBs);

}

[/codebox]

Driver code is much, much cleaner–I just write hacky things in my spare time to stress things that I care about.

Are you sure that the computation of bandwidth is correct?

To me it seems like it should be

(data1 + data2 + … + datan) / max time

and not

data1/time1 + data2/time2 + … + datan/timen

Polished source code for Windows:

[codebox]/*

  • Copyright 1993-2009 NVIDIA Corporation. All rights reserved.

  • NOTICE TO USER:

  • This source code is subject to NVIDIA ownership rights under U.S. and

  • international Copyright laws. Users and possessors of this source code

  • are hereby granted a nonexclusive, royalty-free license to use this code

  • in individual and commercial software.

  • NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

  • CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR

  • IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH

  • REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

  • MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

  • IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

  • OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

  • OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

  • OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE

  • OR PERFORMANCE OF THIS SOURCE CODE.

  • U.S. Government End Users. This source code is a “commercial item” as

  • that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of

  • “commercial computer software” and "commercial computer software

  • documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)

  • and is provided to the U.S. Government only as a commercial end item.

  • Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

  • 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

  • source code with only those rights set forth herein.

  • Any use of this source code in individual and commercial software must

  • include, in the user documentation and internal comments to the code,

  • the above Disclaimer and U.S. Government End Users Notice.

*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <windows.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES];

unsigned int devices_r[MAX_DEVICES];

unsigned int numDevices;

HANDLE condvar;

HANDLE devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

HANDLE backEvents[MAX_DEVICES];

typedef union data_t

{

float f;

unsigned int ui;

} PackedType;

PackedType vars[MAX_DEVICES];

// Thread entry: measures host-to-device copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Signals backEvents[devID] when set up, then waits on the
// shared manual-reset 'condvar' so all device threads start together.
DWORD WINAPI testBandwidthHtoD(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
void* loc1;        // pinned host source buffer
CUdeviceptr loc2;  // device destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAllocHost failed, aborting\n");
	cuCtxDestroy(ctx);
	ExitThread(1);
}
if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAlloc failed, aborting\n");
	cuMemFreeHost(loc1);
	cuCtxDestroy(ctx);
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
	if (cuMemcpyHtoD(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
	{
		printf("cuMemcpyHtoD failed!\n");
	}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// The original freed the buffers but leaked the events and the context.
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc2);
cuMemFreeHost(loc1);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures device-to-host copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Uses the backEvents/condvar rendezvous so all device
// threads start their timed copies together.
DWORD WINAPI testBandwidthDtoH(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUdeviceptr loc1;  // device source buffer
void* loc2;        // pinned host destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAllocHost failed, aborting\n");
	cuCtxDestroy(ctx);
	ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAlloc failed, aborting\n");
	cuMemFreeHost(loc2);
	cuCtxDestroy(ctx);
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
	if (cuMemcpyDtoH(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
	{
		printf("cuMemcpyDtoH failed!\n");
	}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// The original freed the buffers but leaked the events and the context.
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc1);
cuMemFreeHost(loc2);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures aggregate bidirectional bandwidth on one CUDA
// device by issuing a DtoH and an HtoD async copy on two separate streams
// each iteration, synchronizing the context after every pair.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// elapsed time in milliseconds is written back through the same union as .f.
DWORD WINAPI testBandwidthBidirectional(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUstream stream1, stream2;
CUdeviceptr loc1, loc3;  // device buffers: DtoH source, HtoD destination
void* loc2, *loc4;       // pinned host buffers: DtoH dest, HtoD source
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
	// The original printed "cuStreamCreate failed" here — wrong call name.
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuStreamCreate(&stream1, 0) != CUDA_SUCCESS) {
	printf("cuStreamCreate failed\n");
	ExitThread(1);
}
if (cuStreamCreate(&stream2, 0) != CUDA_SUCCESS) {
	printf("cuStreamCreate failed\n");
	ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAllocHost failed, aborting\n");
	ExitThread(1);
}
if (cuMemAllocHost(&loc4, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAllocHost failed, aborting\n");
	ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAlloc failed, aborting\n");
	ExitThread(1);
}
if (cuMemAlloc(&loc3, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAlloc failed, aborting\n");
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
	if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, stream1) != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed!\n");
	}
	if (cuMemcpyHtoDAsync(loc3, loc4, MEMCOPY_SIZE, stream2) != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed!\n");
	}
	// Drain both streams so each iteration overlaps exactly one copy pair.
	cuCtxSynchronize();
}
cuEventRecord(stop, 0);
cuCtxSynchronize();
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
arg.f = elapsedTime;
// Release everything acquired (streams, events, and context leaked before).
cuEventDestroy(start);
cuEventDestroy(stop);
cuStreamDestroy(stream1);
cuStreamDestroy(stream2);
cuMemFree(loc1);
cuMemFree(loc3);
cuMemFreeHost(loc2);
cuMemFreeHost(loc4);
cuCtxDestroy(ctx);
return 0;
}

int main (int argc, char** argv)

{

if (argc == 1)

{

	printf("usage: %s deviceID deviceID...\n", argv[0]);

	exit(1);

}

if (cuInit(0) != CUDA_SUCCESS)

{

	printf("cuInit failed, aborting...\n");

	exit(1);

}

int j = 0;

for (int i = 0; i < argc - 1; i++)

{

	int dev = atoi(argv[i+1]);

	CUdevice device;

	if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)

	{

		printf("Could not get device %d, aborting\n", dev);

		exit(1);

	}

	if(dev < 16 && devices_r[dev] == 0)

	{

		devices[j] = dev;

		devices_r[dev] = i+1;

		j++;

	} 

}

numDevices = j;

condvar = CreateEvent(NULL, true, false, NULL);

for (int i = 0; i < numDevices; i++)

{

	backEvents[devices[i]] = CreateEvent(NULL, false, false, NULL);

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthHtoD, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

void* returnVal = 0;

float maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

double bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average HtoD bandwidth in MB/s: %f\n", bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++)

{

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthDtoH, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average DtoH bandwidth in MB/s: %f\n", bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++) {

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthBidirectional, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++) {

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++) {

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * 2 * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average bidirectional bandwidth in MB/s: %f\n", bandwidthInMBs);

}

[/codebox]

So, for those of us without machines yet, but planning to take the plunge soon, what’s the take home lesson from all this?

What I’ve gathered is that using one card is fine, but using two at once saturates the PCIe controller (or something in that area) creating a bottleneck. Does this apply to AMD mobos, Intel mobos, etc? Specifically how does this play out for 790FX/890FX vs. nForce 980a vs. X58 solutions? If I’m concerned about being bandwidth limited, should I shell out the extra $$$ for a X58 /Core i7 solution vs AMD or nVidia’s offerings?

Regards,
Martin

Get the following message when we try to run it for a C2050 card:
Creating a conext with devID 0 failed, aborting

I also would be interested in the answer to this question. Additionally would somebody kindly tell me what kind of bandwidth I can get if I want to do the following?

  1. Transfer 8 50 MB or 100 MB chunks from non-overlapping parts of the host memory to 8 different GPU boards? Would they go in parallel? Would increasing the chunk sizes help? Is there an optimal or minimum chunk size?= to achieve the peak transfer rates?

  2. What about transferring between 4 pairs of GPU boards?

TIA

Anil

Tylersburg + GTX295:

PCI-E Bandwidth test: Device 0

---------------------------------------------------- 

Device 0 took 1129.994263 ms

Average Host-to-Device bandwidth in MB/s: 5663.745570

Device 0 took 1195.939331 ms

Average Device-to-Host bandwidth in MB/s: 5351.442029

Device 0 took 2408.072998 ms

Average bi-directional bandwidth in MB/s: 5315.453481

PCI-E Bandwidth test: Device 1 3

---------------------------------------------------- 

Device 1 took 1126.713745 ms

Device 3 took 1138.142090 ms

Average Host-to-Device bandwidth in MB/s: 11303.435494

Device 1 took 1446.144409 ms

Device 3 took 1432.187256 ms

Average Device-to-Host bandwidth in MB/s: 8894.250188

Device 1 took 2439.152344 ms

Device 3 took 2419.308594 ms

Average bi-directional bandwidth in MB/s: 10538.492497

PCI-E Bandwidth test: Device 0 1 2

---------------------------------------------------- 

Device 0 took 2244.541016 ms

Device 1 took 2247.074219 ms

Device 2 took 1131.505371 ms

Average Host-to-Device bandwidth in MB/s: 11355.692213

Device 0 took 2392.018066 ms

Device 1 took 2459.899902 ms

Device 2 took 1433.580200 ms

Average Device-to-Host bandwidth in MB/s: 9741.644330

Device 0 took 36331.773438 ms

Device 1 took 36270.406250 ms

Device 2 took 35048.425781 ms

Average bi-directional bandwidth in MB/s: 1070.422476

PCI-E Bandwidth test: Device 0 1 2 3

---------------------------------------------------- 

Device 0 took 2249.097412 ms

Device 1 took 2232.135986 ms

Device 2 took 2251.639404 ms

Device 3 took 2251.724854 ms

Average Host-to-Device bandwidth in MB/s: 11397.433759

Device 0 took 3327.489014 ms

Device 1 took 4061.572266 ms

Device 2 took 4162.840332 ms

Device 3 took 3516.382324 ms

Average Device-to-Host bandwidth in MB/s: 6856.580913

Device 0 took 64065.414063 ms

Device 1 took 64817.562500 ms

Device 2 took 64863.097656 ms

Device 3 took 64817.652344 ms

Average bi-directional bandwidth in MB/s: 792.088911

Tylersburg + GTX295:

PCI-E Bandwidth test: Device 0

---------------------------------------------------- 

Device 0 took 1129.994263 ms

Average Host-to-Device bandwidth in MB/s: 5663.745570

Device 0 took 1195.939331 ms

Average Device-to-Host bandwidth in MB/s: 5351.442029

Device 0 took 2408.072998 ms

Average bi-directional bandwidth in MB/s: 5315.453481

PCI-E Bandwidth test: Device 1 3

---------------------------------------------------- 

Device 1 took 1126.713745 ms

Device 3 took 1138.142090 ms

Average Host-to-Device bandwidth in MB/s: 11303.435494

Device 1 took 1446.144409 ms

Device 3 took 1432.187256 ms

Average Device-to-Host bandwidth in MB/s: 8894.250188

Device 1 took 2439.152344 ms

Device 3 took 2419.308594 ms

Average bi-directional bandwidth in MB/s: 10538.492497

PCI-E Bandwidth test: Device 0 1 2

---------------------------------------------------- 

Device 0 took 2244.541016 ms

Device 1 took 2247.074219 ms

Device 2 took 1131.505371 ms

Average Host-to-Device bandwidth in MB/s: 11355.692213

Device 0 took 2392.018066 ms

Device 1 took 2459.899902 ms

Device 2 took 1433.580200 ms

Average Device-to-Host bandwidth in MB/s: 9741.644330

Device 0 took 36331.773438 ms

Device 1 took 36270.406250 ms

Device 2 took 35048.425781 ms

Average bi-directional bandwidth in MB/s: 1070.422476

PCI-E Bandwidth test: Device 0 1 2 3

---------------------------------------------------- 

Device 0 took 2249.097412 ms

Device 1 took 2232.135986 ms

Device 2 took 2251.639404 ms

Device 3 took 2251.724854 ms

Average Host-to-Device bandwidth in MB/s: 11397.433759

Device 0 took 3327.489014 ms

Device 1 took 4061.572266 ms

Device 2 took 4162.840332 ms

Device 3 took 3516.382324 ms

Average Device-to-Host bandwidth in MB/s: 6856.580913

Device 0 took 64065.414063 ms

Device 1 took 64817.562500 ms

Device 2 took 64863.097656 ms

Device 3 took 64817.652344 ms

Average bi-directional bandwidth in MB/s: 792.088911

Gigabyte X58A-UD9 (2 x nForce 200), i7-980, 3 x GTX580

device: 0 1 2

---------------------------------------------------- 

Device 0 took 2297.387207 ms

Device 1 took 2302.033447 ms

Device 2 took 1166.566772 ms

Average HtoD bandwidth in MB/s: 11052.108154

Device 0 took 2554.130615 ms

Device 1 took 2496.386963 ms

Device 2 took 1561.133179 ms

Average DtoH bandwidth in MB/s: 9169.035889

Device 0 took 3592.909912 ms

Device 1 took 3571.318115 ms

Device 2 took 2607.047607 ms

Average bidirectional bandwidth in MB/s: 12056.451416

device: 1 2

---------------------------------------------------- 

Device 1 took 1173.054077 ms

Device 2 took 1173.096313 ms

Average HtoD bandwidth in MB/s: 10911.491211

Device 1 took 1557.881958 ms

Device 2 took 1557.917847 ms

Average DtoH bandwidth in MB/s: 8216.188965

Device 1 took 2730.019531 ms

Device 2 took 2729.413330 ms

Average bidirectional bandwidth in MB/s: 9378.263672

device: 2

---------------------------------------------------- 

Device 2 took 1124.733765 ms

Average HtoD bandwidth in MB/s: 5690.235352

Device 2 took 1014.834045 ms

Average DtoH bandwidth in MB/s: 6306.449707

Device 2 took 2135.481445 ms

Average bidirectional bandwidth in MB/s: 5993.964355

Gigabyte X58A-UD9 (2 x nForce 200), i7-980, 3 x GTX580

device: 0 1 2

---------------------------------------------------- 

Device 0 took 2297.387207 ms

Device 1 took 2302.033447 ms

Device 2 took 1166.566772 ms

Average HtoD bandwidth in MB/s: 11052.108154

Device 0 took 2554.130615 ms

Device 1 took 2496.386963 ms

Device 2 took 1561.133179 ms

Average DtoH bandwidth in MB/s: 9169.035889

Device 0 took 3592.909912 ms

Device 1 took 3571.318115 ms

Device 2 took 2607.047607 ms

Average bidirectional bandwidth in MB/s: 12056.451416

device: 1 2

---------------------------------------------------- 

Device 1 took 1173.054077 ms

Device 2 took 1173.096313 ms

Average HtoD bandwidth in MB/s: 10911.491211

Device 1 took 1557.881958 ms

Device 2 took 1557.917847 ms

Average DtoH bandwidth in MB/s: 8216.188965

Device 1 took 2730.019531 ms

Device 2 took 2729.413330 ms

Average bidirectional bandwidth in MB/s: 9378.263672

device: 2

---------------------------------------------------- 

Device 2 took 1124.733765 ms

Average HtoD bandwidth in MB/s: 5690.235352

Device 2 took 1014.834045 ms

Average DtoH bandwidth in MB/s: 6306.449707

Device 2 took 2135.481445 ms

Average bidirectional bandwidth in MB/s: 5993.964355

I tried version 1.1 and it gave several casting errors; when I tried to fix them, it compiled, but now it segfaults.

Here is the code.

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * NOTICE TO USER:

 *

 * This source code is subject to NVIDIA ownership rights under U.S. and

 * international Copyright laws.  Users and possessors of this source code

 * are hereby granted a nonexclusive, royalty-free license to use this code

 * in individual and commercial software.

 *

 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH

 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE

 * OR PERFORMANCE OF THIS SOURCE CODE.

 *

 * U.S. Government End Users.   This source code is a "commercial item" as

 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of

 * "commercial computer  software"  and "commercial computer software

 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)

 * and is provided to the U.S. Government only as a commercial end item.

 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

 * source code with only those rights set forth herein.

 *

 * Any use of this source code in individual and commercial software must

 * include, in the user documentation and internal comments to the code,

 * the above Disclaimer and U.S. Government End Users Notice.

 */

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <pthread.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES]; 

unsigned int numDevices;

volatile unsigned int numWaiting = 0;

pthread_mutex_t lock;

pthread_cond_t condvar;

pthread_t devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

typedef union data_t

{

    float f;

    void* v;

    unsigned int ui;

} PackedType;

void* testBandwidthHtoD(void* id)

{

    PackedType arg = *(data_t*)(id);

    unsigned int devID = arg.ui;

    CUdevice dev;

    CUcontext ctx;

    CUevent start, stop;

void* loc1;

    CUdeviceptr loc2;

cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {

        printf("Creating a context with devID %u failed, aborting\n", devID);

        pthread_exit((void*)1);      

    }

    if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAllocHost failed, aborting\n");

        pthread_exit((void*)1);

    }

    if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAlloc failed, aborting\n");

        pthread_exit((void*)1);

    }

cuEventCreate(&start, 0);

    cuEventCreate(&stop, 0);

//critical section

    pthread_mutex_lock(&lock);

    ++numWaiting;

    pthread_cond_wait(&condvar, &lock);

    pthread_mutex_unlock(&lock);

cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {

        if (cuMemcpyHtoDAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {

            printf("cuMemcpyHtOD failed!\n");

        }

    }

    cuEventRecord(stop, 0);

    cuEventSynchronize(stop);

    float elapsedTime;

    cuEventElapsedTime(&elapsedTime, start, stop);

    PackedType retval;

    retval.f = elapsedTime;

    return (void*)retval.v;

}

void* testBandwidthDtoH(void* id)

{

    PackedType arg = *(data_t*)(id);

    unsigned int devID = arg.ui;

    CUdevice dev;

    CUcontext ctx;

    CUevent start, stop;

CUdeviceptr loc1;

    void* loc2;

cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {

        printf("Creating a context with devID %u failed, aborting\n", devID);

        pthread_exit((void*)1);      

    }

    if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAllocHost failed, aborting\n");

        pthread_exit((void*)1);

    }

    if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAlloc failed, aborting\n");

        pthread_exit((void*)1);

    }

cuEventCreate(&start, 0);

    cuEventCreate(&stop, 0);

//critical section

    pthread_mutex_lock(&lock);

    ++numWaiting;

    pthread_cond_wait(&condvar, &lock);

    pthread_mutex_unlock(&lock);

cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {

        if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {

            printf("cuMemcpyDtOH failed!\n");

        }

    }

    cuEventRecord(stop, 0);

    cuEventSynchronize(stop);

    float elapsedTime;

    cuEventElapsedTime(&elapsedTime, start, stop);

    PackedType retval;

    retval.f = elapsedTime;

    return (void*)retval.v;

}

/*
 * Worker thread: measures simultaneous DtoH + HtoD bandwidth on one device.
 *
 * `id` carries a device ordinal packed by value into the void* via
 * PackedType (it is NOT a pointer to heap storage).  The thread creates its
 * own context, two non-default streams, two pinned host buffers and two
 * device buffers, then rendezvous on `condvar` so that all devices begin
 * copying at the same instant.  The elapsed time in ms is packed back into
 * the void* return value for main() to unpack.
 *
 * Returns (void*)1 via pthread_exit on any setup failure.
 */
void* testBandwidthBidirectional(void* id)
{
    /* NOTE(review): unpacks the packed-by-value argument; relies on the
     * PackedType/data_t convention declared elsewhere in this file. */
    PackedType arg = *(data_t*)(id);
    unsigned int devID = arg.ui;

    CUdevice dev;
    CUcontext ctx;
    CUevent start, stop;
    CUstream stream1, stream2;
    CUdeviceptr loc1, loc3;   /* device buffers (DtoH source, HtoD destination) */
    void *loc2, *loc4;        /* pinned host buffers (DtoH destination, HtoD source) */

    cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
        /* Fixed: this path previously printed "cuStreamCreate failed". */
        printf("cuCtxCreate failed\n");
        pthread_exit((void*)1);
    }

    if (cuStreamCreate(&stream1, 0) != CUDA_SUCCESS) {
        printf("cuStreamCreate failed\n");
        pthread_exit((void*)1);
    }

    if (cuStreamCreate(&stream2, 0) != CUDA_SUCCESS) {
        printf("cuStreamCreate failed\n");
        pthread_exit((void*)1);
    }

    /* Pinned host memory is required for the async copies to actually
     * overlap; cuMemAllocHost provides page-locked allocations. */
    if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAllocHost failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAllocHost(&loc4, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAllocHost failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAlloc failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAlloc(&loc3, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAlloc failed, aborting\n");
        pthread_exit((void*)1);
    }

    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);

    /* Rendezvous: signal readiness, then block until main() broadcasts.
     * NOTE(review): there is no predicate re-checked in a loop around this
     * cond_wait, so a spurious wakeup would start this thread early.  A
     * proper fix needs a shared "go" flag set by main() -- TODO confirm. */
    pthread_mutex_lock(&lock);
    ++numWaiting;
    pthread_cond_wait(&condvar, &lock);
    pthread_mutex_unlock(&lock);

    /* Timed region: issue both directions every iteration, each on its own
     * stream, so the copies can run concurrently on dual-engine hardware. */
    cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
        if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, stream1) != CUDA_SUCCESS) {
            printf("cuMemcpyDtoHAsync failed!\n");
        }
        if (cuMemcpyHtoDAsync(loc3, loc4, MEMCOPY_SIZE, stream2) != CUDA_SUCCESS) {
            printf("cuMemcpyHtoDAsync failed!\n");
        }
    }

    cuEventRecord(stop, 0);
    /* Both streams must drain before reading the stop event, hence a full
     * context sync rather than cuEventSynchronize on one stream. */
    cuCtxSynchronize();

    float elapsedTime;
    cuEventElapsedTime(&elapsedTime, start, stop);

    /* Release everything this thread created; the original leaked all of it
     * (and leaked one context per phase per device). */
    cuEventDestroy(start);
    cuEventDestroy(stop);
    cuStreamDestroy(stream1);
    cuStreamDestroy(stream2);
    cuMemFree(loc1);
    cuMemFree(loc3);
    cuMemFreeHost(loc2);
    cuMemFreeHost(loc4);
    cuCtxDestroy(ctx);

    /* Pack the elapsed time into the returned void* (by value, not by
     * address), per this file's PackedType convention. */
    PackedType retval;
    retval.f = elapsedTime;
    return (void*)retval.v;
}

int main (int argc, char** argv)

{        

    if (argc == 1) {

        printf("usage: %s deviceID deviceID...\n", argv[0]);

        exit(1);

    }

if (cuInit(0) != CUDA_SUCCESS) {

        printf("cuInit failed, aborting...\n");

        exit(1);

    }

    for (int i = 0; i < argc - 1; i++) {

        int dev = atoi(argv[i+1]);

        CUdevice device;

        if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) {

            printf("Could not get device %d, aborting\n", dev);

            exit(1);

        }

        devices[i] = dev;

    }

    numDevices = argc - 1;

pthread_mutex_init(&lock, NULL);

    pthread_cond_init(&condvar, NULL);

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL,testBandwidthHtoD,arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

void* returnVal = 0;

    float maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime) {

            maxElapsedTime = d.f;

        }

    }

    double bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average HtoD bandwidth in MB/s: %f\n", bandwidthInMBs);

    numWaiting = 0;

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL,(testBandwidthDtoH),arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

returnVal = 0;

    maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime)

            maxElapsedTime = d.f;

    }

    bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average DtoH bandwidth in MB/s: %f\n", bandwidthInMBs);

    numWaiting = 0;

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL, testBandwidthBidirectional,arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

returnVal = 0;

    maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime)

            maxElapsedTime = d.f;

    }

    bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * 2 * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average bidirectional bandwidth in MB/s: %f\n", bandwidthInMBs);

}

Also, is there a reason this is written against the low-level driver API with pthreads rather than the higher-level CUDA runtime API?