Concurrent bandwidth test

I noticed that when multi gpu is disabled, cuda reports only 1 device for GTX295, why???

GTX295 + 9800GTX, 790i, quite unimpressive

[font=“Courier New”]Device 0 took 3056.922363 ms

Device 1 took 3056.153564 ms

Device 2 took 2621.760254 ms

Average HtoD bandwidth in MB/s: 6628.852578

Device 0 took 2920.098633 ms

Device 1 took 3001.642822 ms

Device 2 took 2021.621704 ms

Average DtoH bandwidth in MB/s: 7489.647805[/font]

Windows code (I hope that normally NVIDIA code does not look like this)

[codebox]/*

  • Copyright 1993-2009 NVIDIA Corporation. All rights reserved.

  • NOTICE TO USER:

  • This source code is subject to NVIDIA ownership rights under U.S. and

  • international Copyright laws. Users and possessors of this source code

  • are hereby granted a nonexclusive, royalty-free license to use this code

  • in individual and commercial software.

  • NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

  • CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR

  • IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH

  • REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

  • MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

  • IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

  • OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

  • OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

  • OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE

  • OR PERFORMANCE OF THIS SOURCE CODE.

  • U.S. Government End Users. This source code is a “commercial item” as

  • that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of

  • “commercial computer software” and "commercial computer software

  • documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)

  • and is provided to the U.S. Government only as a commercial end item.

  • Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

  • 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

  • source code with only those rights set forth herein.

  • Any use of this source code in individual and commercial software must

  • include, in the user documentation and internal comments to the code,

  • the above Disclaimer and U.S. Government End Users Notice.

*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <windows.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES];

unsigned int numDevices;

HANDLE condvar;

HANDLE devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

HANDLE backEvents[MAX_DEVICES];

typedef union data_t

{

float f;

unsigned int ui;

} PackedType;

PackedType vars[MAX_DEVICES];

// Thread entry: measures host-to-device copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. The thread signals backEvents[devID] once it is set up and
// then blocks on the shared manual-reset 'condvar' so that all device
// threads begin their timed copies at the same moment.
DWORD WINAPI testBandwidthHtoD(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
void* loc1;        // pinned host source buffer
CUdeviceptr loc2;  // device destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
  printf("Creating a context with devID %u failed, aborting\n", devID);
  ExitThread(1);
}
if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAllocHost failed, aborting\n");
  cuCtxDestroy(ctx);
  ExitThread(1);
}
if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAlloc failed, aborting\n");
  cuMemFreeHost(loc1);
  cuCtxDestroy(ctx);
  ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
  if (cuMemcpyHtoD(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
  {
    printf("cuMemcpyHtoD failed!\n");
  }
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// Release everything we acquired (the original leaked the buffers, the
// events, and the context).
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc2);
cuMemFreeHost(loc1);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures device-to-host copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Uses the backEvents/condvar rendezvous so all device
// threads start their timed copies together.
DWORD WINAPI testBandwidthDtoH(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUdeviceptr loc1;  // device source buffer
void* loc2;        // pinned host destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
  printf("Creating a context with devID %u failed, aborting\n", devID);
  ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAllocHost failed, aborting\n");
  cuCtxDestroy(ctx);
  ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
  printf("cuMemAlloc failed, aborting\n");
  cuMemFreeHost(loc2);
  cuCtxDestroy(ctx);
  ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
  if (cuMemcpyDtoH(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
  {
    printf("cuMemcpyDtoH failed!\n");
  }
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// Release everything we acquired (the original leaked the buffers, the
// events, and the context).
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc1);
cuMemFreeHost(loc2);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

int main (int argc, char** argv)

{

if (argc == 1)

{

  printf("usage: %s deviceID deviceID...\n", argv[0]);

  exit(1);

}

if (cuInit(0) != CUDA_SUCCESS)

{

  printf("cuInit failed, aborting...\n");

  exit(1);

}

for (int i = 0; i < argc - 1; i++)

{

  int dev = atoi(argv[i+1]);

  CUdevice device;

  if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)

{

  printf("Could not get device %d, aborting\n", dev);

  exit(1);

}

  devices[i] = dev;

}

numDevices = argc - 1;

condvar = CreateEvent(NULL, true, false, NULL);

for (int i = 0; i < numDevices; i++)

{

  backEvents[devices[i]] = CreateEvent(NULL, false, false, NULL);

  vars[devices[i]].ui = devices[i];

  devThreads[i] = CreateThread(NULL, 0, testBandwidthHtoD, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

void* returnVal = 0;

float maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

PackedType d = vars[devices[i]];

  printf("Device %u took %f ms\n", devices[i], d.f);

  elapsedTimes[i] = d.f;

  if (d.f > maxElapsedTime)

maxElapsedTime = d.f;

}

double bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

  bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

}

printf(“Average HtoD bandwidth in MB/s: %f\n”, bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++)

{

  vars[devices[i]].ui = devices[i];

  devThreads[i] = CreateThread(NULL, 0, testBandwidthDtoH, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

  WaitForSingleObject(devThreads[i], INFINITE);

  PackedType d = vars[devices[i]];

  printf("Device %u took %f ms\n", devices[i], d.f);

  elapsedTimes[i] = d.f;

  if (d.f > maxElapsedTime)

maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

}

printf(“Average DtoH bandwidth in MB/s: %f\n”, bandwidthInMBs);

}

[/codebox]

Driver code is much, much cleaner–I just write hacky things in my spare time to stress things that I care about.

Are you sure that the computation of bandwidth is correct?

To me it seems like it should be

(data1 + data2 + … + datan) / max time

and not

data1/time1 + data2/time2 + … + datan/timen

Polished source code for Windows:

[codebox]/*

  • Copyright 1993-2009 NVIDIA Corporation. All rights reserved.

  • NOTICE TO USER:

  • This source code is subject to NVIDIA ownership rights under U.S. and

  • international Copyright laws. Users and possessors of this source code

  • are hereby granted a nonexclusive, royalty-free license to use this code

  • in individual and commercial software.

  • NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

  • CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR

  • IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH

  • REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

  • MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

  • IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

  • OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

  • OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

  • OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE

  • OR PERFORMANCE OF THIS SOURCE CODE.

  • U.S. Government End Users. This source code is a “commercial item” as

  • that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of

  • “commercial computer software” and "commercial computer software

  • documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)

  • and is provided to the U.S. Government only as a commercial end item.

  • Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

  • 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

  • source code with only those rights set forth herein.

  • Any use of this source code in individual and commercial software must

  • include, in the user documentation and internal comments to the code,

  • the above Disclaimer and U.S. Government End Users Notice.

*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <windows.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES];

unsigned int devices_r[MAX_DEVICES];

unsigned int numDevices;

HANDLE condvar;

HANDLE devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

HANDLE backEvents[MAX_DEVICES];

typedef union data_t

{

float f;

unsigned int ui;

} PackedType;

PackedType vars[MAX_DEVICES];

// Thread entry: measures host-to-device copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Signals backEvents[devID] when set up, then waits on the
// shared manual-reset 'condvar' so all device threads start together.
DWORD WINAPI testBandwidthHtoD(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
void* loc1;        // pinned host source buffer
CUdeviceptr loc2;  // device destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAllocHost failed, aborting\n");
	cuCtxDestroy(ctx);
	ExitThread(1);
}
if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAlloc failed, aborting\n");
	cuMemFreeHost(loc1);
	cuCtxDestroy(ctx);
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
	if (cuMemcpyHtoD(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
	{
		printf("cuMemcpyHtoD failed!\n");
	}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// The original freed the buffers but leaked the events and the context.
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc2);
cuMemFreeHost(loc1);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures device-to-host copy bandwidth on one CUDA device.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// measured elapsed time in milliseconds is written back through the same
// union as .f. Uses the backEvents/condvar rendezvous so all device
// threads start their timed copies together.
DWORD WINAPI testBandwidthDtoH(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUdeviceptr loc1;  // device source buffer
void* loc2;        // pinned host destination buffer
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
{
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAllocHost failed, aborting\n");
	cuCtxDestroy(ctx);
	ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
{
	printf("cuMemAlloc failed, aborting\n");
	cuMemFreeHost(loc2);
	cuCtxDestroy(ctx);
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
	if (cuMemcpyDtoH(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
	{
		printf("cuMemcpyDtoH failed!\n");
	}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
// The original freed the buffers but leaked the events and the context.
cuEventDestroy(start);
cuEventDestroy(stop);
cuMemFree(loc1);
cuMemFreeHost(loc2);
cuCtxDestroy(ctx);
arg.f = elapsedTime;
return 0;
}

// Thread entry: measures aggregate bidirectional bandwidth on one CUDA
// device by issuing a DtoH and an HtoD async copy on two separate streams
// each iteration, synchronizing the context after every pair.
// 'id' points at a PackedType whose .ui holds the device ordinal; the
// elapsed time in milliseconds is written back through the same union as .f.
DWORD WINAPI testBandwidthBidirectional(void* id)
{
PackedType & arg = *(PackedType *)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUstream stream1, stream2;
CUdeviceptr loc1, loc3;  // device buffers: DtoH source, HtoD destination
void* loc2, *loc4;       // pinned host buffers: DtoH dest, HtoD source
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
	// The original printed "cuStreamCreate failed" here — wrong call name.
	printf("Creating a context with devID %u failed, aborting\n", devID);
	ExitThread(1);
}
if (cuStreamCreate(&stream1, 0) != CUDA_SUCCESS) {
	printf("cuStreamCreate failed\n");
	ExitThread(1);
}
if (cuStreamCreate(&stream2, 0) != CUDA_SUCCESS) {
	printf("cuStreamCreate failed\n");
	ExitThread(1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAllocHost failed, aborting\n");
	ExitThread(1);
}
if (cuMemAllocHost(&loc4, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAllocHost failed, aborting\n");
	ExitThread(1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAlloc failed, aborting\n");
	ExitThread(1);
}
if (cuMemAlloc(&loc3, MEMCOPY_SIZE) != CUDA_SUCCESS) {
	printf("cuMemAlloc failed, aborting\n");
	ExitThread(1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
// Rendezvous: tell main we are ready, then wait for the global "go" signal.
SetEvent(backEvents[devID]);
WaitForSingleObject(condvar, INFINITE);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
	if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, stream1) != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed!\n");
	}
	if (cuMemcpyHtoDAsync(loc3, loc4, MEMCOPY_SIZE, stream2) != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed!\n");
	}
	// Drain both streams so each iteration overlaps exactly one copy pair.
	cuCtxSynchronize();
}
cuEventRecord(stop, 0);
cuCtxSynchronize();
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
arg.f = elapsedTime;
// Release everything acquired (streams, events, and context leaked before).
cuEventDestroy(start);
cuEventDestroy(stop);
cuStreamDestroy(stream1);
cuStreamDestroy(stream2);
cuMemFree(loc1);
cuMemFree(loc3);
cuMemFreeHost(loc2);
cuMemFreeHost(loc4);
cuCtxDestroy(ctx);
return 0;
}

int main (int argc, char** argv)

{

if (argc == 1)

{

	printf("usage: %s deviceID deviceID...\n", argv[0]);

	exit(1);

}

if (cuInit(0) != CUDA_SUCCESS)

{

	printf("cuInit failed, aborting...\n");

	exit(1);

}

int j = 0;

for (int i = 0; i < argc - 1; i++)

{

	int dev = atoi(argv[i+1]);

	CUdevice device;

	if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)

	{

		printf("Could not get device %d, aborting\n", dev);

		exit(1);

	}

	if(dev < 16 && devices_r[dev] == 0)

	{

		devices[j] = dev;

		devices_r[dev] = i+1;

		j++;

	} 

}

numDevices = j;

condvar = CreateEvent(NULL, true, false, NULL);

for (int i = 0; i < numDevices; i++)

{

	backEvents[devices[i]] = CreateEvent(NULL, false, false, NULL);

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthHtoD, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

void* returnVal = 0;

float maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

double bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average HtoD bandwidth in MB/s: %f\n", bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++)

{

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthDtoH, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++)

{

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++)

{

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average DtoH bandwidth in MB/s: %f\n", bandwidthInMBs);

ResetEvent(condvar);

for (int i = 0; i < numDevices; i++) {

	vars[devices[i]].ui = devices[i];

	devThreads[i] = CreateThread(NULL, 0, testBandwidthBidirectional, &vars[devices[i]], 0, NULL);

}

for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);

SetEvent(condvar);

returnVal = 0;

maxElapsedTime = 0.f;

for (int i = 0; i < numDevices; i++) {

	WaitForSingleObject(devThreads[i], INFINITE);

	PackedType d = vars[devices[i]];

	printf("Device %u took %f ms\n", devices[i], d.f);

	elapsedTimes[devices[i]] = d.f;

	if (d.f > maxElapsedTime)

		maxElapsedTime = d.f;

}

bandwidthInMBs = 0;

for (int i = 0; i < numDevices; i++) {

	bandwidthInMBs += (1e3f * MEMCOPY_SIZE * 2 * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[devices[i]] * (float)(1 << 20));

}

printf("Average bidirectional bandwidth in MB/s: %f\n", bandwidthInMBs);

}

[/codebox]

So, for those of us without machines yet, but planning to take the plunge soon, what’s the take home lesson from all this?

What I’ve gathered is that using one card is fine, but using two at once saturates the PCIe controller (or something in that area) creating a bottleneck. Does this apply to AMD mobos, Intel mobos, etc? Specifically how does this play out for 790FX/890FX vs. nForce 980a vs. X58 solutions? If I’m concerned about being bandwidth limited, should I shell out the extra $$$ for a X58 /Core i7 solution vs AMD or nVidia’s offerings?

Regards,
Martin

Get the following message when we try to run it for a C2050 card:
Creating a conext with devID 0 failed, aborting

I also would be interested in the answer to this question. Additionally would somebody kindly tell me what kind of bandwidth I can get if I want to do the following?

  1. Transfer 8 50 MB or 100 MB chunks from non-overlapping parts of the host memory to 8 different GPU boards? Would they go in parallel? Would increasing the chunk sizes help? Is there an optimal or minimum chunk size?= to achieve the peak transfer rates?

  2. What about transferring between 4 pairs of GPU boards?

TIA

Anil

Tylersburg + GTX295:

PCI-E Bandwidth test: Device 0

---------------------------------------------------- 

Device 0 took 1129.994263 ms

Average Host-to-Device bandwidth in MB/s: 5663.745570

Device 0 took 1195.939331 ms

Average Device-to-Host bandwidth in MB/s: 5351.442029

Device 0 took 2408.072998 ms

Average bi-directional bandwidth in MB/s: 5315.453481

PCI-E Bandwidth test: Device 1 3

---------------------------------------------------- 

Device 1 took 1126.713745 ms

Device 3 took 1138.142090 ms

Average Host-to-Device bandwidth in MB/s: 11303.435494

Device 1 took 1446.144409 ms

Device 3 took 1432.187256 ms

Average Device-to-Host bandwidth in MB/s: 8894.250188

Device 1 took 2439.152344 ms

Device 3 took 2419.308594 ms

Average bi-directional bandwidth in MB/s: 10538.492497

PCI-E Bandwidth test: Device 0 1 2

---------------------------------------------------- 

Device 0 took 2244.541016 ms

Device 1 took 2247.074219 ms

Device 2 took 1131.505371 ms

Average Host-to-Device bandwidth in MB/s: 11355.692213

Device 0 took 2392.018066 ms

Device 1 took 2459.899902 ms

Device 2 took 1433.580200 ms

Average Device-to-Host bandwidth in MB/s: 9741.644330

Device 0 took 36331.773438 ms

Device 1 took 36270.406250 ms

Device 2 took 35048.425781 ms

Average bi-directional bandwidth in MB/s: 1070.422476

PCI-E Bandwidth test: Device 0 1 2 3

---------------------------------------------------- 

Device 0 took 2249.097412 ms

Device 1 took 2232.135986 ms

Device 2 took 2251.639404 ms

Device 3 took 2251.724854 ms

Average Host-to-Device bandwidth in MB/s: 11397.433759

Device 0 took 3327.489014 ms

Device 1 took 4061.572266 ms

Device 2 took 4162.840332 ms

Device 3 took 3516.382324 ms

Average Device-to-Host bandwidth in MB/s: 6856.580913

Device 0 took 64065.414063 ms

Device 1 took 64817.562500 ms

Device 2 took 64863.097656 ms

Device 3 took 64817.652344 ms

Average bi-directional bandwidth in MB/s: 792.088911

Tylersburg + GTX295:

PCI-E Bandwidth test: Device 0

---------------------------------------------------- 

Device 0 took 1129.994263 ms

Average Host-to-Device bandwidth in MB/s: 5663.745570

Device 0 took 1195.939331 ms

Average Device-to-Host bandwidth in MB/s: 5351.442029

Device 0 took 2408.072998 ms

Average bi-directional bandwidth in MB/s: 5315.453481

PCI-E Bandwidth test: Device 1 3

---------------------------------------------------- 

Device 1 took 1126.713745 ms

Device 3 took 1138.142090 ms

Average Host-to-Device bandwidth in MB/s: 11303.435494

Device 1 took 1446.144409 ms

Device 3 took 1432.187256 ms

Average Device-to-Host bandwidth in MB/s: 8894.250188

Device 1 took 2439.152344 ms

Device 3 took 2419.308594 ms

Average bi-directional bandwidth in MB/s: 10538.492497

PCI-E Bandwidth test: Device 0 1 2

---------------------------------------------------- 

Device 0 took 2244.541016 ms

Device 1 took 2247.074219 ms

Device 2 took 1131.505371 ms

Average Host-to-Device bandwidth in MB/s: 11355.692213

Device 0 took 2392.018066 ms

Device 1 took 2459.899902 ms

Device 2 took 1433.580200 ms

Average Device-to-Host bandwidth in MB/s: 9741.644330

Device 0 took 36331.773438 ms

Device 1 took 36270.406250 ms

Device 2 took 35048.425781 ms

Average bi-directional bandwidth in MB/s: 1070.422476

PCI-E Bandwidth test: Device 0 1 2 3

---------------------------------------------------- 

Device 0 took 2249.097412 ms

Device 1 took 2232.135986 ms

Device 2 took 2251.639404 ms

Device 3 took 2251.724854 ms

Average Host-to-Device bandwidth in MB/s: 11397.433759

Device 0 took 3327.489014 ms

Device 1 took 4061.572266 ms

Device 2 took 4162.840332 ms

Device 3 took 3516.382324 ms

Average Device-to-Host bandwidth in MB/s: 6856.580913

Device 0 took 64065.414063 ms

Device 1 took 64817.562500 ms

Device 2 took 64863.097656 ms

Device 3 took 64817.652344 ms

Average bi-directional bandwidth in MB/s: 792.088911

Gigabyte X58A-UD9 (2 x nForce 200), i7-980, 3 x GTX580

device: 0 1 2

---------------------------------------------------- 

Device 0 took 2297.387207 ms

Device 1 took 2302.033447 ms

Device 2 took 1166.566772 ms

Average HtoD bandwidth in MB/s: 11052.108154

Device 0 took 2554.130615 ms

Device 1 took 2496.386963 ms

Device 2 took 1561.133179 ms

Average DtoH bandwidth in MB/s: 9169.035889

Device 0 took 3592.909912 ms

Device 1 took 3571.318115 ms

Device 2 took 2607.047607 ms

Average bidirectional bandwidth in MB/s: 12056.451416

device: 1 2

---------------------------------------------------- 

Device 1 took 1173.054077 ms

Device 2 took 1173.096313 ms

Average HtoD bandwidth in MB/s: 10911.491211

Device 1 took 1557.881958 ms

Device 2 took 1557.917847 ms

Average DtoH bandwidth in MB/s: 8216.188965

Device 1 took 2730.019531 ms

Device 2 took 2729.413330 ms

Average bidirectional bandwidth in MB/s: 9378.263672

device: 2

---------------------------------------------------- 

Device 2 took 1124.733765 ms

Average HtoD bandwidth in MB/s: 5690.235352

Device 2 took 1014.834045 ms

Average DtoH bandwidth in MB/s: 6306.449707

Device 2 took 2135.481445 ms

Average bidirectional bandwidth in MB/s: 5993.964355

Gigabyte X58A-UD9 (2 x nForce 200), i7-980, 3 x GTX580

device: 0 1 2

---------------------------------------------------- 

Device 0 took 2297.387207 ms

Device 1 took 2302.033447 ms

Device 2 took 1166.566772 ms

Average HtoD bandwidth in MB/s: 11052.108154

Device 0 took 2554.130615 ms

Device 1 took 2496.386963 ms

Device 2 took 1561.133179 ms

Average DtoH bandwidth in MB/s: 9169.035889

Device 0 took 3592.909912 ms

Device 1 took 3571.318115 ms

Device 2 took 2607.047607 ms

Average bidirectional bandwidth in MB/s: 12056.451416

device: 1 2

---------------------------------------------------- 

Device 1 took 1173.054077 ms

Device 2 took 1173.096313 ms

Average HtoD bandwidth in MB/s: 10911.491211

Device 1 took 1557.881958 ms

Device 2 took 1557.917847 ms

Average DtoH bandwidth in MB/s: 8216.188965

Device 1 took 2730.019531 ms

Device 2 took 2729.413330 ms

Average bidirectional bandwidth in MB/s: 9378.263672

device: 2

---------------------------------------------------- 

Device 2 took 1124.733765 ms

Average HtoD bandwidth in MB/s: 5690.235352

Device 2 took 1014.834045 ms

Average DtoH bandwidth in MB/s: 6306.449707

Device 2 took 2135.481445 ms

Average bidirectional bandwidth in MB/s: 5993.964355

I tried version 1.1 and it gave several casting errors; when I tried to fix them, it compiled, but now it segfaults.

Here is the code.

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * NOTICE TO USER:

 *

 * This source code is subject to NVIDIA ownership rights under U.S. and

 * international Copyright laws.  Users and possessors of this source code

 * are hereby granted a nonexclusive, royalty-free license to use this code

 * in individual and commercial software.

 *

 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH

 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF

 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.

 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,

 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS

 * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE

 * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE

 * OR PERFORMANCE OF THIS SOURCE CODE.

 *

 * U.S. Government End Users.   This source code is a "commercial item" as

 * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of

 * "commercial computer  software"  and "commercial computer software

 * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)

 * and is provided to the U.S. Government only as a commercial end item.

 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through

 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the

 * source code with only those rights set forth herein.

 *

 * Any use of this source code in individual and commercial software must

 * include, in the user documentation and internal comments to the code,

 * the above Disclaimer and U.S. Government End Users Notice.

 */

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <pthread.h>

#define MEMCOPY_ITERATIONS 50

#define MEMCOPY_SIZE (1 << 27) // 128M

#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!

unsigned int devices[MAX_DEVICES]; 

unsigned int numDevices;

volatile unsigned int numWaiting = 0;

pthread_mutex_t lock;

pthread_cond_t condvar;

pthread_t devThreads[MAX_DEVICES];

float elapsedTimes[MAX_DEVICES];

typedef union data_t

{

    float f;

    void* v;

    unsigned int ui;

} PackedType;

void* testBandwidthHtoD(void* id)

{

    PackedType arg = *(data_t*)(id);

    unsigned int devID = arg.ui;

    CUdevice dev;

    CUcontext ctx;

    CUevent start, stop;

void* loc1;

    CUdeviceptr loc2;

cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {

        printf("Creating a context with devID %u failed, aborting\n", devID);

        pthread_exit((void*)1);      

    }

    if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAllocHost failed, aborting\n");

        pthread_exit((void*)1);

    }

    if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAlloc failed, aborting\n");

        pthread_exit((void*)1);

    }

cuEventCreate(&start, 0);

    cuEventCreate(&stop, 0);

//critical section

    pthread_mutex_lock(&lock);

    ++numWaiting;

    pthread_cond_wait(&condvar, &lock);

    pthread_mutex_unlock(&lock);

cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {

        if (cuMemcpyHtoDAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {

            printf("cuMemcpyHtOD failed!\n");

        }

    }

    cuEventRecord(stop, 0);

    cuEventSynchronize(stop);

    float elapsedTime;

    cuEventElapsedTime(&elapsedTime, start, stop);

    PackedType retval;

    retval.f = elapsedTime;

    return (void*)retval.v;

}

void* testBandwidthDtoH(void* id)

{

    PackedType arg = *(data_t*)(id);

    unsigned int devID = arg.ui;

    CUdevice dev;

    CUcontext ctx;

    CUevent start, stop;

CUdeviceptr loc1;

    void* loc2;

cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {

        printf("Creating a context with devID %u failed, aborting\n", devID);

        pthread_exit((void*)1);      

    }

    if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAllocHost failed, aborting\n");

        pthread_exit((void*)1);

    }

    if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {

        printf("cuMemAlloc failed, aborting\n");

        pthread_exit((void*)1);

    }

cuEventCreate(&start, 0);

    cuEventCreate(&stop, 0);

//critical section

    pthread_mutex_lock(&lock);

    ++numWaiting;

    pthread_cond_wait(&condvar, &lock);

    pthread_mutex_unlock(&lock);

cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {

        if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {

            printf("cuMemcpyDtOH failed!\n");

        }

    }

    cuEventRecord(stop, 0);

    cuEventSynchronize(stop);

    float elapsedTime;

    cuEventElapsedTime(&elapsedTime, start, stop);

    PackedType retval;

    retval.f = elapsedTime;

    return (void*)retval.v;

}

/*
 * Worker thread: measures simultaneous DtoH + HtoD bandwidth on one device.
 *
 * `id` carries a device ordinal packed by value into the void* via
 * PackedType (it is NOT a pointer to heap storage).  The thread creates its
 * own context, two non-default streams, two pinned host buffers and two
 * device buffers, then rendezvous on `condvar` so that all devices begin
 * copying at the same instant.  The elapsed time in ms is packed back into
 * the void* return value for main() to unpack.
 *
 * Returns (void*)1 via pthread_exit on any setup failure.
 */
void* testBandwidthBidirectional(void* id)
{
    /* NOTE(review): unpacks the packed-by-value argument; relies on the
     * PackedType/data_t convention declared elsewhere in this file. */
    PackedType arg = *(data_t*)(id);
    unsigned int devID = arg.ui;

    CUdevice dev;
    CUcontext ctx;
    CUevent start, stop;
    CUstream stream1, stream2;
    CUdeviceptr loc1, loc3;   /* device buffers (DtoH source, HtoD destination) */
    void *loc2, *loc4;        /* pinned host buffers (DtoH destination, HtoD source) */

    cuDeviceGet(&dev, devID);

    if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
        /* Fixed: this path previously printed "cuStreamCreate failed". */
        printf("cuCtxCreate failed\n");
        pthread_exit((void*)1);
    }

    if (cuStreamCreate(&stream1, 0) != CUDA_SUCCESS) {
        printf("cuStreamCreate failed\n");
        pthread_exit((void*)1);
    }

    if (cuStreamCreate(&stream2, 0) != CUDA_SUCCESS) {
        printf("cuStreamCreate failed\n");
        pthread_exit((void*)1);
    }

    /* Pinned host memory is required for the async copies to actually
     * overlap; cuMemAllocHost provides page-locked allocations. */
    if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAllocHost failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAllocHost(&loc4, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAllocHost failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAlloc failed, aborting\n");
        pthread_exit((void*)1);
    }

    if (cuMemAlloc(&loc3, MEMCOPY_SIZE) != CUDA_SUCCESS) {
        printf("cuMemAlloc failed, aborting\n");
        pthread_exit((void*)1);
    }

    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);

    /* Rendezvous: signal readiness, then block until main() broadcasts.
     * NOTE(review): there is no predicate re-checked in a loop around this
     * cond_wait, so a spurious wakeup would start this thread early.  A
     * proper fix needs a shared "go" flag set by main() -- TODO confirm. */
    pthread_mutex_lock(&lock);
    ++numWaiting;
    pthread_cond_wait(&condvar, &lock);
    pthread_mutex_unlock(&lock);

    /* Timed region: issue both directions every iteration, each on its own
     * stream, so the copies can run concurrently on dual-engine hardware. */
    cuEventRecord(start, 0);

    for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
        if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, stream1) != CUDA_SUCCESS) {
            printf("cuMemcpyDtoHAsync failed!\n");
        }
        if (cuMemcpyHtoDAsync(loc3, loc4, MEMCOPY_SIZE, stream2) != CUDA_SUCCESS) {
            printf("cuMemcpyHtoDAsync failed!\n");
        }
    }

    cuEventRecord(stop, 0);
    /* Both streams must drain before reading the stop event, hence a full
     * context sync rather than cuEventSynchronize on one stream. */
    cuCtxSynchronize();

    float elapsedTime;
    cuEventElapsedTime(&elapsedTime, start, stop);

    /* Release everything this thread created; the original leaked all of it
     * (and leaked one context per phase per device). */
    cuEventDestroy(start);
    cuEventDestroy(stop);
    cuStreamDestroy(stream1);
    cuStreamDestroy(stream2);
    cuMemFree(loc1);
    cuMemFree(loc3);
    cuMemFreeHost(loc2);
    cuMemFreeHost(loc4);
    cuCtxDestroy(ctx);

    /* Pack the elapsed time into the returned void* (by value, not by
     * address), per this file's PackedType convention. */
    PackedType retval;
    retval.f = elapsedTime;
    return (void*)retval.v;
}

int main (int argc, char** argv)

{        

    if (argc == 1) {

        printf("usage: %s deviceID deviceID...\n", argv[0]);

        exit(1);

    }

if (cuInit(0) != CUDA_SUCCESS) {

        printf("cuInit failed, aborting...\n");

        exit(1);

    }

    for (int i = 0; i < argc - 1; i++) {

        int dev = atoi(argv[i+1]);

        CUdevice device;

        if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) {

            printf("Could not get device %d, aborting\n", dev);

            exit(1);

        }

        devices[i] = dev;

    }

    numDevices = argc - 1;

pthread_mutex_init(&lock, NULL);

    pthread_cond_init(&condvar, NULL);

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL,testBandwidthHtoD,arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

void* returnVal = 0;

    float maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime) {

            maxElapsedTime = d.f;

        }

    }

    double bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average HtoD bandwidth in MB/s: %f\n", bandwidthInMBs);

    numWaiting = 0;

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL,(testBandwidthDtoH),arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

returnVal = 0;

    maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime)

            maxElapsedTime = d.f;

    }

    bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average DtoH bandwidth in MB/s: %f\n", bandwidthInMBs);

    numWaiting = 0;

for (int i = 0; i < numDevices; i++) {

        PackedType arg;

        arg.ui = devices[i];

        pthread_create(&devThreads[i], NULL, testBandwidthBidirectional,arg.v);

    }

while (numWaiting != numDevices) ;

    pthread_cond_broadcast(&condvar);

returnVal = 0;

    maxElapsedTime = 0.f;

    for (int i = 0; i < numDevices; i++) {

        pthread_join(devThreads[i], &returnVal);

        PackedType d = *(PackedType*)returnVal;

        printf("Device %u took %f ms\n", devices[i], d.f);

        elapsedTimes[i] = d.f;

        if (d.f > maxElapsedTime)

            maxElapsedTime = d.f;

    }

    bandwidthInMBs = 0;

    for (int i = 0; i < numDevices; i++) {

        bandwidthInMBs += (1e3f * MEMCOPY_SIZE * 2 * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));

    }

    printf("Average bidirectional bandwidth in MB/s: %f\n", bandwidthInMBs);

}

Also, is there a reason this is written against the low-level driver API with pthreads rather than the higher-level CUDA runtime API?