I noticed that when multi-GPU is disabled, CUDA reports only one device for the GTX 295. Why?
Results for a GTX 295 + 9800 GTX on a 790i board are quite unimpressive:
[font=“Courier New”]Device 0 took 3056.922363 ms
Device 1 took 3056.153564 ms
Device 2 took 2621.760254 ms
Average HtoD bandwidth in MB/s: 6628.852578
Device 0 took 2920.098633 ms
Device 1 took 3001.642822 ms
Device 2 took 2021.621704 ms
Average DtoH bandwidth in MB/s: 7489.647805[/font]
Windows version of the code (I hope that NVIDIA code does not normally look like this):
[codebox]/*
-
Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
-
NOTICE TO USER:
-
This source code is subject to NVIDIA ownership rights under U.S. and
-
international Copyright laws. Users and possessors of this source code
-
are hereby granted a nonexclusive, royalty-free license to use this code
-
in individual and commercial software.
-
NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-
CODE FOR ANY PURPOSE. IT IS PROVIDED “AS IS” WITHOUT EXPRESS OR
-
IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
-
REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-
MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-
IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-
OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-
OR PERFORMANCE OF THIS SOURCE CODE.
-
U.S. Government End Users. This source code is a “commercial item” as
-
that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
-
“commercial computer software” and "commercial computer software
-
documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-
and is provided to the U.S. Government only as a commercial end item.
-
Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-
227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-
source code with only those rights set forth herein.
-
Any use of this source code in individual and commercial software must
-
include, in the user documentation and internal comments to the code,
-
the above Disclaimer and U.S. Government End Users Notice.
*/
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <windows.h>
// Number of back-to-back copies each worker thread issues inside its timed region.
#define MEMCOPY_ITERATIONS 50
// Size in bytes of each copy (and of each host/device buffer).
#define MEMCOPY_SIZE (1 << 27) // 128M
#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!
// Device IDs taken from the command line, in argument order (slot i <-> argv[i+1]).
unsigned int devices[MAX_DEVICES];
// Number of valid entries in devices[] (argc - 1).
unsigned int numDevices;
// Manual-reset "go" event: main sets it once every worker has reported ready,
// releasing all workers into their timed loops together.
HANDLE condvar;
// Worker thread handles, indexed by slot (same index as devices[]).
HANDLE devThreads[MAX_DEVICES];
// Per-slot elapsed time in milliseconds as reported by the workers.
float elapsedTimes[MAX_DEVICES];
// Auto-reset "ready" events, indexed by DEVICE ID (not by slot); each worker
// sets its own entry once its setup is done.
HANDLE backEvents[MAX_DEVICES];
// Two-way thread argument: main stores the device ID in .ui before starting a
// worker, and the worker overwrites the same storage with the elapsed time in
// milliseconds through .f before it returns.
typedef union data_t
{
float f;
unsigned int ui;
} PackedType;
// Per-worker argument storage, indexed by DEVICE ID (not by slot).
PackedType vars[MAX_DEVICES];
/*
 * Worker thread: times MEMCOPY_ITERATIONS host-to-device copies of
 * MEMCOPY_SIZE bytes on a single device.
 *
 * id points to a PackedType. On entry its .ui member holds the device ID;
 * on return its .f member holds the elapsed time in milliseconds measured
 * with CUDA events, or 0 if any step failed.
 *
 * The thread always sets backEvents[devID] once setup has finished, whether
 * or not it succeeded, so the launcher can never block forever on a worker
 * that failed early. Only a successfully set-up worker waits on condvar and
 * runs the timed loop.
 *
 * Every CUDA resource created here (events, device buffer, pinned host
 * buffer, context) is released before the thread returns.
 *
 * Returns 0 on success and 1 on failure (visible as the thread exit code).
 */
DWORD WINAPI testBandwidthHtoD(void* id)
{
    PackedType & arg = *(PackedType *)(id);
    const unsigned int devID = arg.ui;

    CUdevice dev;
    CUcontext ctx = NULL;
    CUevent start = NULL, stop = NULL;
    void* loc1 = NULL;            // pinned host source buffer
    CUdeviceptr loc2 = 0;         // device destination buffer
    bool haveDeviceMem = false;   // CUdeviceptr has no portable "null", track explicitly
    float elapsedTime = 0.f;
    bool ok = false;

    // --- setup: each step runs only if the previous one succeeded ---
    if (cuDeviceGet(&dev, devID) != CUDA_SUCCESS)
    {
        printf("cuDeviceGet failed for devID %u, aborting\n", devID);
    }
    else if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
    {
        ctx = NULL;
        printf("Creating a conext with devID %u failed, aborting\n", devID);
    }
    else if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
    {
        loc1 = NULL;
        printf("cuMemAllocHost failed, aborting\n");
    }
    else if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
    {
        printf("cuMemAlloc failed, aborting\n");
    }
    else
    {
        haveDeviceMem = true;
        if (cuEventCreate(&start, 0) != CUDA_SUCCESS)
        {
            start = NULL;
            printf("cuEventCreate failed, aborting\n");
        }
        else if (cuEventCreate(&stop, 0) != CUDA_SUCCESS)
        {
            stop = NULL;
            printf("cuEventCreate failed, aborting\n");
        }
        else
        {
            ok = true;
        }
    }

    // Report in unconditionally: the launcher waits for this event before
    // releasing the workers, and must not hang when setup failed.
    SetEvent(backEvents[devID]);

    if (ok)
    {
        // Rendezvous: all workers start their timed region together.
        WaitForSingleObject(condvar, INFINITE);

        ok = (cuEventRecord(start, 0) == CUDA_SUCCESS);
        for (int i = 0; ok && i < MEMCOPY_ITERATIONS; i++)
        {
            if (cuMemcpyHtoD(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
            {
                printf("cuMemcpyHtOD failed!\n");
                ok = false;   // a time covering failed copies would be meaningless
            }
        }
        ok = ok
          && cuEventRecord(stop, 0) == CUDA_SUCCESS
          && cuEventSynchronize(stop) == CUDA_SUCCESS
          && cuEventElapsedTime(&elapsedTime, start, stop) == CUDA_SUCCESS;
        if (!ok)
        {
            printf("Timing the HtoD copies on devID %u failed\n", devID);
        }
    }

    // --- teardown, in reverse order of creation ---
    if (stop != NULL)  cuEventDestroy(stop);
    if (start != NULL) cuEventDestroy(start);
    if (haveDeviceMem) cuMemFree(loc2);
    if (loc1 != NULL)  cuMemFreeHost(loc1);
    if (ctx != NULL)   cuCtxDestroy(ctx);

    // Overwrite the device ID with the result; 0 marks "no valid measurement".
    arg.f = ok ? elapsedTime : 0.f;
    return ok ? 0 : 1;
}
/*
 * Worker thread: times MEMCOPY_ITERATIONS device-to-host copies of
 * MEMCOPY_SIZE bytes on a single device.
 *
 * id points to a PackedType. On entry its .ui member holds the device ID;
 * on return its .f member holds the elapsed time in milliseconds measured
 * with CUDA events, or 0 if any step failed.
 *
 * The thread always sets backEvents[devID] once setup has finished, whether
 * or not it succeeded, so the launcher can never block forever on a worker
 * that failed early. Only a successfully set-up worker waits on condvar and
 * runs the timed loop.
 *
 * Every CUDA resource created here (events, device buffer, pinned host
 * buffer, context) is released before the thread returns.
 *
 * Returns 0 on success and 1 on failure (visible as the thread exit code).
 */
DWORD WINAPI testBandwidthDtoH(void* id)
{
    PackedType & arg = *(PackedType *)(id);
    const unsigned int devID = arg.ui;

    CUdevice dev;
    CUcontext ctx = NULL;
    CUevent start = NULL, stop = NULL;
    CUdeviceptr loc1 = 0;         // device source buffer
    void* loc2 = NULL;            // pinned host destination buffer
    bool haveDeviceMem = false;   // CUdeviceptr has no portable "null", track explicitly
    float elapsedTime = 0.f;
    bool ok = false;

    // --- setup: each step runs only if the previous one succeeded ---
    if (cuDeviceGet(&dev, devID) != CUDA_SUCCESS)
    {
        printf("cuDeviceGet failed for devID %u, aborting\n", devID);
    }
    else if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS)
    {
        ctx = NULL;
        printf("Creating a conext with devID %u failed, aborting\n", devID);
    }
    else if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS)
    {
        loc2 = NULL;
        printf("cuMemAllocHost failed, aborting\n");
    }
    else if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
    {
        printf("cuMemAlloc failed, aborting\n");
    }
    else
    {
        haveDeviceMem = true;
        if (cuEventCreate(&start, 0) != CUDA_SUCCESS)
        {
            start = NULL;
            printf("cuEventCreate failed, aborting\n");
        }
        else if (cuEventCreate(&stop, 0) != CUDA_SUCCESS)
        {
            stop = NULL;
            printf("cuEventCreate failed, aborting\n");
        }
        else
        {
            ok = true;
        }
    }

    // Report in unconditionally: the launcher waits for this event before
    // releasing the workers, and must not hang when setup failed.
    SetEvent(backEvents[devID]);

    if (ok)
    {
        // Rendezvous: all workers start their timed region together.
        WaitForSingleObject(condvar, INFINITE);

        ok = (cuEventRecord(start, 0) == CUDA_SUCCESS);
        for (int i = 0; ok && i < MEMCOPY_ITERATIONS; i++)
        {
            if (cuMemcpyDtoH(loc2, loc1, MEMCOPY_SIZE) != CUDA_SUCCESS)
            {
                printf("cuMemcpyDtOH failed!\n");
                ok = false;   // a time covering failed copies would be meaningless
            }
        }
        ok = ok
          && cuEventRecord(stop, 0) == CUDA_SUCCESS
          && cuEventSynchronize(stop) == CUDA_SUCCESS
          && cuEventElapsedTime(&elapsedTime, start, stop) == CUDA_SUCCESS;
        if (!ok)
        {
            printf("Timing the DtoH copies on devID %u failed\n", devID);
        }
    }

    // --- teardown, in reverse order of creation ---
    if (stop != NULL)  cuEventDestroy(stop);
    if (start != NULL) cuEventDestroy(start);
    if (haveDeviceMem) cuMemFree(loc1);
    if (loc2 != NULL)  cuMemFreeHost(loc2);
    if (ctx != NULL)   cuCtxDestroy(ctx);

    // Overwrite the device ID with the result; 0 marks "no valid measurement".
    arg.f = ok ? elapsedTime : 0.f;
    return ok ? 0 : 1;
}
int main (int argc, char** argv)
{
if (argc == 1)
{
printf("usage: %s deviceID deviceID...\n", argv[0]);
exit(1);
}
if (cuInit(0) != CUDA_SUCCESS)
{
printf("cuInit failed, aborting...\n");
exit(1);
}
for (int i = 0; i < argc - 1; i++)
{
int dev = atoi(argv[i+1]);
CUdevice device;
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS)
{
printf("Could not get device %d, aborting\n", dev);
exit(1);
}
devices[i] = dev;
}
numDevices = argc - 1;
condvar = CreateEvent(NULL, true, false, NULL);
for (int i = 0; i < numDevices; i++)
{
backEvents[devices[i]] = CreateEvent(NULL, false, false, NULL);
vars[devices[i]].ui = devices[i];
devThreads[i] = CreateThread(NULL, 0, testBandwidthHtoD, &vars[devices[i]], 0, NULL);
}
for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);
SetEvent(condvar);
void* returnVal = 0;
float maxElapsedTime = 0.f;
for (int i = 0; i < numDevices; i++)
{
WaitForSingleObject(devThreads[i], INFINITE);
PackedType d = vars[devices[i]];
printf("Device %u took %f ms\n", devices[i], d.f);
elapsedTimes[i] = d.f;
if (d.f > maxElapsedTime)
maxElapsedTime = d.f;
}
double bandwidthInMBs = 0;
for (int i = 0; i < numDevices; i++)
{
bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));
}
printf(“Average HtoD bandwidth in MB/s: %f\n”, bandwidthInMBs);
ResetEvent(condvar);
for (int i = 0; i < numDevices; i++)
{
vars[devices[i]].ui = devices[i];
devThreads[i] = CreateThread(NULL, 0, testBandwidthDtoH, &vars[devices[i]], 0, NULL);
}
for (int i = 0; i < numDevices; i++) WaitForSingleObject(backEvents[devices[i]], INFINITE);
SetEvent(condvar);
returnVal = 0;
maxElapsedTime = 0.f;
for (int i = 0; i < numDevices; i++)
{
WaitForSingleObject(devThreads[i], INFINITE);
PackedType d = vars[devices[i]];
printf("Device %u took %f ms\n", devices[i], d.f);
elapsedTimes[i] = d.f;
if (d.f > maxElapsedTime)
maxElapsedTime = d.f;
}
bandwidthInMBs = 0;
for (int i = 0; i < numDevices; i++)
{
bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));
}
printf(“Average DtoH bandwidth in MB/s: %f\n”, bandwidthInMBs);
}
[/codebox]