Well, file uploading isn’t working, so I’ll just attach 1.1 (with bidirectional bandwidth measurements) to this post.
[codebox]/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
*/
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <pthread.h>
// Number of back-to-back copies issued inside each timed region.
#define MEMCOPY_ITERATIONS 50
// Size in bytes of every host and device buffer, and of each individual copy.
#define MEMCOPY_SIZE (1 << 27) // 128M
// Capacity of the per-device arrays below (devices, devThreads, elapsedTimes).
#define MAX_DEVICES 16 //supports up to 16 devices at a time. 16 devices should be enough for anyone!
// Device ordinals taken from the command line, in argument order.
unsigned int devices[MAX_DEVICES];
// Number of valid entries in devices[] (set by main to argc - 1).
unsigned int numDevices;
// Count of worker threads that have finished setup and are about to block on
// condvar. Workers increment it while holding lock; main polls it without the
// lock (hence volatile) and resets it to 0 before each test.
volatile unsigned int numWaiting = 0;
// Mutex paired with condvar; also guards the increment of numWaiting.
pthread_mutex_t lock;
// Start signal: workers block on it after setup, main broadcasts to release them.
pthread_cond_t condvar;
// One worker thread handle per selected device, reused for each test.
pthread_t devThreads[MAX_DEVICES];
// Elapsed milliseconds reported by each worker for the most recent test.
float elapsedTimes[MAX_DEVICES];
// Overlay used to pass small values through the void* argument and void*
// return value of a pthread start routine without heap allocation:
// main stores a device ordinal in .ui and hands .v to pthread_create, and each
// worker stores its elapsed time in .f and returns .v.
typedef union data_t
{
float f;         // elapsed time in milliseconds (worker -> main)
void* v;         // the raw pointer that actually crosses the pthread API
unsigned int ui; // device ordinal (main -> worker)
} PackedType;
void* testBandwidthHtoD(void* id)
{
PackedType arg = (PackedType)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
void* loc1;
CUdeviceptr loc2;
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
printf("Creating a context with devID %u failed, aborting\n", devID);
pthread_exit((void*)1);
}
if (cuMemAllocHost(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAllocHost failed, aborting\n");
pthread_exit((void*)1);
}
if (cuMemAlloc(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAlloc failed, aborting\n");
pthread_exit((void*)1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
//critical section
pthread_mutex_lock(&lock);
++numWaiting;
pthread_cond_wait(&condvar, &lock);
pthread_mutex_unlock(&lock);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
if (cuMemcpyHtoDAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {
printf("cuMemcpyHtOD failed!\n");
}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
PackedType retval;
retval.f = elapsedTime;
return (void*)retval.v;
}
void* testBandwidthDtoH(void* id)
{
PackedType arg = (PackedType)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUdeviceptr loc1;
void* loc2;
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
printf("Creating a context with devID %u failed, aborting\n", devID);
pthread_exit((void*)1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAllocHost failed, aborting\n");
pthread_exit((void*)1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAlloc failed, aborting\n");
pthread_exit((void*)1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
//critical section
pthread_mutex_lock(&lock);
++numWaiting;
pthread_cond_wait(&condvar, &lock);
pthread_mutex_unlock(&lock);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, 0) != CUDA_SUCCESS) {
printf("cuMemcpyDtOH failed!\n");
}
}
cuEventRecord(stop, 0);
cuEventSynchronize(stop);
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
PackedType retval;
retval.f = elapsedTime;
return (void*)retval.v;
}
void* testBandwidthBidirectional(void* id)
{
PackedType arg = (PackedType)(id);
unsigned int devID = arg.ui;
CUdevice dev;
CUcontext ctx;
CUevent start, stop;
CUstream stream1, stream2;
CUdeviceptr loc1, loc3;
void* loc2, *loc4;
cuDeviceGet(&dev, devID);
if (cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev) != CUDA_SUCCESS) {
printf("cuStreamCreate failed\n");
pthread_exit((void*)1);
}
if (cuStreamCreate(&stream1, 0) != CUDA_SUCCESS) {
printf("cuStreamCreate failed\n");
pthread_exit((void*)1);
}
if (cuStreamCreate(&stream2, 0) != CUDA_SUCCESS) {
printf("cuStreamCreate failed\n");
pthread_exit((void*)1);
}
if (cuMemAllocHost(&loc2, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAllocHost failed, aborting\n");
pthread_exit((void*)1);
}
if (cuMemAllocHost(&loc4, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAllocHost failed, aborting\n");
pthread_exit((void*)1);
}
if (cuMemAlloc(&loc1, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAlloc failed, aborting\n");
pthread_exit((void*)1);
}
if (cuMemAlloc(&loc3, MEMCOPY_SIZE) != CUDA_SUCCESS) {
printf("cuMemAlloc failed, aborting\n");
pthread_exit((void*)1);
}
cuEventCreate(&start, 0);
cuEventCreate(&stop, 0);
//critical section
pthread_mutex_lock(&lock);
++numWaiting;
pthread_cond_wait(&condvar, &lock);
pthread_mutex_unlock(&lock);
cuEventRecord(start, 0);
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
if (cuMemcpyDtoHAsync(loc2, loc1, MEMCOPY_SIZE, stream1) != CUDA_SUCCESS) {
printf("cuMemcpyDtOH failed!\n");
}
if (cuMemcpyHtoDAsync(loc3, loc4, MEMCOPY_SIZE, stream2) != CUDA_SUCCESS) {
printf("cuMemcpyHtoDAsync failed!\n");
}
}
cuEventRecord(stop, 0);
cuCtxSynchronize();
float elapsedTime;
cuEventElapsedTime(&elapsedTime, start, stop);
PackedType retval;
retval.f = elapsedTime;
return (void*)retval.v;
}
int main (int argc, char** argv)
{
if (argc == 1) {
printf("usage: %s deviceID deviceID...\n", argv[0]);
exit(1);
}
if (cuInit(0) != CUDA_SUCCESS) {
printf("cuInit failed, aborting...\n");
exit(1);
}
for (int i = 0; i < argc - 1; i++) {
int dev = atoi(argv[i+1]);
CUdevice device;
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) {
printf("Could not get device %d, aborting\n", dev);
exit(1);
}
devices[i] = dev;
}
numDevices = argc - 1;
pthread_mutex_init(&lock, NULL);
pthread_cond_init(&condvar, NULL);
for (int i = 0; i < numDevices; i++) {
PackedType arg;
arg.ui = devices[i];
pthread_create(&devThreads[i], NULL, (testBandwidthHtoD),arg.v);
}
while (numWaiting != numDevices) ;
pthread_cond_broadcast(&condvar);
void* returnVal = 0;
float maxElapsedTime = 0.f;
for (int i = 0; i < numDevices; i++) {
pthread_join(devThreads[i], &returnVal);
PackedType d = (PackedType)returnVal;
printf("Device %u took %f ms\n", devices[i], d.f);
elapsedTimes[i] = d.f;
if (d.f > maxElapsedTime) {
maxElapsedTime = d.f;
}
}
double bandwidthInMBs = 0;
for (int i = 0; i < numDevices; i++) {
bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));
}
printf("Average HtoD bandwidth in MB/s: %f\n", bandwidthInMBs);
numWaiting = 0;
for (int i = 0; i < numDevices; i++) {
PackedType arg;
arg.ui = devices[i];
pthread_create(&devThreads[i], NULL, (testBandwidthDtoH),arg.v);
}
while (numWaiting != numDevices) ;
pthread_cond_broadcast(&condvar);
returnVal = 0;
maxElapsedTime = 0.f;
for (int i = 0; i < numDevices; i++) {
pthread_join(devThreads[i], &returnVal);
PackedType d = (PackedType)returnVal;
printf("Device %u took %f ms\n", devices[i], d.f);
elapsedTimes[i] = d.f;
if (d.f > maxElapsedTime)
maxElapsedTime = d.f;
}
bandwidthInMBs = 0;
for (int i = 0; i < numDevices; i++) {
bandwidthInMBs += (1e3f * MEMCOPY_SIZE * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));
}
printf("Average DtoH bandwidth in MB/s: %f\n", bandwidthInMBs);
numWaiting = 0;
for (int i = 0; i < numDevices; i++) {
PackedType arg;
arg.ui = devices[i];
pthread_create(&devThreads[i], NULL, (testBandwidthBidirectional),arg.v);
}
while (numWaiting != numDevices) ;
pthread_cond_broadcast(&condvar);
returnVal = 0;
maxElapsedTime = 0.f;
for (int i = 0; i < numDevices; i++) {
pthread_join(devThreads[i], &returnVal);
PackedType d = (PackedType)returnVal;
printf("Device %u took %f ms\n", devices[i], d.f);
elapsedTimes[i] = d.f;
if (d.f > maxElapsedTime)
maxElapsedTime = d.f;
}
bandwidthInMBs = 0;
for (int i = 0; i < numDevices; i++) {
bandwidthInMBs += (1e3f * MEMCOPY_SIZE * 2 * (float)MEMCOPY_ITERATIONS) / (elapsedTimes[i] * (float)(1 << 20));
}
printf("Average bidirectional bandwidth in MB/s: %f\n", bandwidthInMBs);
}
[/codebox]