Can't get any concurrency on simple vector add across multi-GPU and streams

I have built a small test case to better understand asynchronous data transfer and execution.

The problem is that I have tried every possible combination to achieve concurency:

loop_over(memcpy,kernel,memcpy).

loop_over(memcpy) then loop_over(kernel)then loop_over(memcpy).

And still I can’t get any concurrency on both multi-GPU AND streams in each GPU.

Multi-GPU works great and they are concurrent,but I can’t get it to overlap computation and data-transfer.

I am attaching my code so you can help me:

#include <stdio.h>

#include <stdlib.h>

// #include <cutil_inline.h>

#include <cuda.h>

#include <my_cuda_lib/my_lib.cu>

#include <my_cuda_lib/timer.h>

#include "helpers.cu"

#define DEBUG

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < N)

    c[idx] = a[idx] + b[idx];

}

// Allocate generic memory with malloc() and pin it laster instead of using cudaHostAlloc()

int main(int argc, char *argv[])

{

int nstreams_per_device=atoi(argv[2]);

  int n, nelem;

 int numGPUs;

cudaGetDeviceCount(&numGPUs);

printf("CUDA-capable device count: %i\n", numGPUs);

cudaStream_t  streams_array[numGPUs*nstreams_per_device];

TGPUplan * plan;

plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));

printf("plans created successfully \n");

unsigned int flags;

  size_t bytes;

  float *a, *b, *c;                      // Pinned memory allocated on the CPU

             // Device pointers for mapped memory  

#ifdef DEBUG

  float errorNorm, refNorm, ref, diff;

#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;

printf("Total number of elements in each of the three arrays : %d \n",nelem);

  bytes = nelem*sizeof(float);

printf("Total number of bytes in each of the three arrays : %d \n",bytes);

  timer total_time;

int ntiled=nelem/numGPUs;

int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;

  MY_SAFE_CALL(cudaMallocHost(&a, bytes));

  MY_SAFE_CALL(cudaMallocHost(&b, bytes));

  MY_SAFE_CALL(cudaMallocHost(&c, bytes));

float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)

  {

    a[n] = rand() / (float)RAND_MAX;

    b[n] = rand() / (float)RAND_MAX;

  }

printf("init arrays \n");

  /* Get the device pointers for the pinned CPU memory mapped into the GPU

     memory space. */

timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */ 

 for(int i = 0; i < numGPUs; i++){

for (int j=0;j<nstreams_per_device;j++){

	//for each stream in the device we create a stream

// 	cudaStream_t * stream_ptr=&((plan[i].streams_array[i]));

        MY_SAFE_CALL(cudaStreamCreate(&streams_array[i*nstreams_per_device+j]) );

        }

}

dim3 block(256);

  dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));

timer kernel_timer;

 for (int j=0;j<nstreams_per_device;j++){

for(int i = 0; i < numGPUs; i++){

        MY_SAFE_CALL( cudaSetDevice(i) );

 MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));

}

}

 for (int j=0;j<nstreams_per_device;j++){

for(int i = 0; i < numGPUs; i++){

//  myCudaGetLastError();

 MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));

}

}

for (int j=0;j<nstreams_per_device;j++){

for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

  vectorAddGPU<<<grid, block>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);

}

}

// myCudaGetLastError();

 for (int j=0;j<nstreams_per_device;j++){

for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

  MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j] ));

}

}

float kernel_time=kernel_timer.milliseconds_elapsed();

//   /* Compare the results */

//  for ( int i=0; i<numGPUs; i++ ) 

// {

// MY_SAFE_CALL( cudaSetDevice(i) );

//  cudaDeviceSynchronize();

// // cudaStreamSynchronize(streams_array[i]);

// }

#ifdef DEBUG

  printf("Checking the results...\n");

  errorNorm = 0.f;

  refNorm = 0.f;

  for(n = 0; n < nelem; n++)

  {

    ref = a[n] + b[n];

    diff = c[n] - ref;

    errorNorm += diff*diff;

    refNorm += ref*ref;

  }

  errorNorm = (float)sqrt((double)errorNorm);

  refNorm = (float)sqrt((double)refNorm);

#endif

  /* Memory clean up */

timer gpu_free_timer;

    MY_SAFE_CALL(cudaFreeHost(a));

    MY_SAFE_CALL(cudaFreeHost(b));

    MY_SAFE_CALL(cudaFreeHost(c));

float total_gpu_free=gpu_free_timer.milliseconds_elapsed();

float total_gpu_time=total_gpu_timer.milliseconds_elapsed();

float total_time_ms=total_time.milliseconds_elapsed();

printf("total time elapsed was %f \n",total_time_ms);

printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);

printf("total kernel time elapsed was %f \n",kernel_time);

printf("total gpu free time elapsed was %f \n",total_gpu_free);

printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG

  printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");

#endif

}

You can substitute my_safe_call by:

#define my_safe_call(x) x

I have red:

http://www.pgroup.com/lit/articles/insider/v3n1a4.htm

as well as the webinar but nothing seems to work.

Any help would be appreciated.

Apostolis

Hi,
What is clearly missing here is to define the stream to attach your kernel invocation to. The lack of stream in you kernel call parameters means that they are in stream #0, which is fully synchronous. Change this to put “vectorAddGPU<<<grid, block, stream[i]>>>(plan[i].d_a+j*ntiled2,plan[i]” with one single stream per device.
See this and this for more details.

If you do not use streams all the calls are on the so called default streams and they are execute in serial order not concurrently.

cudaStream_t stream[nstr]; // defien stremas

    for (int is = 0; is < nstr; is++)

    {

 // cuda set device and allocations, stream creations

    cudaMalloc(&dev_overlap[is],sizeof(int)); 

cudaStreamCreate(&stream[is]);

    }

// execution on streams

			for(int ist=0;ist<nstr;ist++)

			{

    			      jxyz[ist].x=jump*(2.0*genrand64_real2()-1.0);

			      jxyz[ist].y=jump*(2.0*genrand64_real2()-1.0);

			      jxyz[ist].z=jump*(2.0*genrand64_real2()-1.0);

			

			      atom_i[ist]=round((Np-1)*genrand64_real2());

			

			      rnd[ist]=genrand64_real2();

			

			      newMCenergyarray<<<grid,blocks,0,stream[ist]>>>(d....); 

			}

A call without the stream (newMCenergyarray<<<grid,blocks>>>(d…); ) will block all streams.

That was a minor typo from last night.

I corrected it and still no concurrency.

Here is the updated code:

#include <stdio.h>

#include <stdlib.h>

// #include <cutil_inline.h>

#include <cuda.h>

#include <my_cuda_lib/my_lib.cu>

#include <my_cuda_lib/timer.h>

#include "helpers.cu"

// #define DEBUG

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < N)

    c[idx] = a[idx] + b[idx];

}

// Allocate generic memory with malloc() and pin it laster instead of using cudaHostAlloc()

int main(int argc, char *argv[])

{

int nstreams_per_device=atoi(argv[2]);

  int n, nelem;

 int numGPUs;

cudaGetDeviceCount(&numGPUs);

printf("CUDA-capable device count: %i\n", numGPUs);

cudaStream_t  streams_array[numGPUs*nstreams_per_device];

TGPUplan * plan;

plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));

printf("plans created successfully \n");

unsigned int flags;

  size_t bytes;

  float *a, *b, *c;                      // Pinned memory allocated on the CPU

             // Device pointers for mapped memory  

#ifdef DEBUG

  float errorNorm, refNorm, ref, diff;

#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;

printf("Total number of elements in each of the three arrays : %d \n",nelem);

  bytes = nelem*sizeof(float);

printf("Total number of bytes in each of the three arrays : %d \n",bytes);

  timer total_time;

int ntiled=nelem/numGPUs;

int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;

  MY_SAFE_CALL(cudaMallocHost(&a, bytes));

  MY_SAFE_CALL(cudaMallocHost(&b, bytes));

  MY_SAFE_CALL(cudaMallocHost(&c, bytes));

float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)

  {

    a[n] = rand() / (float)RAND_MAX;

    b[n] = rand() / (float)RAND_MAX;

  }

printf("init arrays \n");

  /* Get the device pointers for the pinned CPU memory mapped into the GPU

     memory space. */

timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */ 

 for(int i = 0; i < numGPUs; i++){

for (int j=0;j<nstreams_per_device;j++){

	//for each stream in the device we create a stream

	cudaStream_t * stream_ptr=&((streams_array[i*nstreams_per_device+j]));

        MY_SAFE_CALL(cudaStreamCreate(stream_ptr ));

        }

}

dim3 block(256);

  dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));

timer kernel_timer;

 for (int j=0;j<nstreams_per_device;j++){

for(int i = 0; i < numGPUs; i++){

        MY_SAFE_CALL( cudaSetDevice(i) );

 MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));

MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j] ));

vectorAddGPU<<<grid, block,0,streams_array[i*nstreams_per_device+j]>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);

MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j] ));

}

}

float kernel_time=kernel_timer.milliseconds_elapsed();

/* Compare the results */

  for ( int i=0; i<numGPUs; i++ ) 

 {

 MY_SAFE_CALL( cudaSetDevice(i) );

  cudaDeviceSynchronize();

// // cudaStreamSynchronize(streams_array[i]);

 }

#ifdef DEBUG

  printf("Checking the results...\n");

  errorNorm = 0.f;

  refNorm = 0.f;

  for(n = 0; n < nelem; n++)

  {

    ref = a[n] + b[n];

    diff = c[n] - ref;

    errorNorm += diff*diff;

    refNorm += ref*ref;

  }

  errorNorm = (float)sqrt((double)errorNorm);

  refNorm = (float)sqrt((double)refNorm);

#endif

  /* Memory clean up */

timer gpu_free_timer;

    MY_SAFE_CALL(cudaFreeHost(a));

    MY_SAFE_CALL(cudaFreeHost(b));

    MY_SAFE_CALL(cudaFreeHost(c));

float total_gpu_free=gpu_free_timer.milliseconds_elapsed();

float total_gpu_time=total_gpu_timer.milliseconds_elapsed();

float total_time_ms=total_time.milliseconds_elapsed();

printf("total time elapsed was %f \n",total_time_ms);

printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);

printf("total kernel time elapsed was %f \n",kernel_time);

printf("total gpu free time elapsed was %f \n",total_gpu_free);

printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG

  printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");

#endif

}

Any further ideas?

Thanks again,

Apostolis

It looks good. I suggest to start with simpler codes to see that you got the streams working.

The cudaDeviceSynchronize(); makes no sense in the loop. Shouldn’t be outside?

/* Compare the results */

  for ( int i=0; i<numGPUs; i++ ) 

 {

 MY_SAFE_CALL( cudaSetDevice(i) );

  cudaDeviceSynchronize();

// // cudaStreamSynchronize(streams_array[i]);

 }

Just another little fix.

The code now works correctly but still no concurrency:

__global__ void vectorAddGPU(float *a, float *b, float *c, int N)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

  if (idx < N)

    c[idx] = a[idx] + b[idx];

}

// Allocate generic memory with malloc() and pin it laster instead of using cudaHostAlloc()

int main(int argc, char *argv[])

{

int nstreams_per_device=atoi(argv[2]);

  int n, nelem;

 int numGPUs;

cudaGetDeviceCount(&numGPUs);

printf("CUDA-capable device count: %i\n", numGPUs);

cudaStream_t  streams_array[numGPUs*nstreams_per_device];

TGPUplan * plan;

plan=(TGPUplan *)malloc(numGPUs*sizeof(TGPUplan));

printf("plans created successfully \n");

unsigned int flags;

  size_t bytes;

  float *a, *b, *c;                      // Pinned memory allocated on the CPU

             // Device pointers for mapped memory  

#ifdef DEBUG

  float errorNorm, refNorm, ref, diff;

#endif

/* Allocate mapped CPU memory. */

nelem = atoi(argv[1])*1048576/4;

printf("Total number of elements in each of the three arrays : %d \n",nelem);

  bytes = nelem*sizeof(float);

printf("Total number of bytes in each of the three arrays : %d \n",bytes);

  timer total_time;

int ntiled=nelem/numGPUs;

int ntiled2=ntiled/nstreams_per_device;

timer gpu_malloc_timer;

  MY_SAFE_CALL(cudaMallocHost(&a, bytes));

  MY_SAFE_CALL(cudaMallocHost(&b, bytes));

  MY_SAFE_CALL(cudaMallocHost(&c, bytes));

float total_gpu_malloc=gpu_malloc_timer.milliseconds_elapsed();

for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_a, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_b, ntiled*sizeof(float)));

   MY_SAFE_CALL(cudaMalloc(&plan[i].d_c, ntiled*sizeof(float)));

}

/* Initialize the vectors. */

for(n = 0; n < nelem; n++)

  {

    a[n] = rand() / (float)RAND_MAX;

    b[n] = rand() / (float)RAND_MAX;

  }

printf("init arrays \n");

  /* Get the device pointers for the pinned CPU memory mapped into the GPU

     memory space. */

timer total_gpu_timer;

/* Call the GPU kernel using the device pointers for the mapped memory. */ 

 for(int i = 0; i < numGPUs; i++){

MY_SAFE_CALL( cudaSetDevice(i) );

for (int j=0;j<nstreams_per_device;j++){

        //for each stream in the device we create a stream

        cudaStream_t * stream_ptr=&((streams_array[i*nstreams_per_device+j]));

        MY_SAFE_CALL(cudaStreamCreate(stream_ptr ));

        }

}

dim3 block(256);

  dim3 grid((unsigned int)ceil(ntiled2/(float)block.x));

timer kernel_timer;

for(int i = 0; i < numGPUs; i++){

  MY_SAFE_CALL( cudaSetDevice(i) );     

 for (int j=0;j<nstreams_per_device;j++){

MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_a+j*ntiled2,a+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j]

 ));

MY_SAFE_CALL(cudaMemcpyAsync( plan[i].d_b+j*ntiled2,b+i*ntiled+j*ntiled2 , ntiled2*sizeof(float), cudaMemcpyHostToDevice,streams_array[i*nstreams_per_device+j]

 ));

vectorAddGPU<<<grid, block,0,streams_array[i*nstreams_per_device+j]>>>(plan[i].d_a+j*ntiled2,plan[i].d_b+j*ntiled2,plan[i].d_c+j*ntiled2, ntiled2);

MY_SAFE_CALL(cudaMemcpyAsync( c+i*ntiled+j*ntiled2, plan[i].d_c+j*ntiled2, ntiled2*sizeof(float), cudaMemcpyDeviceToHost,streams_array[i*nstreams_per_device+j]

 ));

}

}

float kernel_time=kernel_timer.milliseconds_elapsed();

/* Compare the results */

  for ( int i=0; i<numGPUs; i++ ) 

 {

 MY_SAFE_CALL( cudaSetDevice(i) );

  cudaDeviceSynchronize();

// // cudaStreamSynchronize(streams_array[i]);

 }

#ifdef DEBUG

  printf("Checking the results...\n");

  errorNorm = 0.f;

  refNorm = 0.f;

  for(n = 0; n < nelem; n++)

  {

    ref = a[n] + b[n];

    diff = c[n] - ref;

    errorNorm += diff*diff;

    refNorm += ref*ref;

  }

  errorNorm = (float)sqrt((double)errorNorm);

  refNorm = (float)sqrt((double)refNorm);

#endif

  /* Memory clean up */

timer gpu_free_timer;

    MY_SAFE_CALL(cudaFreeHost(a));

    MY_SAFE_CALL(cudaFreeHost(b));

    MY_SAFE_CALL(cudaFreeHost(c));

float total_gpu_free=gpu_free_timer.milliseconds_elapsed();

float total_gpu_time=total_gpu_timer.milliseconds_elapsed();

float total_time_ms=total_time.milliseconds_elapsed();

printf("total time elapsed was %f \n",total_time_ms);

printf("total mallocGPU time elapsed was %f \n",total_gpu_malloc);

printf("total kernel time elapsed was %f \n",kernel_time);

printf("total gpu free time elapsed was %f \n",total_gpu_free);

printf("total gpu time elapsed was %f \n",total_gpu_time+kernel_time+total_gpu_free);

#ifdef DEBUG

  printf("%s\n", (errorNorm/refNorm < 1.e-6f) ? "PASSED" : "FAILED");

#endif

}

This is what I get when I run the profiler:

Also when there are two memcpy’s in the same direction I get a lot less bandwith than expected.
Bandwith of H2D to the first GPU is 5GB/s and when there is a H2D transfer to the second GPU at the same time the bandwidth to the second GPU is 2.2 GB/s.
When there is only a H2D transfer to the second GPU the bandwidth is still 5GB/s.

Also DeviceSynchronize() is in a loop because I have to synchronize 2 GPUs not just one.
Am I doing something wrong here?

In order to figure out what is going wrong I created a small test case with a single GPU by modifying the SDK example.

I can get 2-way concurrency(kernel+d2h)but not three-way(h2d+kernel+d2h).

I am attaching the code that produces 2-way concurrency.

If I use MemcpyAsync for the H2D transfers the results are correct but the whole process get serialized.

The same thing happened In the CUBLAS code from the webinar,I don’t know why.

Any ideas?

I am terribly buffled by this.please note that in the multi-GPU case it isn’t practical to use 2-way concurrency since I have to wait for the H2D transfer to complete before I change context.

Thanks in advance,

Apostolis

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 *

 *

 * This sample illustrates the usage of CUDA streams for overlapping

 * kernel execution with device/host memcopies.  The kernel is used to 

 * initialize an array to a specific value, after which the array is 

 * copied to the host (CPU) memory.  To increase performance, multiple 

 * kernel/memcopy pairs are launched asynchronously, each pair in its 

 * own stream.  Devices with Compute Capability 1.1 can overlap a kernel

 * and a memcopy as long as they are issued in different streams.  Kernels

 * are serialized.  Thus, if n pairs are launched, streamed approach

 * can reduce the memcopy cost to the (1/n)th of a single copy of the entire

 * data set.

 *

 * Additionally, this sample uses CUDA events to measure elapsed time for

 * CUDA calls.  Events are a part of CUDA API and provide a system independent

 * way to measure execution times on CUDA devices with approximately 0.5 

 * microsecond precision.

 *

 * Elapsed times are averaged over nreps repetitions (10 by default).

 *

*/

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = 

{ 

	"cudaEventDefault", 

	"cudaEventBlockingSync", 

	"cudaEventDisableTiming", 

	NULL 

};

const char *sDeviceSyncMethod[] = 

{ 

	"cudaDeviceScheduleAuto", 

	"cudaDeviceScheduleSpin", 

	"cudaDeviceScheduleYield", 

	"INVALID", 

	"cudaDeviceScheduleBlockingSync", 

	NULL 

};

// Include headers

#include <stdio.h>

// CUDA utilities and system includes

#include <cuda_runtime.h>

// Shared Library Test Functions

#include <sdkHelper.h>  // helper for shared that are common to CUDA SDK samples

#include <shrUtils.h>

#include <shrQATest.h>

#include <my_cuda_lib/my_lib.cu>

#ifndef WIN32

#include <sys/mman.h> // for mmap() / munmap()

#endif

////////////////////////////////////////////////////////////////////////////////

// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error

#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )

{

    if(cudaSuccess != err)

    {

        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );

        exit(-1);        

    }

}

// This will output the proper error string when calling cudaGetLastError

#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)

inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )

{

    cudaError_t err = cudaGetLastError();

    if (cudaSuccess != err)

    {

        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",

        file, line, errorMessage, (int)err, cudaGetErrorString( err ) );

        exit(-1);

    }

}

// General GPU Device CUDA Initialization

int gpuDeviceInit(int devID)

{

    int deviceCount;

    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

if (deviceCount == 0)

    {

        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");

        exit(-1);

    }

if (devID < 0)

       devID = 0;

if (devID > deviceCount-1)

    {

        fprintf(stderr, "\n");

        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);

        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);

        fprintf(stderr, "\n");

        return -devID;

    }

cudaDeviceProp deviceProp;

    checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );

if (deviceProp.major < 1)

    {

        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");

        exit(-1);                                                  

    }

checkCudaErrors( cudaSetDevice(devID) );

    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

return devID;

}

// This function returns the best GPU (with maximum GFLOPS)

int gpuGetMaxGflopsDeviceId()

{

    int current_device     = 0, sm_per_multiproc  = 0;

    int max_compute_perf   = 0, max_perf_device   = 0;

    int device_count       = 0, best_SM_arch      = 0;

    cudaDeviceProp deviceProp;

    cudaGetDeviceCount( &device_count );

// Find the best major SM Architecture GPU device

    while (current_device < device_count)

    {

        cudaGetDeviceProperties( &deviceProp, current_device );

        if (deviceProp.major > 0 && deviceProp.major < 9999)

        {

            best_SM_arch = MAX(best_SM_arch, deviceProp.major);

        }

        current_device++;

    }

// Find the best CUDA capable GPU device

    current_device = 0;

    while( current_device < device_count )

    {

        cudaGetDeviceProperties( &deviceProp, current_device );

        if (deviceProp.major == 9999 && deviceProp.minor == 9999)

        {

            sm_per_multiproc = 1;

        }

        else

        {

            sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);

        }

int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

if( compute_perf  > max_compute_perf )

    {

            // If we find GPU with SM major > 2, search only these

            if ( best_SM_arch > 2 )

            {

                // If our device==dest_SM_arch, choose this, or else pass

                if (deviceProp.major == best_SM_arch)

                {

                    max_compute_perf  = compute_perf;

                    max_perf_device   = current_device;

                 }

            }

            else

            {

                max_compute_perf  = compute_perf;

                max_perf_device   = current_device;

             }

        }

        ++current_device;

    }

    return max_perf_device;

}

// Initialization code to find the best CUDA Device

int findCudaDevice(int argc, const char **argv)

{

    cudaDeviceProp deviceProp;

    int devID = 0;

    // If the command-line has a device number specified, use it

    if (checkCmdLineFlag(argc, argv, "device"))

    {

        devID = getCmdLineArgumentInt(argc, argv, "device=");

        if (devID < 0)

        {

            printf("Invalid command line parameter\n ");

            exit(-1);

        }

        else

        {

            devID = gpuDeviceInit(devID);

            if (devID < 0)

            {

                printf("exiting...\n");

                shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

                exit(-1);

            }

        }

    }

    else

    {

        // Otherwise pick the device with highest Gflops/s

        devID = gpuGetMaxGflopsDeviceId();

        checkCudaErrors( cudaSetDevice( devID ) );

        checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );

        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    }

    return devID;

}

// end of CUDA Helper Functions

// Macro to aligned up to the memory size in question

#define MEMORY_ALIGNMENT  4096

#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) )

__global__ void vectorAddGPU(int *a, int *b, int *c, int N,int nIterations)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

// for( int i=0;i<nIterations;i++){  

if (idx < N){

c[idx] = a[idx] + b[idx];}

}

// }

int correct_data(int *a,int*b,int*c, const int n)

 {

     for(int i = 0; i < n; i++) {

         if(c[i] != a[i]+b[i]) {

            printf("%d: %d %d\n", i, a[i], c);

            return 0;

         }

     }

     return 1;

 }

inline void 

AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)

{

#if CUDART_VERSION >= 4000

    if (bPinGenericMemory)

    {

        // allocate a generic page-aligned chunk of system memory

    #ifdef WIN32

        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);

        *pp_a = (int *) VirtualAlloc( NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE );

    #else

        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);

        *pp_a = (int *) mmap        ( NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0 );

    #endif

*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated system memory\n", (float)nbytes/1048576.0f);

        // pin allocate memory

        checkCudaErrors( cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped) );

    }

    else

#endif

    {

        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes/1048576.0f);

        // allocate host memory (pinned is required for achieve asynchronicity)

        checkCudaErrors( cudaMallocHost((void**)pp_a, nbytes) ); 

        *ppAligned_a = *pp_a; 

    }

}

inline void

FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)

{

#if CUDART_VERSION >= 4000

    // CUDA 4.0 support pinning of generic host memory

    if (bPinGenericMemory)

    {

        // unpin and delete host memory

        checkCudaErrors( cudaHostUnregister(*ppAligned_a) );

  #ifdef WIN32

        VirtualFree(*pp_a, 0, MEM_RELEASE);

  #else

        munmap(*pp_a, nbytes);

  #endif

    }

    else

#endif

    {

        cudaFreeHost(*pp_a);

    }

}

static char *sSyncMethod[] = 

{

    "0 (Automatic Blocking)",

    "1 (Spin Blocking)",

    "2 (Yield Blocking)",

    "3 (Undefined Blocking Method)",

    "4 (Blocking Sync Event) = low CPU utilization",

    NULL

};

void printHelp()

{

    printf("Usage: %s [options below]\n", sSDKsample);

    printf("\t--sync_method=n for CPU/GPU synchronization\n");

    printf("\t             n=%s\n", sSyncMethod[0]);

    printf("\t             n=%s\n", sSyncMethod[1]);

    printf("\t             n=%s\n", sSyncMethod[2]);

    printf("\t   <Default> n=%s\n", sSyncMethod[4]);

    printf("\t--use_generic_memory (default) use generic page-aligned for system memory\n");

    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate system memory\n");

}

#if defined(__APPLE__) || defined(MACOSX)

#define DEFAULT_PINNED_GENERIC_MEMORY false

#else

#define DEFAULT_PINNED_GENERIC_MEMORY true

#endif

int main(int argc, char **argv)

{

    int cuda_device = 0;

    int nstreams = 4;               // number of streams for CUDA calls

    int nreps = 1;                 // number of times each experiment is repeated

    int n = 16 * 1024 * 1024;       // number of ints in the data set

    int nbytes = n * sizeof(int);   // number of data bytes

    dim3 threads, blocks;           // kernel launch configuration

    float elapsed_time, time_memcpy, time_kernel;   // timing variables

    float scale_factor = 1.0f;

// allocate generic memory and pin it laster instead of using cudaHostAlloc()

bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior

    int  device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync

int niterations;	// number of iterations for the loop inside the kernel

shrQAStart(argc, argv);

printf("[ %s ]\n\n", sSDKsample);

    if( checkCmdLineFlag( argc, (const char **)argv, "help") ) {

        printHelp();

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

    }

    if( (device_sync_method = getCmdLineArgumentInt( argc, (const char **)argv, "sync_method" )) >= 0 ) {

        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {

            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);

            printf("Setting reps to %d to demonstrate steady state\n",nreps);

            nreps = 100;

        } else {

            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);

            shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

        }

	} else {

        printHelp();

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

	}

if( checkCmdLineFlag( argc, (const char **)argv, "use_generic_memory") ) {

#if defined(__APPLE__) || defined(MACOSX)

        bPinGenericMemory = false;  // Generic Pinning of System Paged memory not currently supported on Mac OSX 

#else

        bPinGenericMemory = true;

#endif

    }

    if( checkCmdLineFlag( argc, (const char **)argv, "use_cuda_malloc_host") ) {

        bPinGenericMemory = false;

    }

printf("\n> ");

    cuda_device = findCudaDevice(argc, (const char **)argv);

// check the compute capability of the device

    int num_devices=0;

    checkCudaErrors( cudaGetDeviceCount(&num_devices) );

    if(0==num_devices)

    {

        printf("your system does not have a CUDA capable device, waiving test...\n");

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

    }

	

    // check if the command-line chosen device ID is within range, exit if not

    if( cuda_device >= num_devices )

    {

        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices-1);

        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

    }

cudaSetDevice( cuda_device );

// Checking for compute capabilities

    cudaDeviceProp deviceProp;

    checkCudaErrors( cudaGetDeviceProperties(&deviceProp, cuda_device) );

    if( (1 == deviceProp.major) && (deviceProp.minor < 1)) {

        printf("%s does not have Compute Capability 1.1 or newer.  Reducing workload.\n", deviceProp.name);

    }

if(deviceProp.major >= 2) {

        niterations = 100;

    } else {		    

        if(deviceProp.minor > 1) {

            niterations = 5;

        } else {

            niterations = 1; // reduced workload for compute capability 1.0 and 1.1

        }

    }

// Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false

    if (bPinGenericMemory) {

       printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

       if (deviceProp.canMapHostMemory == 0) {

          printf("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");

          bPinGenericMemory = false;

       }

    }

// Anything that is less than 32 Cores will have scaled down workload

    scale_factor = max((32.0f / (ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount) ), 1.0f);

    n = (int)rint( (float)n / scale_factor );

printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);

    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n", 

            deviceProp.multiProcessorCount,

            ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),

            ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

printf("> scale_factor = %1.4f\n", 1.0f/scale_factor);

    printf("> array_size   = %d\n\n", n);

// enable use of blocking sync, to reduce CPU usage

    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);

    cudaSetDeviceFlags( device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0 ) );

// allocate host memory

    int c = 5;                      // value to which the array will be initialized

    int *h_a = 0;

    int *h_b = 0;// pointer to the array data in host memory

    int *h_c = 0;

    int *hAligned_a = 0;

    int *hAligned_b = 0;

    int *hAligned_c = 0;

    // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

// Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if using the new CUDA 4.0 features

    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    AllocateHostMemory(bPinGenericMemory, &h_b, &hAligned_b, nbytes);

    AllocateHostMemory(bPinGenericMemory, &h_c, &hAligned_c, nbytes);

      for(int i = 0; i < n; i++)

  {

    h_a[i] = rand() / (int)RAND_MAX;

    h_b[i] = rand() / (int)RAND_MAX;

  }

    // allocate device memory

    int *d_a = 0, *d_c = 0,*d_b=0;             // pointers to data and init value in the device memory

   MY_SAFE_CALL(cudaMalloc(&d_a, nbytes));

   MY_SAFE_CALL(cudaMalloc(&d_b, nbytes));

   MY_SAFE_CALL(cudaMalloc(&d_c, nbytes));

     checkCudaErrors( cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice) );

printf("\nStarting Test\n");

// allocate and initialize an array of stream handles

    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));

    for(int i = 0; i < nstreams; i++) {

        checkCudaErrors( cudaStreamCreate(&(streams[i])) );

    }

// create CUDA event handles

    // use blocking sync

    cudaEvent_t start_event, stop_event;

    int eventflags = ( (device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync: cudaEventDefault );

checkCudaErrors( cudaEventCreateWithFlags(&start_event, eventflags) );

    checkCudaErrors( cudaEventCreateWithFlags(&stop_event, eventflags) );

// time memcopy from device

    cudaEventRecord(start_event, 0);     // record in stream-0, to ensure that all previous CUDA calls have completed

    cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaMemcpyAsync(hAligned_b, d_b, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaMemcpyAsync(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);   // block until the event is actually recorded

    checkCudaErrors( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );

    printf("memcopy:\t%.2f\n", time_memcpy);

// time kernel

    threads=dim3(512, 1);

    blocks=dim3(n / threads.x, 1);

    cudaEventRecord(start_event, 0);

    vectorAddGPU<<<blocks, threads, 0, streams[0]>>>(d_a,d_b, d_c, n,nreps);

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );

    printf("kernel:\t\t%.2f\n", time_kernel);

//////////////////////////////////////////////////////////////////////

    // time non-streamed execution for reference

    threads=dim3(512, 1);

    blocks=dim3(n / threads.x, 1);

    cudaEventRecord(start_event, 0);

	cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);

	cudaMemcpy(d_b, h_b, nbytes, cudaMemcpyHostToDevice); 

        vectorAddGPU<<<blocks, threads>>>(d_a,d_b, d_c, n,nreps);

         cudaMemcpy(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost);

cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

    printf("non-streamed:\t%.2f (%.2f expected)\n", elapsed_time, time_kernel + time_memcpy);

//////////////////////////////////////////////////////////////////////

    // time execution with nstreams streams

    threads=dim3(512,1);

    blocks=dim3(n/(nstreams*threads.x),1);

//     memset(hAligned_a, 255, nbytes);     // set host memory bits to all 1s, for testing correctness

//     cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness

    cudaEventRecord(start_event, 0);

	cudaMemcpy(d_a,h_a, nbytes, cudaMemcpyHostToDevice);

  	cudaMemcpy(d_b,h_b, nbytes , cudaMemcpyHostToDevice);

        // asynchronously launch nstreams kernels, each operating on its own portion of data

        for(int i = 0; i < nstreams; i++){

//  	cudaMemcpyAsync(d_a + i * n / nstreams,h_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

//  	cudaMemcpyAsync(d_b + i * n / nstreams,h_b + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

        vectorAddGPU<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_b + i * n / nstreams,d_c + i * n / nstreams, n/nstreams,nreps);

	cudaMemcpyAsync(h_c + i * n / nstreams, d_c + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);

// asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only

        //   commence executing when all previous CUDA calls in stream x have completed

//         for(int i = 0; i < nstreams; i++)

}

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

    printf("%d streams:\t%.2f (%.2f expected with compute capability 1.1 or later)\n", nstreams, elapsed_time , time_kernel + time_memcpy / nstreams);

// check whether the output is correct

    printf("-------------------------------\n");

    bool bResults = correct_data(hAligned_a,hAligned_b,h_c, n);

// release resources

    for(int i = 0; i < nstreams; i++) {

        cudaStreamDestroy(streams[i]);

    }

    cudaEventDestroy(start_event);

    cudaEventDestroy(stop_event);

// Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)

    FreeHostMemory( bPinGenericMemory, &h_a, &hAligned_a, nbytes );

cudaFree(d_a);

    cudaFree(d_c);

cudaDeviceReset();

    shrQAFinishExit(argc, (const char **)argv, bResults ? QA_PASSED : QA_FAILED);

}

Which GPU are you using for your test?
Only recent Tesla and Quadro have two DMA engines to overlap I/O and compute in both directions.

NVIDIA GTS450.and for the Multi-GPU case it is coupled with a GT430.
Is the dual DMA a Tesla feature or do normal desktop GPUs have it also?

Geforce cards have a single DMA.

Which means that I can only overlap kernel execution with data transfer?
Or can I overlap device-2-host with device-2-device but not with a kernel at the same time?

That has been really helpful.

Is this mentioned anywhere in the programming guide?

On Geforce , you can overlap kernel execution with data transfer in one direction.

It should be in the programming guide, not sure where.

Or use a kernel that transfers data in the other direction via zerocopy.

If it is a hardware limitation that shouldn’t work either.

If it works it means that it is just a software limitation.

I will try it.

BTW i checked the programming guide.
The only think that HINTED it might be a problem is where it said I should look into asynchEngineCount to be 2 to get complete overlap.
No mention of GeForce vs Tesla limitation.

asynchEngineCount is 2 on Fermi-based Teslas and 1 on Geforce cards.

There is no hardware limitation preventing simultaneous bidirectional transfers. But on Geforce cards there is only one DMA engine, so the transfer in the other direction needs to be performed by a kernel.

You are correct.

I created a costum kernel to do the transfer and it seems that with a GeForce card you can get up to 2-way concurrency,this time I got D-2-H and H-2-D concurrency but no kernel concurrency.

Here Is the code.

Any additions or further tests you would like to see are mostly welcome.

I tried the dummy-Event trick but it didn’t seem to be of any help.

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 *

 *

 * This sample illustrates the usage of CUDA streams for overlapping

 * kernel execution with device/host memcopies.  The kernel is used to 

 * initialize an array to a specific value, after which the array is 

 * copied to the host (CPU) memory.  To increase performance, multiple 

 * kernel/memcopy pairs are launched asynchronously, each pair in its 

 * own stream.  Devices with Compute Capability 1.1 can overlap a kernel

 * and a memcopy as long as they are issued in different streams.  Kernels

 * are serialized.  Thus, if n pairs are launched, streamed approach

 * can reduce the memcopy cost to the (1/n)th of a single copy of the entire

 * data set.

 *

 * Additionally, this sample uses CUDA events to measure elapsed time for

 * CUDA calls.  Events are a part of CUDA API and provide a system independent

 * way to measure execution times on CUDA devices with approximately 0.5 

 * microsecond precision.

 *

 * Elapsed times are averaged over nreps repetitions (10 by default).

 *

*/

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = 

{ 

	"cudaEventDefault", 

	"cudaEventBlockingSync", 

	"cudaEventDisableTiming", 

	NULL 

};

const char *sDeviceSyncMethod[] = 

{ 

	"cudaDeviceScheduleAuto", 

	"cudaDeviceScheduleSpin", 

	"cudaDeviceScheduleYield", 

	"INVALID", 

	"cudaDeviceScheduleBlockingSync", 

	NULL 

};

// Include headers

#include <stdio.h>

// CUDA utilities and system includes

#include <cuda_runtime.h>

// Shared Library Test Functions

#include <sdkHelper.h>  // helper for shared that are common to CUDA SDK samples

#include <shrUtils.h>

#include <shrQATest.h>

#include <my_cuda_lib/my_lib.cu>

#ifndef WIN32

#include <sys/mman.h> // for mmap() / munmap()

#endif

////////////////////////////////////////////////////////////////////////////////

// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error

#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )

{

    if(cudaSuccess != err)

    {

        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );

        exit(-1);        

    }

}

// This will output the proper error string when calling cudaGetLastError

#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)

inline void __getLastCudaError(const char *errorMessage, const char *file, const int line )

{

    cudaError_t err = cudaGetLastError();

    if (cudaSuccess != err)

    {

        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",

        file, line, errorMessage, (int)err, cudaGetErrorString( err ) );

        exit(-1);

    }

}

// General GPU Device CUDA Initialization

int gpuDeviceInit(int devID)

{

    int deviceCount;

    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

if (deviceCount == 0)

    {

        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");

        exit(-1);

    }

if (devID < 0)

       devID = 0;

if (devID > deviceCount-1)

    {

        fprintf(stderr, "\n");

        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);

        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);

        fprintf(stderr, "\n");

        return -devID;

    }

cudaDeviceProp deviceProp;

    checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );

if (deviceProp.major < 1)

    {

        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");

        exit(-1);                                                  

    }

checkCudaErrors( cudaSetDevice(devID) );

    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

return devID;

}

// This function returns the best GPU (with maximum GFLOPS)

int gpuGetMaxGflopsDeviceId()

{

    int current_device     = 0, sm_per_multiproc  = 0;

    int max_compute_perf   = 0, max_perf_device   = 0;

    int device_count       = 0, best_SM_arch      = 0;

    cudaDeviceProp deviceProp;

    cudaGetDeviceCount( &device_count );

// Find the best major SM Architecture GPU device

    while (current_device < device_count)

    {

        cudaGetDeviceProperties( &deviceProp, current_device );

        if (deviceProp.major > 0 && deviceProp.major < 9999)

        {

            best_SM_arch = MAX(best_SM_arch, deviceProp.major);

        }

        current_device++;

    }

// Find the best CUDA capable GPU device

    current_device = 0;

    while( current_device < device_count )

    {

        cudaGetDeviceProperties( &deviceProp, current_device );

        if (deviceProp.major == 9999 && deviceProp.minor == 9999)

        {

            sm_per_multiproc = 1;

        }

        else

        {

            sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);

        }

int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

if( compute_perf  > max_compute_perf )

    {

            // If we find GPU with SM major > 2, search only these

            if ( best_SM_arch > 2 )

            {

                // If our device==dest_SM_arch, choose this, or else pass

                if (deviceProp.major == best_SM_arch)

                {

                    max_compute_perf  = compute_perf;

                    max_perf_device   = current_device;

                 }

            }

            else

            {

                max_compute_perf  = compute_perf;

                max_perf_device   = current_device;

             }

        }

        ++current_device;

    }

    return max_perf_device;

}

// Initialization code to find the best CUDA Device

int findCudaDevice(int argc, const char **argv)

{

    cudaDeviceProp deviceProp;

    int devID = 0;

    // If the command-line has a device number specified, use it

    if (checkCmdLineFlag(argc, argv, "device"))

    {

        devID = getCmdLineArgumentInt(argc, argv, "device=");

        if (devID < 0)

        {

            printf("Invalid command line parameter\n ");

            exit(-1);

        }

        else

        {

            devID = gpuDeviceInit(devID);

            if (devID < 0)

            {

                printf("exiting...\n");

                shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

                exit(-1);

            }

        }

    }

    else

    {

        // Otherwise pick the device with highest Gflops/s

        devID = gpuGetMaxGflopsDeviceId();

        checkCudaErrors( cudaSetDevice( devID ) );

        checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) );

        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    }

    return devID;

}

// end of CUDA Helper Functions

// Macro to aligned up to the memory size in question

#define MEMORY_ALIGNMENT  4096

#define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) )

template<typename T>

__global__ void copyKernel(T *dest, T *src, int Nelements)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

// for( int i=0;i<nIterations;i++){  

if (idx < Nelements){

dest[idx] = src[idx];}

}

// }

__global__ void vectorAddGPU(int *a, int *b, int *c, int N,int nIterations)

{

  int idx = blockIdx.x*blockDim.x + threadIdx.x;

// for( int i=0;i<nIterations;i++){  

if (idx < N){

c[idx] = a[idx] + b[idx];}

}

 int correct_data(int *a,int*b,int*c, const int n)

 {

     for(int i = 0; i < n; i++) {

         if(c[i] != a[i]+b[i]) {

            printf("%d: %d %d\n", i, a[i], c);

            return 0;

         }

     }

     return 1;

 }

inline void 

AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)

{

#if CUDART_VERSION >= 4000

    if (bPinGenericMemory)

    {

        // allocate a generic page-aligned chunk of system memory

    #ifdef WIN32

        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);

        *pp_a = (int *) VirtualAlloc( NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE );

    #else

        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system memory)\n", (float)nbytes/1048576.0f);

        *pp_a = (int *) mmap        ( NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0 );

    #endif

*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated system memory\n", (float)nbytes/1048576.0f);

        // pin allocate memory

        checkCudaErrors( cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped) );

    }

    else

#endif

    {

        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes/1048576.0f);

        // allocate host memory (pinned is required for achieve asynchronicity)

        checkCudaErrors( cudaMallocHost((void**)pp_a, nbytes) ); 

        *ppAligned_a = *pp_a; 

    }

}

inline void

FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)

{

#if CUDART_VERSION >= 4000

    // CUDA 4.0 support pinning of generic host memory

    if (bPinGenericMemory)

    {

        // unpin and delete host memory

        checkCudaErrors( cudaHostUnregister(*ppAligned_a) );

  #ifdef WIN32

        VirtualFree(*pp_a, 0, MEM_RELEASE);

  #else

        munmap(*pp_a, nbytes);

  #endif

    }

    else

#endif

    {

        cudaFreeHost(*pp_a);

    }

}

static char *sSyncMethod[] = 

{

    "0 (Automatic Blocking)",

    "1 (Spin Blocking)",

    "2 (Yield Blocking)",

    "3 (Undefined Blocking Method)",

    "4 (Blocking Sync Event) = low CPU utilization",

    NULL

};

void printHelp()

{

    printf("Usage: %s [options below]\n", sSDKsample);

    printf("\t--sync_method=n for CPU/GPU synchronization\n");

    printf("\t             n=%s\n", sSyncMethod[0]);

    printf("\t             n=%s\n", sSyncMethod[1]);

    printf("\t             n=%s\n", sSyncMethod[2]);

    printf("\t   <Default> n=%s\n", sSyncMethod[4]);

    printf("\t--use_generic_memory (default) use generic page-aligned for system memory\n");

    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate system memory\n");

}

#if defined(__APPLE__) || defined(MACOSX)

#define DEFAULT_PINNED_GENERIC_MEMORY false

#else

#define DEFAULT_PINNED_GENERIC_MEMORY true

#endif

int main(int argc, char **argv)

{

    int cuda_device = 0;

    int nstreams = 4;               // number of streams for CUDA calls

    int nreps = 1;                 // number of times each experiment is repeated

    int n = 16 * 1024 * 1024;       // number of ints in the data set

    int nbytes = n * sizeof(int);   // number of data bytes

    dim3 threads, blocks;           // kernel launch configuration

    float elapsed_time, time_memcpy, time_kernel;   // timing variables

    float scale_factor = 1.0f;

// allocate generic memory and pin it laster instead of using cudaHostAlloc()

bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior

    int  device_sync_method = cudaDeviceBlockingSync; // by default we use BlockingSync

int niterations;	// number of iterations for the loop inside the kernel

shrQAStart(argc, argv);

printf("[ %s ]\n\n", sSDKsample);

    if( checkCmdLineFlag( argc, (const char **)argv, "help") ) {

        printHelp();

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

    }

    if( (device_sync_method = getCmdLineArgumentInt( argc, (const char **)argv, "sync_method" )) >= 0 ) {

        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {

            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);

            printf("Setting reps to %d to demonstrate steady state\n",nreps);

            nreps = 100;

        } else {

            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);

            shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

        }

	} else {

        printHelp();

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

	}

if( checkCmdLineFlag( argc, (const char **)argv, "use_generic_memory") ) {

#if defined(__APPLE__) || defined(MACOSX)

        bPinGenericMemory = false;  // Generic Pinning of System Paged memory not currently supported on Mac OSX 

#else

        bPinGenericMemory = true;

#endif

    }

    if( checkCmdLineFlag( argc, (const char **)argv, "use_cuda_malloc_host") ) {

        bPinGenericMemory = false;

    }

printf("\n> ");

    cuda_device = findCudaDevice(argc, (const char **)argv);

// check the compute capability of the device

    int num_devices=0;

    checkCudaErrors( cudaGetDeviceCount(&num_devices) );

    if(0==num_devices)

    {

        printf("your system does not have a CUDA capable device, waiving test...\n");

        shrQAFinishExit(argc, (const char **)argv, QA_PASSED);

    }

	

    // check if the command-line chosen device ID is within range, exit if not

    if( cuda_device >= num_devices )

    {

        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices-1);

        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);

    }

cudaSetDevice( cuda_device );

// Checking for compute capabilities

    cudaDeviceProp deviceProp;

    checkCudaErrors( cudaGetDeviceProperties(&deviceProp, cuda_device) );

    if( (1 == deviceProp.major) && (deviceProp.minor < 1)) {

        printf("%s does not have Compute Capability 1.1 or newer.  Reducing workload.\n", deviceProp.name);

    }

if(deviceProp.major >= 2) {

        niterations = 100;

    } else {		    

        if(deviceProp.minor > 1) {

            niterations = 5;

        } else {

            niterations = 1; // reduced workload for compute capability 1.0 and 1.1

        }

    }

// Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false

    if (bPinGenericMemory) {

       printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

       if (deviceProp.canMapHostMemory == 0) {

          printf("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");

          bPinGenericMemory = false;

       }

    }

// Anything that is less than 32 Cores will have scaled down workload

    scale_factor = max((32.0f / (ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount) ), 1.0f);

    n = (int)rint( (float)n / scale_factor );

printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);

    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n", 

            deviceProp.multiProcessorCount,

            ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),

            ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

printf("> scale_factor = %1.4f\n", 1.0f/scale_factor);

    printf("> array_size   = %d\n\n", n);

// enable use of blocking sync, to reduce CPU usage

    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);

    cudaSetDeviceFlags( device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0 ) );

// allocate host memory

    int c = 5;                      // value to which the array will be initialized

    int *h_a = 0;

    int *h_b = 0;// pointer to the array data in host memory

    int *h_c = 0;

    int *hAligned_a = 0;

    int *hAligned_b = 0;

    int *hAligned_c = 0;

    // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

// Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if using the new CUDA 4.0 features

    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    AllocateHostMemory(bPinGenericMemory, &h_b, &hAligned_b, nbytes);

    AllocateHostMemory(bPinGenericMemory, &h_c, &hAligned_c, nbytes);

      for(int i = 0; i < n; i++)

  {

    h_a[i] = rand() / (int)RAND_MAX;

    h_b[i] = rand() / (int)RAND_MAX;

  }

    // allocate device memory

    int *d_a = 0, *d_c = 0,*d_b=0;             // pointers to data and init value in the device memory

   MY_SAFE_CALL(cudaMalloc(&d_a, nbytes));

   MY_SAFE_CALL(cudaMalloc(&d_b, nbytes));

   MY_SAFE_CALL(cudaMalloc(&d_c, nbytes));

     checkCudaErrors( cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice) );

printf("\nStarting Test\n");

// allocate and initialize an array of stream handles

    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));

    for(int i = 0; i < nstreams; i++) {

        checkCudaErrors( cudaStreamCreate(&(streams[i])) );

    }

// create CUDA event handles

    // use blocking sync

    cudaEvent_t start_event, stop_event,dummy_event;

    int eventflags = ( (device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync: cudaEventDefault );

checkCudaErrors( cudaEventCreateWithFlags(&start_event, eventflags) );

    checkCudaErrors( cudaEventCreateWithFlags(&stop_event, eventflags) );

    checkCudaErrors( cudaEventCreateWithFlags(&dummy_event, eventflags) );

// time memcopy from device

    cudaEventRecord(start_event, 0);     // record in stream-0, to ensure that all previous CUDA calls have completed

    cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaMemcpyAsync(hAligned_b, d_b, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaMemcpyAsync(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost, streams[0]);

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);   // block until the event is actually recorded

    checkCudaErrors( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );

    printf("memcopy:\t%.2f\n", time_memcpy);

// time kernel

    threads=dim3(512, 1);

    blocks=dim3(n / threads.x, 1);

    cudaEventRecord(start_event, 0);

    vectorAddGPU<<<blocks, threads, 0, streams[0]>>>(d_a,d_b, d_c, n,nreps);

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );

    printf("kernel:\t\t%.2f\n", time_kernel);

//////////////////////////////////////////////////////////////////////

    // time non-streamed execution for reference

    threads=dim3(512, 1);

    blocks=dim3(n / threads.x, 1);

    cudaEventRecord(start_event, 0);

	cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);

	cudaMemcpy(d_b, h_b, nbytes, cudaMemcpyHostToDevice); 

        vectorAddGPU<<<blocks, threads>>>(d_a,d_b, d_c, n,nreps);

         cudaMemcpy(hAligned_c, d_c, nbytes, cudaMemcpyDeviceToHost);

cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

    printf("non-streamed:\t%.2f (%.2f expected)\n", elapsed_time, time_kernel + time_memcpy);

//////////////////////////////////////////////////////////////////////

int*d_srcA,*d_srcB;

  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_srcA, (void *)h_a, 0));

   checkCudaErrors(cudaHostGetDevicePointer((void **)&d_srcB, (void *)h_b, 0));

//   checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));

    // time execution with nstreams streams

    threads=dim3(512,1);

    blocks=dim3(n/(nstreams*threads.x),1);

//     memset(hAligned_a, 255, nbytes);     // set host memory bits to all 1s, for testing correctness

//     cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness

    cudaEventRecord(start_event, 0);

//   	cudaMemcpy(d_a,h_a, nbytes, cudaMemcpyHostToDevice);

//   	cudaMemcpy(d_b,h_b, nbytes , cudaMemcpyHostToDevice);

        // asynchronously launch nstreams kernels, each operating on its own portion of data

        for(int i = 0; i < nstreams; i++){

	copyKernel<int><<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_srcA + i * n / nstreams,n/nstreams);

// cudaEventRecord(dummy_event, 0);

	copyKernel<int><<<blocks, threads, 0, streams[i]>>>(d_b + i * n / nstreams, d_srcB + i * n / nstreams,n/nstreams);

// cudaEventRecord(dummy_event, 0);

// checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

//  	cudaMemcpyAsync(d_a + i * n / nstreams,h_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

//  	cudaMemcpyAsync(d_b + i * n / nstreams,h_b + i * n / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);

        vectorAddGPU<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_b + i * n / nstreams,d_c + i * n / nstreams, n/nstreams,nreps);

	cudaMemcpyAsync(h_c + i * n / nstreams, d_c + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);

// asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only

        //   commence executing when all previous CUDA calls in stream x have completed

//         for(int i = 0; i < nstreams; i++)

}

    cudaEventRecord(stop_event, 0);

    cudaEventSynchronize(stop_event);

    checkCudaErrors( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );

    printf("%d streams:\t%.2f (%.2f expected with compute capability 1.1 or later)\n", nstreams, elapsed_time , time_kernel + time_memcpy / nstreams);

// check whether the output is correct

    printf("-------------------------------\n");

    bool bResults = correct_data(hAligned_a,hAligned_b,h_c, n);

// release resources

    for(int i = 0; i < nstreams; i++) {

        cudaStreamDestroy(streams[i]);

    }

    cudaEventDestroy(start_event);

    cudaEventDestroy(stop_event);

// Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)

    FreeHostMemory( bPinGenericMemory, &h_a, &hAligned_a, nbytes );

cudaFree(d_a);

    cudaFree(d_c);

cudaDeviceReset();

    shrQAFinishExit(argc, (const char **)argv, bResults ? QA_PASSED : QA_FAILED);

}

Anyway,interesting findings.