Trying to determine host to device (and back) memory throughput when using cudaMallocManaged

Hi all,

I am trying to assess the efficiency of host-to-device (and back again) memory transfers using the
different transfer options available (cudaMemcpy, cudaMemcpyAsync, and cudaMallocManaged). My current setup makes arbitrarily sized transfers and measures:

  1. the time each transfer takes to complete, measured with both a host timer and CUDA events, and
  2. the transfer details reported by an nvprof API trace.

I collect this information by running my test applications and parsing the output with a set of scripts I’ve created to generate some graphs.
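For reference, this is the kind of CUDA-event measurement I use for the explicit-copy cases (a minimal sketch; the 64 MB size and pageable host buffer are just placeholders):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
  const size_t bytes = 64UL * 1024 * 1024;  // placeholder transfer size
  char *host = (char *) malloc(bytes);
  char *dev;
  cudaMalloc(&dev, bytes);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // time a host-to-device copy
  cudaEventRecord(start);
  cudaMemcpy(dev, host, bytes, cudaMemcpyHostToDevice);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("HtoD: %.2f GB/s\n", bytes / (ms * 1.0e6));  // GB/s = bytes / (ms * 1e6)

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(dev);
  free(host);
  return 0;
}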

This method of determining bandwidth has worked very well for me in the first two cases, but it breaks down with cudaMallocManaged, since there is no explicit memcpy call for me to time. nvprof doesn’t help either: the API trace shows no calls to memcpy (or the like). Using the Visual Profiler (NVVP), I can see that device allocation and the host-to-device copying happen as part of the kernel launch (see image in link):

http://imgur.com/wXZcfPT

I can see that NVVP reports an average bandwidth for the transfers, but I am unable to extract this information into a format that I can parse (exporting a CSV only gives me details about the kernel execution, and NVVP stores its timeline data in an encoded form).

My question then is this:

  • Is there a way to programmatically determine host-to-device (and device-to-host) transfer bandwidth when using cudaMallocManaged?
  • Or would it be possible to parse the NVVP profile file and extract the information from the timeline?

Below is an example of my code using cudaMallocManaged:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

/* Helper macros referenced below, restored here so the listing compiles
   on its own; BLOCK (threads per block) can be any multiple of 32 */
#define BLOCK 256
#define MB2B(x) ((size_t)(x) * 1024 * 1024)
#define B2MB(x) ((x) / (1024.0 * 1024.0))

/* Activate CUDA checks - may incur a performance penalty */ 
#define CUDA_CHECK_ERRORS

// CUDA-related error handling - no synchronising is done...
#define cudaError(ans) { __cudaAssert((ans), __FILE__, __LINE__); }
inline
void __cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
#ifdef CUDA_CHECK_ERRORS
  if (code != cudaSuccess) 
  {
    fprintf(stderr,"cudaError: %s in %s:%d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
#endif
  return;
}

__global__
void kernel(int * data, int iter)
{
  int i;
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  for(i = 0; i < iter; i++)
  {
    data[index] = iter * i;
  }
}

int main(int argc, char ** argv)
{
  int sizemb, sizeb, iters, exits = EXIT_SUCCESS;
  int * sidata, testing = 0;
  int devID, gridsize, i;
  cudaDeviceProp props;
  cudaEvent_t scomp, ecomp;
  float ccomp;

  if( argc < 3) {
    cprintf(stderr, "USAGE: %s SIZE[MB] COMPLEX[ITERS]\n", argv[0]);
    return EXIT_FAILURE;
  } else {
    // assume user enters in MB
    sscanf(argv[1], "%d", &sizemb);
    sscanf(argv[2], "%d", &iters);
    // convert MB to a count of ints
    sizeb = MB2B(sizemb) / sizeof(int);
    if( sizeb % BLOCK != 0)
    {
      fprintf(stderr, "-> trimming array by %d elements to a multiple of BLOCK\n", sizeb % BLOCK);
      sizeb = (sizeb / BLOCK) * BLOCK;
    }
    fprintf(stderr, "-> { %d MB, %d iterations, %d ints at %zu bytes each }\n", sizemb, iters, sizeb, sizeof(int));
  }

  // Assumes that there is only one CUDA device on the system
  devID = 0;

  //Get GPU information
  cudaError(cudaGetDevice(&devID));
  cudaError(cudaGetDeviceProperties(&props, devID));
  fprintf(stderr, "-> { GPU %d: \"%s\", %d.%d CC}\n",
          devID, props.name, props.major, props.minor);

  // cuda events
  cudaError(cudaEventCreate(&scomp));
  cudaError(cudaEventCreate(&ecomp));

  // allocate a managed buffer and fill it from the host
  cudaError(cudaMallocManaged(&sidata, sizeof(int) * sizeb));
  for(i = 0; i < sizeb; i++) { sidata[i] = iters; }
  fprintf(stderr, "-> filled %f MB\n", B2MB(sizeof(int) * sizeb));

  gridsize = (sizeb + BLOCK - 1) / BLOCK;

  // launch kernel
  fprintf(stderr, "-> kernal launch {%d, %d}\n", gridsize, BLOCK);
  cudaError(cudaEventRecord(scomp));
  kernelordered<<<gridsize, BLOCK>>>( sidata, iters);
  cudaError(cudaPeekAtLastError());
  cudaError(cudaEventRecord(ecomp));
  cudaError(cudaEventSynchronize(ecomp));

  // get CUDA event elapsed time
  cudaError(cudaEventElapsedTime(&ccomp, scomp, ecomp));

  // verify results on the host (touching sidata migrates pages back from the device)
  fprintf(stderr, "-> verifying results on host\n");
  testing = iters * (iters - 1);  // final value written by the kernel
  for(i = 0; i < sizeb; i++)
  {
    if(sidata[i] != testing)
    {
      fprintf(stderr, "The kernel did not compute correctly! [%d]: %d != %d\n", i, sidata[i], testing);
      exits = EXIT_FAILURE;
      goto frees;
    }
  }

  fprintf(stdout, "-> Print metrics (CUDA Events):\n");
  fprintf(stdout, "Kernel: %f\n", ccomp);
  
frees:
  // free cuda buffer
  cudaError(cudaFree(sidata));
  
  // destroy cuda events
  cudaEventDestroy(scomp);
  cudaEventDestroy(ecomp);
  
  // explicitly clear the device
  cudaError(cudaDeviceReset());

  return exits;
}

// vim: set ts=2 sw=2 :
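For what it’s worth, I build and run the example along these lines (the file name is mine):

$ nvcc -O2 -o managed managed.cu
$ ./managed 64 100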

Have you looked at the options for nvprof? There is an option to track managed memory transfers:

http://docs.nvidia.com/cuda/profiler-users-guide/index.html#unified-memory-profiling

These options will give useful output if either --print-api-trace or --print-gpu-trace is specified. Here is sample output with --print-api-trace:

$ nvprof --print-api-trace --unified-memory-profiling per-process-device ./t1260
==29336== NVPROF is profiling process 29336, command: ./t1260
data = 1
host data = 2
==29336== Profiling application: ./t1260
==29336== Profiling result:
   Start  Duration             Unified Memory  Name
<snip>
160.23ms  318.73ms                          -  cudaMallocManaged
478.97ms         -                          1  [Unified Memory CPU page faults]
478.99ms  2.2720us                 4.000000KB  [Unified Memory Memcpy DtoH]
479.04ms  2.4860us                          -  cudaConfigureCall
479.04ms  4.0800us                          -  cudaSetupArgument
479.05ms  1.1679ms                          -  cudaLaunch (test(int*) [391])
480.10ms  2.9760us                 4.000000KB  [Unified Memory Memcpy HtoD]
480.22ms  165.97us                          -  cudaDeviceSynchronize
480.39ms  1.9520us                 4.000000KB  [Unified Memory Memcpy DtoH]
480.39ms         -                          1  [Unified Memory CPU page faults]
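If you need output your scripts can parse, nvprof can also log the same trace as CSV (exact flag support depends on your CUDA version):

$ nvprof --csv --log-file trace.csv --print-gpu-trace --unified-memory-profiling per-process-device ./t1260

The unified memory rows then land in trace.csv alongside the kernel rows.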

Hi txbob, that’s exactly what I was looking for! Thank you!

I didn’t realise that was what the unified-memory-profiling flag provided…
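In case it helps anyone else: on Pascal or newer with CUDA 8+, cudaMemPrefetchAsync can force an explicit migration of a managed allocation, which can then be timed with the same CUDA-event approach. A minimal sketch against the sidata buffer from my listing above (I have not tested this on older hardware):

  // time an explicit host-to-device migration of the managed buffer
  cudaError(cudaEventRecord(scomp));
  cudaError(cudaMemPrefetchAsync(sidata, sizeof(int) * sizeb, devID, 0));
  cudaError(cudaEventRecord(ecomp));
  cudaError(cudaEventSynchronize(ecomp));
  cudaError(cudaEventElapsedTime(&ccomp, scomp, ecomp));
  fprintf(stderr, "prefetch HtoD: %.2f GB/s\n", (sizeof(int) * sizeb) / (ccomp * 1.0e6));
  // pass cudaCpuDeviceId as the destination to time the device-to-host direction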