Hi all,
I am trying to assess the efficiency of host-to-device (and device-to-host) memory transfers using the
different transfer options available (cudaMemcpy, cudaMemcpyAsync, and managed memory via cudaMallocManaged). My current setup involves making arbitrarily sized transfers and measuring:
- the time each transfer takes to complete, using both a host timer and CUDA events
- the output of an nvprof API trace
I collect this information by running my test applications and parse it with a set of scripts I've written to generate graphs (a sketch of the event-based timing I use is below).
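For reference, the event-based measurement in the explicit-copy cases looks roughly like this (a minimal, self-contained sketch; h_buf, d_buf, and the 64 MB size are just placeholders):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main(void)
{
  const size_t nbytes = 64UL * 1024 * 1024;  // 64 MB test transfer
  int *h_buf = (int *)malloc(nbytes);
  int *d_buf;
  cudaMalloc(&d_buf, nbytes);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // time a single explicit host-to-device copy with CUDA events
  cudaEventRecord(start);
  cudaMemcpy(d_buf, h_buf, nbytes, cudaMemcpyHostToDevice);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  // bandwidth in GB/s: bytes / seconds / 1e9
  printf("HtoD: %.3f ms, %.3f GB/s\n", ms, nbytes / (ms / 1000.0) / 1e9);

  cudaFree(d_buf);
  free(h_buf);
  return 0;
}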
This method for determining bandwidth has worked very well for me in the first two cases, but it breaks down with cudaMallocManaged, as there is no explicit memcpy call for me to time. nvprof doesn't help either, since the API trace shows no calls to memcpy (or the like). Using the Visual Profiler (NVVP), I can see that device allocation and the host-to-device copying happen as part of the kernel launch (see image in link):
I can see that NVVP reports an average bandwidth for the transfers, but I am unable to extract this information in a format I can parse (generating a CSV only gives me details about the kernel execution, and NVVP stores the timeline information in its profile in an encoded form).
My questions then are:
- Is there a way to programmatically determine host-to-device (and device-to-host) transfer bandwidth when using cudaMallocManaged?
- Or would it be possible to parse the NVVP profile file and extract this information from the timeline?
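The closest workaround I can think of is to force the migration explicitly with cudaMemPrefetchAsync and time it with CUDA events, but that measures a bulk prefetch rather than the on-demand page migration I actually want to characterise. A minimal sketch, assuming a CUDA 8+ toolkit and a device that supports managed memory prefetching:

#include <cstdio>
#include <cuda_runtime.h>

int main(void)
{
  const size_t nbytes = 64UL * 1024 * 1024;  // 64 MB managed buffer
  int *data;
  cudaMallocManaged(&data, nbytes);
  // touch the data on the host so the pages start out there
  for (size_t i = 0; i < nbytes / sizeof(int); i++) data[i] = 1;

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // time an explicit migration of the whole buffer to device 0
  cudaEventRecord(start);
  cudaMemPrefetchAsync(data, nbytes, 0, 0);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("HtoD prefetch: %.3f ms, %.3f GB/s\n", ms, nbytes / (ms / 1000.0) / 1e9);

  cudaFree(data);
  return 0;
}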
Below is an example of my code using cudaMallocManaged:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

/* Block size and MB/byte conversions (defined in a shared header in my
 * actual build; inlined here so the example compiles on its own) */
#define BLOCK 256
#define MB2B(x) ((x) * 1024 * 1024)
#define B2MB(x) ((x) / (1024.0 * 1024.0))

/* Activate CUDA checks - may incur a performance penalty */
#define CUDA_CHECK_ERRORS

// CUDA-related error handling - no synchronising is done...
#define cudaError(ans) { __cudaAssert((ans), __FILE__, __LINE__); }
inline
void __cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
#ifdef CUDA_CHECK_ERRORS
  if (code != cudaSuccess)
  {
    fprintf(stderr, "cudaError: %s in %s:%d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
#endif
  return;
}
__global__
void kernel(int * data, int iter)
{
  int i;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // each thread rewrites its element iter times; the
  // last write leaves iter * (iter - 1) in data[index]
  for(i = 0; i < iter; i++)
  {
    data[index] = iter * i;
  }
}
int main(int argc, char ** argv)
{
  int sizemb, sizeb, iters, exits = EXIT_SUCCESS;
  int * sidata, testing = 0;
  int devID, gridsize, i;
  cudaDeviceProp props;
  cudaEvent_t scomp, ecomp;
  float ccomp;

  if( argc < 3) {
    fprintf(stderr, "USAGE: %s SIZE[MB] COMPLEX[ITERS]\n", argv[0]);
    return EXIT_FAILURE;
  } else {
    // assume user enters the size in MB
    sscanf(argv[1], "%d", &sizemb);
    sscanf(argv[2], "%d", &iters);
    // convert to a number of ints
    sizeb = MB2B(sizemb) / sizeof(int);
    // trim the element count to a multiple of the block size
    if( sizeb % BLOCK != 0)
    {
      fprintf(stderr, "-> scaling down array size by %d elements\n", sizeb % BLOCK);
      sizeb = (sizeb / BLOCK) * BLOCK;
    }
    fprintf(stderr, "-> { %d MB, %d iterations, %d ints at %zu bytes }\n", sizemb, iters, sizeb, sizeof(int));
  }
  // Assumes that there is only one CUDA device on the system
  devID = 0;
  // Get GPU information
  cudaError(cudaGetDevice(&devID));
  cudaError(cudaGetDeviceProperties(&props, devID));
  fprintf(stderr, "-> { GPU %d: \"%s\", %d.%d CC}\n",
      devID, props.name, props.major, props.minor);
  // cuda events
  cudaEventCreate(&scomp);
  cudaEventCreate(&ecomp);
  // create the managed buffer and fill it from the host
  cudaError(cudaMallocManaged(&sidata, sizeof(int) * sizeb));
  for(i = 0; i < sizeb; i++) { sidata[i] = iters; }
  fprintf(stderr, "-> filled %f MB\n", B2MB(sizeof(int) * sizeb));
  gridsize = (sizeb + BLOCK - 1) / BLOCK;
  // launch kernel
  fprintf(stderr, "-> kernel launch {%d, %d}\n", gridsize, BLOCK);
  cudaError(cudaEventRecord(scomp));
  kernel<<<gridsize, BLOCK>>>( sidata, iters);
  cudaError(cudaPeekAtLastError());
  cudaError(cudaEventRecord(ecomp));
  cudaError(cudaEventSynchronize(ecomp));
  // get CUDA event elapsed time (in ms)
  cudaError(cudaEventElapsedTime(&ccomp, scomp, ecomp));
  // check the result on the host (reading sidata migrates it back)
  fprintf(stderr, "-> compute on host\n");
  testing = iters * (iters - 1);
  for(i = 0; i < sizeb; i++)
  {
    if(sidata[i] != testing)
    {
      fprintf(stderr, "The kernel did not compute correctly! [%d]: %d != %d\n", i, sidata[i], testing);
      exits = EXIT_FAILURE;
      goto frees;
    }
  }
  fprintf(stdout, "-> Print metrics (CUDA Events):\n");
  fprintf(stdout, "Kernel: %f ms\n", ccomp);
frees:
  // free cuda buffer
  cudaError(cudaFree(sidata));
  // destroy cuda events
  cudaEventDestroy(scomp);
  cudaEventDestroy(ecomp);
  // explicitly clear the device
  cudaError(cudaDeviceReset());
  return exits;
}
// vim: set ts=2 sw=2 :
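For completeness, I build and run this with something like the following (the file name is just what I happen to use):

nvcc -o managed managed.cu
./managed 64 100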