Hi
Thank you for your explanation. However, in our case it is clear that the overhead of unified memory on TK1 and TX2 is significant, even more significant than the memory copy itself. Consider the code example below, where I use three different mechanisms and compare them. The variable mode selects which mechanism to use, with some variations for the managed mode:
Mode = 0: Managed with CPU initialization
Mode = 1: Memory copy
Mode = 2: Pinned (zero copy)
Mode = 4: Managed with GPU initialization
Mode = 5: Managed with pre-fetching and memory advising
Mode 1 always gives the best performance, on all platforms. Because in my example the initialisation is performed before the kernel actually accesses the data, the kernel launch in mode 0 takes significantly longer than in mode 1 and even mode 2. When I perform the initialisation on the GPU instead (mode 4), the kernel launch time drops to close to mode 1. Mode 2 is always slower because of the caching issue. Mode 5 didn't really help; it seems I'm not using the pre-fetching correctly.
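For reference, below is the pre-fetching pattern as I understood it from the documentation (a minimal sketch of my understanding, so it may well be wrong; device id 0 and the default stream are my assumptions). The difference from my mode 5 is that it does not set a CPU preferred location before prefetching to the GPU:

#include <cuda_runtime.h>
#include <math.h>

__global__
void add_sketch(int n, float *x, float *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = x[i] + y[i];
}

int PrefetchSketch()
{
  int N = 1<<20;
  size_t bytes = N*sizeof(float);
  float *x, *y;
  cudaMallocManaged(&x, bytes);
  cudaMallocManaged(&y, bytes);
  // initialise on the host first, so the pages start out on the CPU
  for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; }
  // move the pages to the GPU before the launch, so the kernel
  // does not page-fault on first touch
  cudaMemPrefetchAsync(x, bytes, 0); // 0 = device id, default stream
  cudaMemPrefetchAsync(y, bytes, 0);
  // same (default) stream, so the kernel is ordered after the prefetches
  add_sketch<<<(N+255)/256, 256>>>(N, x, y);
  cudaDeviceSynchronize();
  // move y back before the host reads it
  cudaMemPrefetchAsync(y, bytes, cudaCpuDeviceId);
  cudaDeviceSynchronize();
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  cudaFree(x);
  cudaFree(y);
  return maxError;
}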
I cannot attach images to this thread to show the profiling results on TX2, which would illustrate my point better. I hope you can help me find what I'm missing in this code example to improve the performance of managed memory. Using nvprof to get the total execution time for each API, here is the sum for cudaLaunch:
Mode 0: 120.17 ms
Mode 1: 11 ms
Mode 2: 52 ms
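(For reference, the command I use is along these lines; the binary name is just an example, and the two arguments are the mode and the loop count as parsed in main below.)

nvprof ./add_test 0 10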
#include <iostream>
#include <math.h>
#include <cstdlib> // for atoi() in main
// CUDA kernel to add elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
// CUDA kernel to initialize the two arrays on the GPU (used by mode 4)
__global__
void initialize(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
}
int DoAddManaged (int mode)
{
int N = 1<<20;
float *x, *y;
// Allocate Unified Memory -- accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
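// cudaMemAdvise/cudaMemPrefetchAsync require CUDA 8+, which the TK1 (CUDA 6.5) does not have, hence the guard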
#ifndef TK1
if (mode==5)
{
cudaMemAdvise(x, N*sizeof(float),cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId );
cudaMemAdvise(y, N*sizeof(float),cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId );
cudaMemPrefetchAsync (x,N*sizeof(float),cudaCpuDeviceId);
cudaMemPrefetchAsync (y,N*sizeof(float),cudaCpuDeviceId);
}
#endif
//cudaMemPrefetchAsync (x,N/2,0);
//cudaMemPrefetchAsync (y,N/2,0);
//std::cout << "X pointer = " << std::hex << x << " Y pointer = " << std::hex << y << std::endl;
// Calculate the block size and number of blocks
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
// initialize x and y arrays: on the GPU for mode 4, on the host otherwise
if (mode==4)
{
initialize<<<numBlocks, blockSize>>>(N, x, y);
}
else
{
for (int i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
}
#ifndef TK1
if (mode==5)
{
cudaMemPrefetchAsync (x,N*sizeof(float),0);
cudaMemPrefetchAsync (y,N*sizeof(float),0);
}
#endif
// Launch kernel on 1M elements on the GPU
add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
//std::cout << "Max error: " << maxError << std::endl;
//if (mode==4)
//add<<<numBlocks, blockSize>>>(N, x, y);
// Free memory
cudaFree(x);
cudaFree(y);
return maxError;
}
int DoAddCopy()
{
int N = 1<<20;
// Allocate host memory
float *x = new float[N], *y = new float[N];
//std::cout << "X pointer = " << std::hex << x << " Y pointer = " << std::hex << y << std::endl;
// Device arrays
float *d_x, *d_y;
// Allocate device memory, only accessible from the GPU
cudaMalloc((void **) &d_x, N*sizeof(float));
cudaMalloc((void **) &d_y, N*sizeof(float));
//std::cout << "Device X pointer = " << std::hex << d_x << " Device Y pointer = " << std::hex << d_y << std::endl;
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Copy array contents of input from the host (CPU) to the device (GPU)
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Launch kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize(); // not needed here: the blocking cudaMemcpy below already synchronizes with the kernel
// Copy result back
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
//std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
delete[] x; // allocated with new[], so must be freed with delete[]
delete[] y;
return maxError;
}
int DoAddZeroCopy()
{
int N = 1<<20;
// Declare empty pointers
float *x = NULL, *y = NULL;
// Allocate host memory
cudaHostAlloc((void **)&x, N*sizeof(float), cudaHostAllocMapped);
cudaHostAlloc((void **)&y, N*sizeof(float), cudaHostAllocMapped);
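// As far as I understand, on Tegra this mapped (zero-copy) memory is accessed uncached; this is the caching issue I mentioned above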
//std::cout << "X pointer = " << std::hex << x << " Y pointer = " << std::hex << y << std::endl;
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Device pointers
float *d_x, *d_y;
// Get device pointer from host memory. No allocation or memcpy
cudaHostGetDevicePointer((void **)&d_x, (void *) x , 0);
cudaHostGetDevicePointer((void **)&d_y, (void *) y, 0);
//std::cout << "Device X pointer = " << std::hex << d_x << " Device Y pointer = " << std::hex << d_y << std::endl;
// no need to memory copy
// Launch kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, d_x, d_y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize(); // needed here: the host reads y directly after the kernel, with no copy in between
// No need to copy back
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
//std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFreeHost(x);
cudaFreeHost(y);
return maxError;
}
int main(int argc, char **argv)
{
int mode = 0;
int loop = 10;
int i = 0;
int rval = 0;
if (argc==3)
{
mode = atoi(argv[1]);
loop = atoi(argv[2]);
}
int device_count=0;
cudaGetDeviceCount (&device_count);
std::cout << "Mode = " << mode << " , Loop = " << loop << std::endl;
std::cout << "The number of cuda devices = " << device_count << std::endl;
switch(mode)
{
case 0:
case 4:
for (i=0;i<loop;i++)
rval+=DoAddManaged (mode);
break;
case 1:
for (i=0;i<loop;i++)
rval+=DoAddCopy();
break;
case 2:
for (i=0;i<loop;i++)
rval+=DoAddZeroCopy();
break;
default:
for (i=0;i<loop;i++)
rval+=DoAddManaged(mode);
break;
}
return rval;
}
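I build and run it with something like the following (the file and binary names are just examples; -DTK1 compiles out the advise/prefetch calls when building for the TK1):

nvcc -O2 add_test.cu -o add_test
./add_test 0 10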