Why does a program take longer with Unified Memory than without it?

When I run ./simpleMemcpy (without Unified Memory), the result is as follows:
xavier@xavier-desktop:~/cuda_demo/cuda-benchmarks-master$ ./simpleMemcpy 40000000
host: MallocHost: 0.017753
host: init arrays: 0.359125
device: malloc+copy+compute: 0.052061
host: access all arrays: 0.216636
host: access all arrays a second time: 0.053593
host: free: 0.033321
total: 0.732845

When I run ./simpleManaged (with Unified Memory), the result is as follows:
xavier@xavier-desktop:~/cuda_demo/cuda-benchmarks-master$ ./simpleManaged 40000000
host: MallocManaged: 0.015639
host: init arrays: 0.385382
device: uvm+compute+synchronize: 0.001809
host: access all arrays: 0.613202
host: access all arrays a second time: 0.054150
host: free: 0.033925
total: 1.104471

Why is the time of "host: access all arrays" in simpleManaged.cu longer than the corresponding time in simpleMemcpy.cu?

The Xavier is in the highest performance mode (nvpmodel -m 0 and jetson_clocks).
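For reference, here is a small diagnostic (not part of the benchmarks above, just a sketch) that prints how the device reports its managed-memory capabilities, which determines whether managed pages migrate between CPU and GPU:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int dev = 0, concurrentManaged = 0, pageableAccess = 0;

    /* 1 means the CPU and GPU can access managed memory concurrently. */
    cudaDeviceGetAttribute(&concurrentManaged,
                           cudaDevAttrConcurrentManagedAccess, dev);

    /* 1 means the GPU can access pageable host memory directly. */
    cudaDeviceGetAttribute(&pageableAccess,
                           cudaDevAttrPageableMemoryAccess, dev);

    printf("concurrentManagedAccess: %d\n", concurrentManaged);
    printf("pageableMemoryAccess:    %d\n", pageableAccess);
    return 0;
}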

Code of simpleMemcpy.cu:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <cuda_runtime.h>
#include "common.hh"

static __global__ void
f(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
int64_t stride = blockDim.x * gridDim.x;

for (int64_t i = index; i < N; i += stride) {
    c[i] = a[i] * b[i];
}

}

static void
doit(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int blockSize = 256;
int64_t numBlocks = (N + blockSize - 1) / blockSize;

f<<<numBlocks, blockSize>>>(a, b, c, N);

}

int
main(int argc, char *argv[])
{
size_t N = 10000000;
clock_t start_program, end_program;
clock_t start, end;
uint64_t *a, *b, *c;
uint64_t *da, *db, *dc;
size_t count;

if (argc == 2) {
    N = checked_strtosize(argv[1]);
}
count = checked_mul(N, sizeof(uint64_t));

/* Initialize context */
check(cudaMallocHost(&a, 128));
check(cudaDeviceSynchronize());
check(cudaFreeHost(a));

start_program = clock();

start = clock();
check(cudaMallocHost(&a, count));
check(cudaMallocHost(&b, count));
check(cudaMallocHost(&c, count));
end = clock();
log("host: MallocHost", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    a[i] = 3;
    b[i] = 5;
}
end = clock();
log("host: init arrays", start, end);

start = clock();
check(cudaMalloc(&da, count));
check(cudaMalloc(&db, count));
check(cudaMalloc(&dc, count));

check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

doit(da, db, dc, N);

check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));

check(cudaFree(da));
check(cudaFree(db));
check(cudaFree(dc));
end = clock();
log("device: malloc+copy+compute", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays a second time", start, end);

start = clock();
check(cudaFreeHost(a));
check(cudaFreeHost(b));
check(cudaFreeHost(c));
end = clock();
log("host: free", start, end);

end_program = clock();
log("total", start_program, end_program);

return 0;

}

Code of simpleManaged.cu:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <cuda_runtime.h>
#include "common.hh"

static __global__ void
f(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
int64_t stride = blockDim.x * gridDim.x;

for (int64_t i = index; i < N; i += stride) {
    c[i] = a[i] * b[i];
}

}

static void
doit(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int blockSize = 256;
int64_t numBlocks = (N + blockSize - 1) / blockSize;

f<<<numBlocks, blockSize>>>(a, b, c, N);

}

int
main(int argc, char *argv[])
{
size_t N = 10000000;
clock_t start_program, end_program;
clock_t start, end;
uint64_t *a, *b, *c;
size_t count;

if (argc == 2) {
    N = checked_strtosize(argv[1]);
}
count = checked_mul(N, sizeof(uint64_t));

/* Initialize context */
check(cudaMallocManaged(&a, 128));
check(cudaDeviceSynchronize());
check(cudaFree(a));

start_program = clock();

start = clock();
check(cudaMallocManaged(&a, count));
check(cudaMallocManaged(&b, count));
check(cudaMallocManaged(&c, count));
end = clock();
log("host: MallocManaged", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    a[i] = 3;
    b[i] = 5;
}
end = clock();
log("host: init arrays", start, end);

start = clock();
doit(a, b, c, N);
check(cudaDeviceSynchronize());
end = clock();
log("device: uvm+compute+synchronize", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays a second time", start, end);

start = clock();
check(cudaFree(a));
check(cudaFree(b));
check(cudaFree(c));
end = clock();
log("host: free", start, end);

end_program = clock();
log("total", start_program, end_program);

return 0;

}

Hi,

Unified memory uses the same buffer pointer for CPU and GPU, so some overhead occurs due to synchronization.
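For illustration only (this is a sketch, not code from the benchmark or the tutorial): if the device reports concurrentManagedAccess = 1, you can prefetch the managed buffers to the GPU before the kernel and back to the CPU before the verification loop, which moves most of the migration cost out of the "host: access all arrays" step. Reusing a, b, c, count, N, check() and doit() from simpleManaged.cu above:

int device = 0;

/* Hint the driver to move the managed buffers to the GPU before the kernel runs. */
check(cudaMemPrefetchAsync(a, count, device, 0));
check(cudaMemPrefetchAsync(b, count, device, 0));
check(cudaMemPrefetchAsync(c, count, device, 0));

doit(a, b, c, N);   /* same kernel launch as before */

/* Bring the data back to the CPU before the host verification loops read it. */
check(cudaMemPrefetchAsync(a, count, cudaCpuDeviceId, 0));
check(cudaMemPrefetchAsync(b, count, cudaCpuDeviceId, 0));
check(cudaMemPrefetchAsync(c, count, cudaCpuDeviceId, 0));
check(cudaDeviceSynchronize());

/* After this, the "host: access all arrays" timing should be much closer to
   the cudaMallocHost version, since the pages are already CPU-resident. */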
Below is a good tutorial on improving unified memory performance:

Thanks.


NICE!
Thanks!