Why does a program take longer with Unified Memory than without it?

When I run ./simpleMemcpy (without Unified Memory), the result is as follows:
xavier@xavier-desktop:~/cuda_demo/cuda-benchmarks-master$ ./simpleMemcpy 40000000
host: MallocHost: 0.017753
host: init arrays: 0.359125
device: malloc+copy+compute: 0.052061
host: access all arrays: 0.216636
host: access all arrays a second time: 0.053593
host: free: 0.033321
total: 0.732845

When I run ./simpleManaged (with Unified Memory), the result is as follows:
xavier@xavier-desktop:~/cuda_demo/cuda-benchmarks-master$ ./simpleManaged 40000000
host: MallocManaged: 0.015639
host: init arrays: 0.385382
device: uvm+compute+synchronize: 0.001809
host: access all arrays: 0.613202
host: access all arrays a second time: 0.054150
host: free: 0.033925
total: 1.104471

Why is the time of "host: access all arrays" in simpleManaged.cu longer than the corresponding time in simpleMemcpy.cu?

The Xavier is in the highest performance mode (nvpmodel -m 0 and jetson_clocks).
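For reference, here is a small diagnostic (not part of the benchmarks above, just a sketch) that prints how the device reports its managed-memory capabilities, which determines whether managed pages migrate between CPU and GPU:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int dev = 0, concurrentManaged = 0, pageableAccess = 0;

    /* 1 means the CPU and GPU can access managed memory concurrently. */
    cudaDeviceGetAttribute(&concurrentManaged,
                           cudaDevAttrConcurrentManagedAccess, dev);

    /* 1 means the GPU can access pageable host memory directly. */
    cudaDeviceGetAttribute(&pageableAccess,
                           cudaDevAttrPageableMemoryAccess, dev);

    printf("concurrentManagedAccess: %d\n", concurrentManaged);
    printf("pageableMemoryAccess:    %d\n", pageableAccess);
    return 0;
}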

Code of simpleMemcpy.cu:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <cuda_runtime.h>
#include "common.hh"

static __global__ void
f(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
int64_t stride = blockDim.x * gridDim.x;

for (int64_t i = index; i < N; i += stride) {
    c[i] = a[i] * b[i];
}

}

static void
doit(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int blockSize = 256;
int64_t numBlocks = (N + blockSize - 1) / blockSize;

f<<<numBlocks, blockSize>>>(a, b, c, N);

}

int
main(int argc, char *argv[])
{
size_t N = 10000000;
clock_t start_program, end_program;
clock_t start, end;
uint64_t *a, *b, *c;
uint64_t *da, *db, *dc;
size_t count;

if (argc == 2) {
    N = checked_strtosize(argv[1]);
}
count = checked_mul(N, sizeof(uint64_t));

/* Initialize context */
check(cudaMallocHost(&a, 128));
check(cudaDeviceSynchronize());
check(cudaFreeHost(a));

start_program = clock();

start = clock();
check(cudaMallocHost(&a, count));
check(cudaMallocHost(&b, count));
check(cudaMallocHost(&c, count));
end = clock();
log("host: MallocHost", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    a[i] = 3;
    b[i] = 5;
}
end = clock();
log("host: init arrays", start, end);

start = clock();
check(cudaMalloc(&da, count));
check(cudaMalloc(&db, count));
check(cudaMalloc(&dc, count));

check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

doit(da, db, dc, N);

check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));

check(cudaFree(da));
check(cudaFree(db));
check(cudaFree(dc));
end = clock();
log("device: malloc+copy+compute", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays a second time", start, end);

start = clock();
check(cudaFreeHost(a));
check(cudaFreeHost(b));
check(cudaFreeHost(c));
end = clock();
log("host: free", start, end);

end_program = clock();
log("total", start_program, end_program);

return 0;

}

Code of simpleManaged.cu:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <cuda_runtime.h>
#include "common.hh"

static __global__ void
f(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
int64_t stride = blockDim.x * gridDim.x;

for (int64_t i = index; i < N; i += stride) {
    c[i] = a[i] * b[i];
}

}

static void
doit(const uint64_t *a, const uint64_t *b, uint64_t *c, int64_t N)
{
int blockSize = 256;
int64_t numBlocks = (N + blockSize - 1) / blockSize;

f<<<numBlocks, blockSize>>>(a, b, c, N);

}

int
main(int argc, char *argv[])
{
size_t N = 10000000;
clock_t start_program, end_program;
clock_t start, end;
uint64_t *a, *b, *c;
size_t count;

if (argc == 2) {
    N = checked_strtosize(argv[1]);
}
count = checked_mul(N, sizeof(uint64_t));

/* Initialize context */
check(cudaMallocManaged(&a, 128));
check(cudaDeviceSynchronize());
check(cudaFree(a));

start_program = clock();

start = clock();
check(cudaMallocManaged(&a, count));
check(cudaMallocManaged(&b, count));
check(cudaMallocManaged(&c, count));
end = clock();
log("host: MallocManaged", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    a[i] = 3;
    b[i] = 5;
}
end = clock();
log("host: init arrays", start, end);

start = clock();
doit(a, b, c, N);
check(cudaDeviceSynchronize());
end = clock();
log("device: uvm+compute+synchronize", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays", start, end);

start = clock();
for (size_t i = 0; i < N; i++) {
    if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
        fprintf(stderr, "unexpected result a: %lu  b: %lu  c: %lu\n",
                a[i], b[i], c[i]);
        exit(1);
    }
}
end = clock();
log("host: access all arrays a second time", start, end);

start = clock();
check(cudaFree(a));
check(cudaFree(b));
check(cudaFree(c));
end = clock();
log("host: free", start, end);

end_program = clock();
log("total", start_program, end_program);

return 0;

}

Hi,

Unified memory uses the same buffer pointer for CPU and GPU, so some overhead occurs due to synchronization.
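For illustration only (this is a sketch, not code from the benchmark or the tutorial): if the device reports concurrentManagedAccess = 1, you can prefetch the managed buffers to the GPU before the kernel and back to the CPU before the verification loop, which moves most of the migration cost out of the "host: access all arrays" step. Reusing a, b, c, count, N, check() and doit() from simpleManaged.cu above:

int device = 0;

/* Hint the driver to move the managed buffers to the GPU before the kernel runs. */
check(cudaMemPrefetchAsync(a, count, device, 0));
check(cudaMemPrefetchAsync(b, count, device, 0));
check(cudaMemPrefetchAsync(c, count, device, 0));

doit(a, b, c, N);   /* same kernel launch as before */

/* Bring the data back to the CPU before the host verification loops read it. */
check(cudaMemPrefetchAsync(a, count, cudaCpuDeviceId, 0));
check(cudaMemPrefetchAsync(b, count, cudaCpuDeviceId, 0));
check(cudaMemPrefetchAsync(c, count, cudaCpuDeviceId, 0));
check(cudaDeviceSynchronize());

/* After this, the "host: access all arrays" timing should be much closer to
   the cudaMallocHost version, since the pages are already CPU-resident. */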
Below is a good tutorial on improving unified memory performance:

Thanks.


NICE!
Thanks!