Why is the access speed of memory allocated by cudaMallocHost so slow?

Hi:
We’ve developed a product with video. There are 4 input signals connected to the TX2; every signal has the video format YUV422 8-bit packed UYVY. I use CUDA to perform the colorspace conversion. But the access speed of the memory allocated by cudaMallocHost is so slow that I can’t fetch the result in time; in the end, the FPS can’t reach 60.

So, I tested the performance of those memory with codes below.

// Benchmark CPU copy throughput on plain pageable host memory (new[]):
// run `times` memcpy passes over a cx*cy*4-byte buffer and print the
// average throughput in MiB/s.
//
// cx, cy  - frame dimensions in pixels (4 bytes per pixel assumed).
// times   - number of memcpy iterations to average over.
void stdm_test(size_t cx, size_t cy, int times)
{
    const size_t sz = cx*cy*4;  // 4 bytes per pixel (RGBA-sized frame)
    char* src = new char[sz];
    char* dst = new char[sz];

    // Pre-fault both buffers before timing. Freshly new[]-ed memory is not
    // resident yet: the first pass would otherwise pay page-fault costs,
    // and copying from untouched (zero-page backed) memory skews results.
    memset(src, 0x5a, sz);
    memset(dst, 0, sz);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < times; ++i) {
        memcpy(dst, src, sz);
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto ts = std::chrono::duration<double>(t2 - t1).count();
    printf("stdm takes:%f seconds, Avg speed:%8.3fM/s\n", ts, (double)sz*times/ts/1024/1024.);

    delete[] src;
    delete[] dst;
}

// Benchmark CPU copy throughput on pinned (page-locked, mapped) host
// memory from cudaHostAlloc: run `times` memcpy passes over a
// cx*cy*4-byte buffer and print the average throughput in MiB/s.
//
// cx, cy  - frame dimensions in pixels (4 bytes per pixel assumed).
// times   - number of memcpy iterations to average over.
void cuda_test(size_t cx, size_t cy, int times)
{
    char* src = nullptr;
    char* dst = nullptr;
    const size_t sz = cx*cy*4;  // 4 bytes per pixel (RGBA-sized frame)

    // Check the allocation results -- if cudaHostAlloc fails, memcpy below
    // would dereference null/garbage pointers. cudaSuccess is 0, so any
    // non-zero return code means failure.
    if (cudaHostAlloc((void**)&src, sz, cudaHostAllocMapped) != 0) {
        fprintf(stderr, "cuda_test: cudaHostAlloc(src, %zu) failed\n", sz);
        return;
    }
    if (cudaHostAlloc((void**)&dst, sz, cudaHostAllocMapped) != 0) {
        fprintf(stderr, "cuda_test: cudaHostAlloc(dst, %zu) failed\n", sz);
        cudaFreeHost(src);
        return;
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < times; ++i) {
        memcpy(dst, src, sz);
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto ts = std::chrono::duration<double>(t2 - t1).count();
    printf("cuda takes:%f seconds, Avg speed:%8.3fM/s\n", ts, (double)sz*times/ts/1024/1024.);

    cudaFreeHost(src);
    cudaFreeHost(dst);
}

On TX2, the result is:
stdm: 3783 M/s
cuda: 582 M/s

I did the same testing on my PC with a Quadro P620, the result is:
stdm: 4556 M/s
cuda: 4601 M/s

Why? Could you give me some suggestions to improve my program please? Please give me a minute to introduce my program.

I create a thread to poll the video devices (/dev/video0~N); when a frame is ready, the thread copies the frame data to a buffer allocated by cudaMallocHost, then starts the CUDA kernel to perform the colorspace conversion (to RGBA); finally, the thread broadcasts the result to its subscribers.

I created a sample subscriber for our customers to demonstrate the usage of our product. When the subscriber received a frame, I clone it and share the clone to the renderer thread. The procedure of clone is just a cudaMemcpy invocation, but this operation will take more than 16 milliseconds, it’s really bad.

Hi,

First, have you maximized the device performance?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

If the issue persists, would you mind sharing a complete sample with the cx, cy info with us?
Thanks.
Thanks.

Hi,

sudo jetson_clocks

This command succeeded; jetson_clocks --show printed some text ending with “MAXP_CORE_ARM”.

cx,cy is the resolution of my video, we used 3 formats in our application:

  • 1920x1080
  • 1280x720
  • 720x576
//
// (c) 2020 chiv
//
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <memory.h>
#include <chrono>
#include <algorithm>

// Benchmark CPU copy throughput on plain pageable host memory (new[]):
// run `times` memcpy passes over a cx*cy*4-byte buffer and print the
// average throughput in MiB/s.
//
// cx, cy  - frame dimensions in pixels (4 bytes per pixel assumed).
// times   - number of memcpy iterations to average over.
void stdm_test(size_t cx, size_t cy, int times)
{
    const size_t sz = cx*cy*4;  // 4 bytes per pixel (RGBA-sized frame)
    char* src = new char[sz];
    char* dst = new char[sz];

    // Pre-fault both buffers before timing. Freshly new[]-ed memory is not
    // resident yet: the first pass would otherwise pay page-fault costs,
    // and copying from untouched (zero-page backed) memory skews results.
    memset(src, 0x5a, sz);
    memset(dst, 0, sz);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < times; ++i) {
        memcpy(dst, src, sz);
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto ts = std::chrono::duration<double>(t2 - t1).count();
    printf("stdm takes:%f seconds, Avg speed:%8.3fM/s\n", ts, (double)sz*times/ts/1024/1024.);

    delete[] src;
    delete[] dst;
}

// Benchmark CPU copy throughput on pinned (page-locked, mapped) host
// memory from cudaHostAlloc: run `times` memcpy passes over a
// cx*cy*4-byte buffer and print the average throughput in MiB/s.
//
// cx, cy  - frame dimensions in pixels (4 bytes per pixel assumed).
// times   - number of memcpy iterations to average over.
void cuda_test(size_t cx, size_t cy, int times)
{
    char* src = nullptr;
    char* dst = nullptr;
    const size_t sz = cx*cy*4;  // 4 bytes per pixel (RGBA-sized frame)

    // Check the allocation results -- if cudaHostAlloc fails, memcpy below
    // would dereference null/garbage pointers. cudaSuccess is 0, so any
    // non-zero return code means failure.
    if (cudaHostAlloc((void**)&src, sz, cudaHostAllocMapped) != 0) {
        fprintf(stderr, "cuda_test: cudaHostAlloc(src, %zu) failed\n", sz);
        return;
    }
    if (cudaHostAlloc((void**)&dst, sz, cudaHostAllocMapped) != 0) {
        fprintf(stderr, "cuda_test: cudaHostAlloc(dst, %zu) failed\n", sz);
        cudaFreeHost(src);
        return;
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < times; ++i) {
        memcpy(dst, src, sz);
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto ts = std::chrono::duration<double>(t2 - t1).count();
    printf("cuda takes:%f seconds, Avg speed:%8.3fM/s\n", ts, (double)sz*times/ts/1024/1024.);

    cudaFreeHost(src);
    cudaFreeHost(dst);
}

// Entry point. Usage: ./test [cx [cy [times]]]
// Defaults to a 1280x720 frame and 10 iterations, then runs the pinned-
// memory benchmark followed by the pageable-memory benchmark.
int main(int argc, char** argv)
{
    size_t cx = 1280, cy = 720;
    int times = 10;
    // Deliberate fallthrough: each extra argument fills in one more value.
    switch (argc) {
    case 4:
        times = atoi(argv[3]);
        // fall through
    case 3:
        cy = strtoul(argv[2], nullptr, 10);
        // fall through
    case 2:
        cx = strtoul(argv[1], nullptr, 10);
        // fall through
    default:break;
    }
    // Clamp to sane minimums; this also covers unparsable arguments, which
    // atoi/strtoul turn into 0. Use the explicit template argument instead
    // of max(cx, 1ul): the 1ul literal fails to compile on platforms where
    // size_t is not unsigned long (e.g. 64-bit Windows).
    cx = std::max<size_t>(cx, 1);
    cy = std::max<size_t>(cy, 1);
    times = std::max(times, 1);
    // %zu is the portable printf conversion for size_t (%lu is wrong on LLP64).
    printf("cx = %zu, cy = %zu, times = %d\n", cx, cy, times);

    cuda_test(cx, cy, times);
    stdm_test(cx, cy, times);

    return 0;
}

I saved these codes to a file named “test.cpp”, then compile it with

g++ -O2 -o test test.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart

Run it

./test 1920 1080 10
./test 1280 720
./test 720 576
./test 720 576 100

.etc

Thanks, we are going to reproduce this in our environment.
Will update here if any progress.

Thanks. Before this program, I moved the result into a GL context with the CUDA API, so I did not need to access the result from the CPU. But our customers want to process the result themselves, so we have to provide them with a block of memory; that is why I use cudaMallocHost to allocate it.

1280x720 reaches 60 FPS. The maximized size is 1920x1080, and our L4T is 32.4.

Hi,

Thanks for your patience.
We found this issue is not a bug but comes from a limitation of TX2.

Please note that the TX2 doesn’t support I/O coherency, so pinned memory is uncached.
Without cache, the performance tends to be much slower than the pageable host memory.
You can find some information in our document: https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/index.html

To improve this, we recommend using unified memory instead.

...
// NVIDIA's suggested variant: allocate with unified (managed) memory
// instead of pinned memory, then time the same host-side memcpy loop.
// On TX2 pinned memory is CPU-uncached (no I/O coherency), while managed
// memory is CPU-cached, hence the much better throughput reported below.
void cuda_test(size_t cx, size_t cy, int times)
{
    char* src;
    char* dst;
    const size_t sz = cx*cy*4;  // 4 bytes per pixel (RGBA-sized frame)
    // Managed allocations: one pointer valid on both CPU and GPU.
    // NOTE(review): return codes are not checked here.
    cudaMallocManaged((void**)&src, sz);
    cudaMallocManaged((void**)&dst, sz);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < times; ++i) {
        memcpy(dst, src, sz);
        // Per-iteration synchronize -- presumably to keep the managed
        // buffers coherent between CPU and GPU on Tegra; note its cost is
        // included in the measured time. TODO confirm it is required when
        // no kernel has touched these buffers.
        cudaDeviceSynchronize();
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto ts = std::chrono::duration<double>(t2 - t1).count();
    printf("cuda takes:%f seconds, Avg speed:%8.3fM/s\n", ts, (double)sz*times/ts/1024/1024.);

    // Managed memory is released with cudaFree, not cudaFreeHost.
    cudaFree(src);
    cudaFree(dst);
}
...

We can get a much better performance with unified memory on the TX2.

$ ./test 1920 1080 10
cx = 1920, cy = 1080, times = 10
cuda takes:0.028232 seconds, Avg speed:2801.879M/s
stdm takes:0.023944 seconds, Avg speed:3303.550M/s

Thanks.

1 Like

Thanks very much. I got it, and I tested the unified memory, it’s okay to go.