Is it correct that using thrust::device_vector on the Nano takes less time than using managed memory?

I use three ways to allocate the memory for the kernel: cudaMalloc, cudaMallocManaged, and thrust::device_vector.

On the Nano, the CPU and GPU share the same physical memory. In theory, cudaMallocManaged requires no data transfer, so it should take the shortest time. However, my test shows that cudaMallocManaged takes the longest. I want to measure the total time to allocate the data, transfer it, and run the computation.

#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int rows = 1000;

__global__ void cudatest1D(int* hd_dev, int label, const int count){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < count){
     hd_dev[i] = label;
  }
}

__global__ void cudatestManaged2D(int** hd_2D, int label, const int rows, const int cols){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockDim.y * blockIdx.y + threadIdx.y;
  if(i < rows && j < cols){
     hd_2D[i][j]= label;
  }
}

void testManaged(){
  printf("***********testManaged************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);  // integer division already rounds up

  auto start2 = std::chrono::steady_clock::now();
  int *hd_dev;
  cudaMallocManaged(&hd_dev, count * sizeof(int));
 
  for (int i = 0; i < count; i++) {
    hd_dev[i] = -1;
  }
  cudatest1D <<<dimGrid, dimBlock>>>(hd_dev, label, count);
  cudaDeviceSynchronize();
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testManaged: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");

  cudaFree(hd_dev);
}

void testThrust(){
  printf("***********testThrust************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  thrust::host_vector<int> h_vec(rows, -1);
  thrust::device_vector<int> d_vec = h_vec;
  int *ptr_d_vec = thrust::raw_pointer_cast(d_vec.data());
 
  cudatest1D <<<dimGrid, dimBlock>>>(ptr_d_vec, label, count);
  thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testThrust: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");
}

void testMalloc(){
  printf("***********testMalloc************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  int *h_dev = (int *)malloc(sizeof(int) * count);
  for (int i = 0; i < count; i++) {
    h_dev[i] = -1;
  }
  int *d_dev;
  cudaMalloc((void**)&d_dev, sizeof(int) * count);
  cudaMemcpy(d_dev, h_dev, count * sizeof(int), cudaMemcpyHostToDevice);
 
  cudatest1D <<<dimGrid, dimBlock>>>(d_dev, label, count);
  cudaMemcpy(h_dev, d_dev, count * sizeof(int), cudaMemcpyDeviceToHost);
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testMalloc: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");

  free(h_dev);
  cudaFree(d_dev);
}


int main()
{
  printf("rows: %d\n", rows);
  testManaged();
  testThrust();
  testMalloc();
}

The compile command is

nvcc --gpu-architecture=compute_53 --gpu-code=sm_53 -o managedVSthrust main.cu

The result on Jetson Nano is:

rows: 1000
***********testManaged************
-----------result---------------
testManaged: 0.091949 s, 
--------------------------------
***********testThrust************
-----------result---------------
testThrust: 0.001158 s, 
--------------------------------
***********testMalloc************
-----------result---------------
testMalloc: 0.001002 s, 
--------------------------------

Hi,

Have you maximized the device performance first?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

Although the CPU and GPU share the same physical memory, they each have their own caches, which can affect performance.
You can find some details about memory on Jetson in the below document:
https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/index.html#memory-management
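
For comparison, the Tegra app note above also covers mapped pinned (zero-copy) memory, which gives the CPU and GPU direct access to the same buffer on an integrated-memory board like the Nano. Below is only a rough sketch of a fourth test, not part of your code: the function name testPinned is made up, and it reuses the cudatest1D kernel, the rows global, and the headers from your listing. It assumes a 64-bit Jetson with unified virtual addressing, so the device-visible pointer can be obtained with cudaHostGetDevicePointer.

// Rough sketch only; assumes cudatest1D, rows, and the includes from the code above.
void testPinned(){
  printf("***********testPinned************\n");
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((count + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  int *h_pin;
  // Mapped pinned memory: a host buffer the GPU can access directly, so no explicit cudaMemcpy.
  cudaHostAlloc((void**)&h_pin, count * sizeof(int), cudaHostAllocMapped);
  for (int i = 0; i < count; i++) {
    h_pin[i] = -1;
  }
  int *d_pin;
  cudaHostGetDevicePointer((void**)&d_pin, h_pin, 0);  // device-visible alias of the same buffer

  cudatest1D <<<dimGrid, dimBlock>>>(d_pin, label, count);
  cudaDeviceSynchronize();  // after this, the result is already visible on the host
  auto end2 = std::chrono::steady_clock::now();
  printf("testPinned: %f s\n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());

  cudaFreeHost(h_pin);
}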

Thanks.

Hi AastaLLL,
Thank you very much for your advice!
Your advice works. However, if the kernel is executed only once, cudaMallocManaged is slower than Thrust; after multiple executions, UVM becomes faster than Thrust. Is this because of the cache?

Thanks.

Hi,

The underlying mechanism is different.

Thrust uses the same address for the CPU and GPU.
Unified memory uses different addresses for the processors and lets the GPU driver handle the synchronization underneath.

For just a single access, the synchronization overhead is higher than the performance gain from unified memory.
But it improves with multiple accesses, since unified memory tends to place the buffer so that both the CPU and GPU can access it quickly.
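
To see this effect, here is a minimal, self-contained sketch (not from the discussion above; the kernel name fillKernel is made up) that launches a kernel several times on one cudaMallocManaged buffer and times each launch separately. The first iteration typically carries the managed-memory setup/first-touch overhead, while later iterations should be much cheaper. It can be compiled with the same nvcc command as above.

#include <cstdio>
#include <chrono>
#include <cuda_runtime.h>

__global__ void fillKernel(int* data, int label, const int count){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < count){
    data[i] = label;
  }
}

int main(){
  const int count = 1000;
  int *data;
  cudaMallocManaged(&data, count * sizeof(int));
  for (int i = 0; i < count; i++) data[i] = -1;  // first touch on the CPU

  dim3 block(32);
  dim3 grid((count + block.x - 1) / block.x);

  // Time each launch separately: iteration 0 includes the managed-memory setup cost,
  // later iterations show the steady-state cost.
  for (int iter = 0; iter < 10; iter++) {
    auto t0 = std::chrono::steady_clock::now();
    fillKernel<<<grid, block>>>(data, iter, count);
    cudaDeviceSynchronize();
    auto t1 = std::chrono::steady_clock::now();
    printf("iteration %d: %f s\n", iter,
           std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count());
  }

  cudaFree(data);
  return 0;
}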

Thanks.