Is it correct that using thrust::device_vector on the Nano takes less time than using managed memory?

I use three ways to allocate the memory for the kernel: cudaMalloc, cudaMallocManaged, and thrust::device_vector.

On the Nano, the CPU and GPU share the same physical memory. In theory, cudaMallocManaged requires no data transfer, so it should take the shortest time. However, my test shows that cudaMallocManaged takes the longest. I want to measure the total time to allocate the data, transfer it, and run the computation.

#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int rows = 1000;

__global__ void cudatest1D(int* hd_dev, int label, const int count){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < count){
     hd_dev[i] = label;
  }
}

__global__ void cudatestManaged2D(int** hd_2D, int label, const int rows, const int cols){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockDim.y * blockIdx.y + threadIdx.y;
  if(i < rows && j < cols){
     hd_2D[i][j]= label;
  }
}

void testManaged(){
  printf("***********testManaged************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);  // integer division already rounds up

  auto start2 = std::chrono::steady_clock::now();
  int *hd_dev;
  cudaMallocManaged(&hd_dev, count * sizeof(int));
 
  for (int i = 0; i < count; i++) {
    hd_dev[i] = -1;
  }
  cudatest1D <<<dimGrid, dimBlock>>>(hd_dev, label, count);
  cudaDeviceSynchronize();
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testManaged: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");

  cudaFree(hd_dev);
}

void testThrust(){
  printf("***********testThrust************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  thrust::host_vector<int> h_vec(rows, -1);
  thrust::device_vector<int> d_vec = h_vec;
  int *ptr_d_vec = thrust::raw_pointer_cast(d_vec.data());
 
  cudatest1D <<<dimGrid, dimBlock>>>(ptr_d_vec, label, count);
  thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testThrust: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");
}

void testMalloc(){
  printf("***********testMalloc************\n");
  // initial data
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  int *h_dev = (int *)malloc(sizeof(int) * count);
  for (int i = 0; i < count; i++) {
    h_dev[i] = -1;
  }
  int *d_dev;
  cudaMalloc((void**)&d_dev, sizeof(int) * count);
  cudaMemcpy(d_dev, h_dev, count * sizeof(int), cudaMemcpyHostToDevice);
 
  cudatest1D <<<dimGrid, dimBlock>>>(d_dev, label, count);
  cudaMemcpy(h_dev, d_dev, count * sizeof(int), cudaMemcpyDeviceToHost);
  auto end2 = std::chrono::steady_clock::now();
  printf("-----------result---------------\n");
  printf("testMalloc: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
  printf("--------------------------------\n");

  free(h_dev);
  cudaFree(d_dev);
}


int main()
{
  printf("rows: %d\n", rows);
  testManaged();
  testThrust();
  testMalloc();
}

The compile command is

nvcc --gpu-architecture=compute_53 --gpu-code=sm_53 -o managedVSthrust main.cu

The result on Jetson Nano is:

rows: 1000
***********testManaged************
-----------result---------------
testManaged: 0.091949 s, 
--------------------------------
***********testThrust************
-----------result---------------
testThrust: 0.001158 s, 
--------------------------------
***********testMalloc************
-----------result---------------
testMalloc: 0.001002 s, 
--------------------------------

Hi,

Have you maximized the device performance first?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

Although the CPU and GPU share the same physical memory, they each have their own caches, which can affect performance.
You can find some details about memory on Jetson in the below document:
https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/index.html#memory-management
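
For comparison, the Tegra app note above also covers mapped pinned (zero-copy) memory, which gives the CPU and GPU direct access to the same buffer on an integrated-memory board like the Nano. Below is only a rough sketch of a fourth test, not part of your code: the function name testPinned is made up, and it reuses the cudatest1D kernel, the rows global, and the headers from your listing. It assumes a 64-bit Jetson with unified virtual addressing, so the device-visible pointer can be obtained with cudaHostGetDevicePointer.

// Rough sketch only; assumes cudatest1D, rows, and the includes from the code above.
void testPinned(){
  printf("***********testPinned************\n");
  int count = rows;
  int label = 3;
  dim3 dimBlock(32, 1);
  dim3 dimGrid((count + dimBlock.x - 1) / dimBlock.x, 1);

  auto start2 = std::chrono::steady_clock::now();
  int *h_pin;
  // Mapped pinned memory: a host buffer the GPU can access directly, so no explicit cudaMemcpy.
  cudaHostAlloc((void**)&h_pin, count * sizeof(int), cudaHostAllocMapped);
  for (int i = 0; i < count; i++) {
    h_pin[i] = -1;
  }
  int *d_pin;
  cudaHostGetDevicePointer((void**)&d_pin, h_pin, 0);  // device-visible alias of the same buffer

  cudatest1D <<<dimGrid, dimBlock>>>(d_pin, label, count);
  cudaDeviceSynchronize();  // after this, the result is already visible on the host
  auto end2 = std::chrono::steady_clock::now();
  printf("testPinned: %f s\n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());

  cudaFreeHost(h_pin);
}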

Thanks.

Hi AastaLLL,
Thank you very much for your advice!
Your advice works. However, if the kernel is executed only once, cudaMallocManaged is slower than Thrust; after multiple executions, UVM becomes faster than Thrust. Is this because of the cache?

Thanks.

Hi,

The underlying mechanism is different.

Thrust uses the same address for the CPU and GPU.
Unified memory uses different addresses for the processors and lets the GPU driver handle the synchronization underneath.

For just a single access, the synchronization overhead is higher than the performance gain from unified memory.
But it improves with multiple accesses, since unified memory tends to place the buffer so that both the CPU and GPU can access it quickly.
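
To see this effect, here is a minimal, self-contained sketch (not from the discussion above; the kernel name fillKernel is made up) that launches a kernel several times on one cudaMallocManaged buffer and times each launch separately. The first iteration typically carries the managed-memory setup/first-touch overhead, while later iterations should be much cheaper. It can be compiled with the same nvcc command as above.

#include <cstdio>
#include <chrono>
#include <cuda_runtime.h>

__global__ void fillKernel(int* data, int label, const int count){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < count){
    data[i] = label;
  }
}

int main(){
  const int count = 1000;
  int *data;
  cudaMallocManaged(&data, count * sizeof(int));
  for (int i = 0; i < count; i++) data[i] = -1;  // first touch on the CPU

  dim3 block(32);
  dim3 grid((count + block.x - 1) / block.x);

  // Time each launch separately: iteration 0 includes the managed-memory setup cost,
  // later iterations show the steady-state cost.
  for (int iter = 0; iter < 10; iter++) {
    auto t0 = std::chrono::steady_clock::now();
    fillKernel<<<grid, block>>>(data, iter, count);
    cudaDeviceSynchronize();
    auto t1 = std::chrono::steady_clock::now();
    printf("iteration %d: %f s\n", iter,
           std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count());
  }

  cudaFree(data);
  return 0;
}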

Thanks.