I implemented the kernel with three memory-management approaches: cudaMalloc, cudaMallocManaged, and thrust::device_vector.
On the Jetson Nano, the CPU and GPU share the same physical memory. In theory, cudaMallocManaged should need no explicit data transfer, so it should take the shortest time. However, my test shows that cudaMallocManaged takes the longest. I want to measure the total time taken to allocate the data, transfer it, and run the computation.
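One note before the listing: none of the CUDA calls below check their return values, so a failed allocation or launch would go unnoticed and distort the timings. A minimal error-checking wrapper could look like this (a sketch of a common idiom, not part of the original test; the CUDA_CHECK name is my own):

#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)  // needs <cstdlib> for exit()

// Usage: CUDA_CHECK(cudaMallocManaged(&hd_dev, count * sizeof(int)));
// After a kernel launch: CUDA_CHECK(cudaGetLastError());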
#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
int rows = 1000;
// Write `label` into every element of a 1-D array of length `count`.
__global__ void cudatest1D(int* hd_dev, int label, const int count){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < count){
        hd_dev[i] = label;
    }
}
// 2-D variant of the same kernel (defined here but not called in the tests).
__global__ void cudatestManaged2D(int** hd_2D, int label, const int rows, const int cols){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    if(i < rows && j < cols){
        hd_2D[i][j] = label;
    }
}
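// Since cudatestManaged2D is never launched below, here is a minimal sketch of
// how a managed int** it could consume might be allocated (illustration only;
// the helper name and the cols parameter are my own):
int** allocManaged2D(int rows, int cols){
    int **hd_2D;
    cudaMallocManaged(&hd_2D, rows * sizeof(int*));        // row-pointer array
    for (int i = 0; i < rows; i++) {
        cudaMallocManaged(&hd_2D[i], cols * sizeof(int));  // one managed row per entry
    }
    return hd_2D;
}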
void testManaged(){
    printf("***********testManaged************\n");
    // initial data
    int count = rows;
    int label = 3;
    dim3 dimBlock(32, 1);
    dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1); // integer round-up; wrapping it in ceil() was redundant
    auto start2 = std::chrono::steady_clock::now();
    int *hd_dev;
    cudaMallocManaged(&hd_dev, count * sizeof(int));
    for (int i = 0; i < count; i++) {
        hd_dev[i] = -1;
    }
    cudatest1D <<<dimGrid, dimBlock>>>(hd_dev, label, count);
    cudaDeviceSynchronize(); // with managed memory, wait for the kernel before stopping the clock
    auto end2 = std::chrono::steady_clock::now();
    printf("-----------result---------------\n");
    printf("testManaged: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
    printf("--------------------------------\n");
    cudaFree(hd_dev);
}
void testThrust(){
    printf("***********testThrust************\n");
    // initial data
    int count = rows;
    int label = 3;
    dim3 dimBlock(32, 1);
    dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);
    auto start2 = std::chrono::steady_clock::now();
    thrust::host_vector<int> h_vec(rows, -1);
    thrust::device_vector<int> d_vec = h_vec; // host-to-device copy
    int *ptr_d_vec = thrust::raw_pointer_cast(d_vec.data());
    cudatest1D <<<dimGrid, dimBlock>>>(ptr_d_vec, label, count);
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); // device-to-host copy; also synchronizes with the kernel
    auto end2 = std::chrono::steady_clock::now();
    printf("-----------result---------------\n");
    printf("testThrust: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
    printf("--------------------------------\n");
}
void testMalloc(){
    printf("***********testMalloc************\n");
    // initial data
    int count = rows;
    int label = 3;
    dim3 dimBlock(32, 1);
    dim3 dimGrid((rows + dimBlock.x - 1) / dimBlock.x, 1);
    auto start2 = std::chrono::steady_clock::now();
    int *h_dev = (int *)malloc(sizeof(int) * count);
    for (int i = 0; i < count; i++) {
        h_dev[i] = -1;
    }
    int *d_dev;
    cudaMalloc((void**)&d_dev, sizeof(int) * count);
    cudaMemcpy(d_dev, h_dev, count * sizeof(int), cudaMemcpyHostToDevice);
    cudatest1D <<<dimGrid, dimBlock>>>(d_dev, label, count);
    cudaMemcpy(h_dev, d_dev, count * sizeof(int), cudaMemcpyDeviceToHost); // blocking copy; also synchronizes with the kernel
    auto end2 = std::chrono::steady_clock::now();
    printf("-----------result---------------\n");
    printf("testMalloc: %f s, \n", std::chrono::duration_cast<std::chrono::duration<double>>(end2 - start2).count());
    printf("--------------------------------\n");
    cudaFree(d_dev);
    free(h_dev); // the host buffer was never freed in the original
}
int main()
{
    printf("rows: %d\n", rows);
    testManaged();
    testThrust();
    testMalloc();
    return 0;
}
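One caveat about this measurement: whichever test runs first also absorbs the one-time cost of creating the CUDA context, and here testManaged runs first. A minimal sketch of warming up before timing (cudaFree(0) is a common idiom for forcing context creation; reordering or repeating the tests would serve the same purpose):

int main()
{
    cudaFree(0); // force one-time CUDA context creation outside the timed regions
    printf("rows: %d\n", rows);
    testManaged();
    testThrust();
    testMalloc();
    return 0;
}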
The compile command is:
nvcc --gpu-architecture=compute_53 --gpu-code=sm_53 -o managedVSthrust main.cu
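As an aside, nvcc's -arch shorthand should produce a comparable binary (it additionally embeds compute_53 PTX alongside the sm_53 SASS):

nvcc -arch=sm_53 -o managedVSthrust main.cu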
The output on the Jetson Nano is:
rows: 1000
***********testManaged************
-----------result---------------
testManaged: 0.091949 s,
--------------------------------
***********testThrust************
-----------result---------------
testThrust: 0.001158 s,
--------------------------------
***********testMalloc************
-----------result---------------
testMalloc: 0.001002 s,
--------------------------------