Recently I have been studying Unified Memory on Jetson and reviewed the following materials:
https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/index.html#memory-management
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-coherency-hd
https://forums.developer.nvidia.com/t/memory-type-quesions/250802
Based on what I read, I wrote the following test code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <chrono>
void randFill(int *p, int *q, int hp, int wp) {
    for (int i = 0; i < hp * wp; i++) {
        p[i] = 1;
    }
    for (int i = 0; i < wp; i++) {
        q[i] = 1;
    }
}
void MatrixVectorMul(int hp, int wp, int num_tests) {
    int *p, *q, *r;
    size_t sizeP = (size_t)hp * wp;
    size_t sizeQ = wp;
    size_t sizeR = hp;
    // cudaMemAttachHost: as I understand the Tegra appnote, the allocation starts
    // host-attached, so the CPU can initialize it without GPU-side synchronization.
    cudaMallocManaged(&p, sizeP * sizeof(int), cudaMemAttachHost);
    cudaMallocManaged(&q, sizeQ * sizeof(int), cudaMemAttachHost);
    cudaMallocManaged(&r, sizeR * sizeof(int));
    randFill(p, q, hp, wp);
    cublasHandle_t handle;
    cublasCreate(&handle);  // no cublasSetStream call, so cuBLAS uses the default (NULL) stream
    float *fp, *fq, *fr;
    cudaMallocManaged(&fp, sizeP * sizeof(float), cudaMemAttachHost);
    cudaMallocManaged(&fq, sizeQ * sizeof(float), cudaMemAttachHost);
    cudaMallocManaged(&fr, sizeR * sizeof(float), cudaMemAttachHost);
    for (int i = 0; i < hp * wp; i++) {
        fp[i] = static_cast<float>(p[i]);
    }
    for (int i = 0; i < wp; i++) {
        fq[i] = static_cast<float>(q[i]);
    }

    // Used to record the total time for multiple tests
    float total_gpu_time = 0.0f;
    float total_cpu_time = 0.0f;

    for (int test = 0; test < num_tests; test++) {
        float alpha = 1.0f;
        float beta = 0.0f;
        //> Attach fp, fq, fr to the GPU (global scope) before the cuBLAS call
        cudaStreamAttachMemAsync(NULL, fp, sizeP * sizeof(float), cudaMemAttachGlobal);
        cudaStreamAttachMemAsync(NULL, fq, sizeQ * sizeof(float), cudaMemAttachGlobal);
        cudaStreamAttachMemAsync(NULL, fr, sizeR * sizeof(float), cudaMemAttachGlobal);
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        cublasSgemv(handle, CUBLAS_OP_N, hp, wp, &alpha, fp, hp, fq, 1, &beta, fr, 1);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float gpu_time = 0;
        cudaEventElapsedTime(&gpu_time, start, stop);
        total_gpu_time += gpu_time;
        //> Attach 'fr' back to the host so the CPU can read the result
        cudaStreamAttachMemAsync(NULL, fr, 0, cudaMemAttachHost);
        cudaStreamSynchronize(NULL);
        auto cpu_start = std::chrono::high_resolution_clock::now();
        //> do some computation on the CPU side: read the result vector
        float sum = 0;
        for (int i = 0; i < hp; i++) {
            sum += fr[i];
        }
        // for (int i = 0; i < sizeP; i++) {
        //     sum += fp[i];
        // }
        // for (int i = 0; i < sizeQ; i++) {
        //     sum += fq[i];
        // }
        // printf("%f ", sum);
        auto cpu_end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<float, std::milli> cpu_time = cpu_end - cpu_start;
        total_cpu_time += cpu_time.count();
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    printf("Average CPU Execution Time: %.3f ms\n", total_cpu_time / num_tests);
    printf("Average GPU Execution Time (cuBLAS): %.3f ms\n", total_gpu_time / num_tests);

    cudaFree(p);
    cudaFree(q);
    cudaFree(r);
    cudaFree(fp);
    cudaFree(fq);
    cudaFree(fr);
    cublasDestroy(handle);
}
int main() {
    int hp = 3200;        // Number of rows in the matrix
    int wp = 3200;        // Number of columns in the matrix (i.e., vector size)
    int num_tests = 1000;
    MatrixVectorMul(hp, wp, num_tests);
    return 0;
}
I compiled it with the command below and ran the test on an AGX Orin Developer Kit (64 GB version).
nvcc -o matrixVectorMul_cublas tegra_mem_cublas.cu -lcublas
However, I obtained results that were the opposite of what I expected.
When I use cudaStreamAttachMemAsync:
When I comment out cudaStreamAttachMemAsync:
From the results above, using cudaStreamAttachMemAsync actually yielded worse performance, which was quite unexpected. I then profiled both cases with nsys and obtained the following results.
When I use cudaStreamAttachMemAsync:
When I comment out cudaStreamAttachMemAsync:
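In case the device capabilities matter for interpreting this, I also put together the small snippet below from the programming guide to report the Unified-Memory-related device attributes on the Orin (just a sketch; I am not sure these are the attributes that matter here):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int dev = 0;
    int concurrent = 0, pageable = 0, directFromHost = 0;
    // Attributes the programming guide mentions in connection with Unified Memory
    cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);
    cudaDeviceGetAttribute(&pageable, cudaDevAttrPageableMemoryAccess, dev);
    cudaDeviceGetAttribute(&directFromHost, cudaDevAttrDirectManagedMemAccessFromHost, dev);
    printf("concurrentManagedAccess        = %d\n", concurrent);
    printf("pageableMemoryAccess           = %d\n", pageable);
    printf("directManagedMemAccessFromHost = %d\n", directFromHost);
    return 0;
}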
Now I am very confused:
- How exactly does cudaStreamAttachMemAsync prefetch data? Specifically, which part of it should show up in the nsys output?
- How should I test memory performance on the AGX Orin? Could there be errors in my code? Could you provide a very standard example? (My current understanding of the pattern is sketched below.)
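This is the minimal attach/compute/read pattern I think the Tegra application note describes, stripped down to a trivial kernel instead of cuBLAS (scale_kernel is just a name I made up for this sketch); please correct me if this is not how cudaStreamAttachMemAsync is meant to be used:

#include <cstdio>
#include <cuda_runtime.h>

// Trivial kernel, used only so the GPU touches the managed buffer.
__global__ void scale_kernel(float *data, int n, float factor) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] *= factor;
    }
}

int main() {
    const int n = 1 << 20;
    float *buf = nullptr;

    // Allocate managed memory, initially host-attached so the CPU can fill it
    // without synchronizing with the GPU.
    cudaMallocManaged(&buf, n * sizeof(float), cudaMemAttachHost);
    for (int i = 0; i < n; i++) {
        buf[i] = 1.0f;
    }

    // Attach the whole buffer (length 0) to global scope before GPU work.
    cudaStreamAttachMemAsync(NULL, buf, 0, cudaMemAttachGlobal);
    scale_kernel<<<(n + 255) / 256, 256>>>(buf, n, 2.0f);

    // Attach it back to the host and synchronize before the CPU reads it.
    cudaStreamAttachMemAsync(NULL, buf, 0, cudaMemAttachHost);
    cudaStreamSynchronize(NULL);

    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += buf[i];
    }
    printf("sum = %.1f (expected %.1f)\n", sum, 2.0 * n);

    cudaFree(buf);
    return 0;
}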