I have observed that GPU access performance under UVM (unified/managed memory) varies significantly depending on whether a buffer was first populated on the CPU.
In my test, I allocated two 39GiB managed buffers on a 40GB A100, so the two buffers together oversubscribe device memory. I placed buffer1 on the CPU and buffer2 on the GPU, then launched a GPU kernel that accesses buffer1. The kernel took 13 seconds to complete. However, when I populated buffer2 on the CPU first and only then moved it to the GPU, the same buffer1 access took 6 seconds.
I am unsure why this discrepancy occurs.
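For context, UVM oversubscription and concurrent CPU/GPU access rely on hardware page faulting, which the runtime exposes through the concurrentManagedAccess attribute; a minimal check (standard CUDA runtime API, device 0 assumed) looks like:

    int concurrentManagedAccess = 0;
    cudaDeviceGetAttribute(&concurrentManagedAccess, cudaDevAttrConcurrentManagedAccess, 0);
    // Expect 1 on Pascal-or-newer Linux systems such as the A100 used here.

The full reproducer: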
#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>
// Error-check a CUDA runtime call. Note: the return makes this usable only inside main().
#define CUDA_CALL(func) { \
    cudaError_t status = func; \
    if (status != cudaSuccess) { \
        std::cerr << "CUDA API failed with error: " << cudaGetErrorString(status) << std::endl; \
        return 1; \
    } \
}
// Time a statement with CUDA events (and destroy the events to avoid leaking them).
#define CALCULATE_TIME(func) { \
    cudaEvent_t start, stop; \
    cudaEventCreate(&start); \
    cudaEventCreate(&stop); \
    cudaEventRecord(start); \
    func; \
    cudaEventRecord(stop); \
    cudaEventSynchronize(stop); \
    float milliseconds = 0; \
    cudaEventElapsedTime(&milliseconds, start, stop); \
    std::cout << "Touch buffer time: " << milliseconds << "ms" << std::endl; \
    cudaEventDestroy(start); \
    cudaEventDestroy(stop); \
}
// Grid-stride kernel that touches every element of the buffer from the GPU.
__global__ void populate_on_gpu(void* buffer, size_t size) {
    size_t* buffer_ptr = (size_t*)buffer;
    size_t buffer_size = size / sizeof(size_t);
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    size_t stride = blockDim.x * gridDim.x;
    for (size_t i = tid; i < buffer_size; i += stride) {
        buffer_ptr[i] = i;
    }
}
// Touch every element of the buffer from the CPU, migrating its pages to host memory.
void populate_on_cpu(void* buffer, size_t size) {
    size_t* buffer_ptr = (size_t*)buffer;
    size_t buffer_size = size / sizeof(size_t);
    for (size_t i = 0; i < buffer_size; i++) {
        buffer_ptr[i] = i;
    }
}
int main() {
    // Select the GPU device to use.
    CUDA_CALL(cudaSetDevice(0));
    int deviceId;
    CUDA_CALL(cudaGetDevice(&deviceId));
    std::cout << "GPU Device ID: " << deviceId << std::endl;
    // Buffer size in bytes: 39GiB, so a single buffer nearly fills the 40GB A100.
    size_t bufferSize = 39ULL * 1024 * 1024 * 1024;
    // Allocate the first 39GiB managed buffer.
    void* gpuBuffer1;
    CUDA_CALL(cudaMallocManaged(&gpuBuffer1, bufferSize, cudaMemAttachGlobal));
    // Populate buffer1 on the GPU ...
    populate_on_gpu<<<4096, 512>>>(gpuBuffer1, bufferSize);
    CUDA_CALL(cudaDeviceSynchronize()); // finish the kernel before touching from the CPU
    // ... then touch it on the CPU so its pages end up resident in host memory.
    populate_on_cpu(gpuBuffer1, bufferSize);
    // Allocate the second 39GiB managed buffer.
    void* gpuBuffer2;
    CUDA_CALL(cudaMallocManaged(&gpuBuffer2, bufferSize, cudaMemAttachGlobal));
    // Experiment toggle: populating buffer2 on the CPU first (uncomment the next
    // line) reduces the timed buffer1 access below from ~13s to ~6s.
    // populate_on_cpu(gpuBuffer2, bufferSize);
    // Populate buffer2 on the GPU so its pages end up resident in device memory.
    populate_on_gpu<<<4096, 512>>>(gpuBuffer2, bufferSize);
    CUDA_CALL(cudaDeviceSynchronize());
    // Time GPU access to buffer1, whose pages are currently resident on the CPU.
    std::cout << "First touching buffer1: ";
    CALCULATE_TIME((populate_on_gpu<<<4096, 512>>>(gpuBuffer1, bufferSize)));
    std::cout << "Second touching buffer1: ";
    CALCULATE_TIME((populate_on_gpu<<<4096, 512>>>(gpuBuffer1, bufferSize)));
    // Free the allocated memory.
    cudaFree(gpuBuffer1);
    cudaFree(gpuBuffer2);
    return 0;
}
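As an aside, residency can also be set explicitly with cudaMemPrefetchAsync rather than through first-touch kernels; a minimal sketch of the intended placement (reusing bufferSize, deviceId, and the buffer names from the reproducer above, inside main):

    // Explicitly place buffer1 in host memory and buffer2 in device memory.
    CUDA_CALL(cudaMemPrefetchAsync(gpuBuffer1, bufferSize, cudaCpuDeviceId, 0));
    CUDA_CALL(cudaMemPrefetchAsync(gpuBuffer2, bufferSize, deviceId, 0));
    CUDA_CALL(cudaDeviceSynchronize());

This takes the first-touch kernels out of the picture when reproducing the placement, though I have not verified whether it changes the timings.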