I am running a memory swapping code on a workstation with 4 A100s, by default the code is running on the first A100. I try to specify the GPU to run the code but the program slows down significantly even if I just specify CUDA_VISIBLE_DEVICES=0, which should be the same as the default case. But when I am running with CUDA_VISIBLE_DEVICES=0,1,2,3 the performance is normal although only the first A100 is active.
Here is the topology
GPU0 GPU1 GPU2 GPU3 NIC0 NIC1 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X NV12 SYS SYS SYS SYS 0-63,128-191 0 N/A
GPU1 NV12 X SYS SYS SYS SYS 0-63,128-191 0 N/A
GPU2 SYS SYS X NV12 PHB PHB 64-127,192-255 1 N/A
GPU3 SYS SYS NV12 X NODE NODE 64-127,192-255 1 N/A
Is that related to the sharing of PCIE bandwidth?
Below is the test code I created,
#include <iostream>
#include <math.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include <cstdlib>
#include <chrono>
#include <assert.h>
// Dummy process function:
__global__ void process_data_chunk(float *d_data, size_t data_size){
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for(; tid < data_size; tid += gridDim.x * blockDim.x){
if(tid < data_size){
d_data[tid] *= 2;
}
}
}
int main(int argc, char **argv) {
// Input: oversubscription factor & chunk_size
size_t granularity = atoi(argv[1]);
size_t input_chunks = atoi(argv[2]);
size_t mf, ma;
cudaMemGetInfo(&mf, &ma);
std::cout << "Free: " << mf << " Total: " << ma << std::endl;
size_t max_total_float = (mf-500000000) / sizeof(float);
size_t chunk_size = max_total_float / granularity; // Per chunk size decided
// print the summary of profiling config
std::cout << "Profiling with config:\n"
<< "Device memory: " << mf/1024/1024 << " / " << ma/1024/1024 << " MB\n"
<< "Input chunks: " << input_chunks << " over " << granularity << " pages by " << input_chunks/granularity << " factor\n"
<< "Chunk size: " << chunk_size/1024/1024 << "M floats x " << input_chunks << " / " << max_total_float/1024/1024 << "M floats" << std::endl;
// Allocate host memory
float *h_data = new float[chunk_size * input_chunks];
size_t *table = new size_t[granularity];
std::fill_n(h_data, chunk_size * input_chunks, 1.5f); // Fill the array with 1
// Allocate device memory
float *d_data;
cudaError_t err = cudaMalloc(&d_data, granularity * chunk_size * sizeof(float)); // Allocate memory for one chunk at a time
if (err != cudaSuccess) {
std::cerr << "cudaMemAlloc failed: " << cudaGetErrorString(err) << std::endl;
return 1;
}
dim3 dimBlock(1024, 1, 1);
dim3 dimGrid(1024, 1, 1);
// assume we are done from the last cycle
for (size_t i = 0; i < granularity; ++i) {
size_t selected_page = (input_chunks - i - 1) % granularity;
table[selected_page] = (input_chunks - i - 1);
}
cudaDeviceSynchronize();
std::cout << "Sleeping for 3 seconds" << std::endl;
sleep(3);
auto start = std::chrono::high_resolution_clock::now();
for (size_t chunk = 0; chunk < input_chunks; ++chunk) {
size_t selected_page = chunk % granularity;
size_t evicted_chunk = table[selected_page];
std::cout << "Processing chunk: " << chunk << ", evicting chunk " << evicted_chunk << std::endl;
// Copy previous chunk back from device to host memory
cudaMemcpy(h_data + evicted_chunk * chunk_size, d_data+selected_page*chunk_size, chunk_size * sizeof(float), cudaMemcpyDeviceToHost);
// Copy current chunk from host to device memory
cudaMemcpy(d_data+selected_page*chunk_size, h_data + chunk * chunk_size, chunk_size * sizeof(float), cudaMemcpyHostToDevice);
// Check for errors in kernel launch
err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "Memcpy failed: " << cudaGetErrorString(err) << std::endl;
return 1;
}
table[selected_page] = chunk;
// Process current chunk on the device
process_data_chunk<<<dimGrid, dimBlock>>>(d_data+selected_page*chunk_size, chunk_size);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "Elapsed time: " << elapsed.count() << " seconds\n";
// Free the allocated device memory
cudaFree(d_data);
// Free host memory
delete[] h_data;
delete[] table;
return 0;
}
Results:
$ nvcc mvp.cu -o mvp
# set visible device
$ CUDA_VISIBLE_DEVICES=0 ./mvp 4 8
Free: 84544192512 Total: 84987740160
Profiling with config:
Device memory: 80627 / 81050 MB
Input chunks: 8 over 4 pages by 2 factor
Chunk size: 5009M floats x 8 / 20037M floats
Sleeping for 3 seconds
Processing chunk: 0, evicting chunk 4
Processing chunk: 1, evicting chunk 5
Processing chunk: 2, evicting chunk 6
Processing chunk: 3, evicting chunk 7
Processing chunk: 4, evicting chunk 0
Processing chunk: 5, evicting chunk 1
Processing chunk: 6, evicting chunk 2
Processing chunk: 7, evicting chunk 3
Elapsed time: 57.4109 seconds
# use default device
$ ./mvp 4 8
Free: 84544192512 Total: 84987740160
Profiling with config:
Device memory: 80627 / 81050 MB
Input chunks: 8 over 4 pages by 2 factor
Chunk size: 5009M floats x 8 / 20037M floats
Sleeping for 3 seconds
Processing chunk: 0, evicting chunk 4
Processing chunk: 1, evicting chunk 5
Processing chunk: 2, evicting chunk 6
Processing chunk: 3, evicting chunk 7
Processing chunk: 4, evicting chunk 0
Processing chunk: 5, evicting chunk 1
Processing chunk: 6, evicting chunk 2
Processing chunk: 7, evicting chunk 3
Elapsed time: 43.3565 seconds
I also created a UVM version of this code which is not affected by specifying the visible device.