`nvidia-smi` looks OK:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.51.03 Driver Version: 575.51.03 CUDA Version: 12.9 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 5090 Off | 00000000:01:00.0 Off | N/A |
| 0% 55C P8 16W / 600W | 39MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 5090 Off | 00000000:04:00.0 Off | N/A |
| 0% 52C P8 19W / 600W | 18MiB / 32607MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 2191 G /usr/bin/gnome-shell 10MiB |
| 0 N/A N/A 2242 G /usr/bin/Xwayland 8MiB |
| 1 N/A N/A 2191 G /usr/bin/gnome-shell 6MiB |
+-----------------------------------------------------------------------------------------+
Simple CUDA app (simple_multi_gpu_test.cu):
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Element-wise vector addition: C[k] = A[k] + B[k] for every k in [0, N).
// Indexing uses only the x dimension of the grid and block, so a 1D launch
// with at least N threads in total is expected; threads whose global index
// falls at or beyond N exit without touching memory.
__global__ void vectorAdd(const float *A, const float *B, float *C, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) {
        return;  // tail threads of the last block have no element to process
    }
    C[idx] = A[idx] + B[idx];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise, so callers can
// write `if (!cudaCallOk(...)) break;`.
static bool cudaCallOk(cudaError_t status, const char *what, int dev) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "Device %d: %s failed with error %d: %s\n",
            dev, what, (int)status, cudaGetErrorString(status));
    return false;
}

// Runs one vector-add round trip on device `dev`: fills two host arrays of N
// floats, copies them to the device, launches vectorAdd, copies the result
// back and compares it with a host-computed reference.
// Returns true only if every CUDA call succeeded and all N results matched.
// All host and device buffers are released on every exit path.
static bool runVectorAddOnDevice(int dev, int N) {
    const size_t size = (size_t)N * sizeof(float);
    float *h_A = nullptr, *h_B = nullptr, *h_C = nullptr;
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    bool passed = false;

    // Single-pass block: `break` jumps to the shared cleanup code below.
    do {
        if (!cudaCallOk(cudaSetDevice(dev), "cudaSetDevice", dev)) break;
        printf("\n=== Running on Device %d ===\n", dev);

        cudaDeviceProp prop;
        if (!cudaCallOk(cudaGetDeviceProperties(&prop, dev),
                        "cudaGetDeviceProperties", dev)) break;
        printf("Device Name: %s\n", prop.name);

        // Allocate host memory
        h_A = (float *)malloc(size);
        h_B = (float *)malloc(size);
        h_C = (float *)malloc(size);
        if (h_A == nullptr || h_B == nullptr || h_C == nullptr) {
            fprintf(stderr, "Device %d: host allocation of %zu bytes failed\n",
                    dev, size);
            break;
        }
        for (int i = 0; i < N; ++i) {
            h_A[i] = (float)i;
            h_B[i] = (float)(i * 2);
        }

        // Allocate device memory
        if (!cudaCallOk(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)", dev)) break;
        if (!cudaCallOk(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)", dev)) break;
        if (!cudaCallOk(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)", dev)) break;

        // Copy data to device
        if (!cudaCallOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(d_A <- h_A)", dev)) break;
        if (!cudaCallOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(d_B <- h_B)", dev)) break;

        // Launch kernel: one thread per element, rounded up to whole blocks.
        const int threadsPerBlock = 256;
        const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // A launch does not return a status; launch failures surface through
        // cudaGetLastError and execution failures through the synchronize.
        if (!cudaCallOk(cudaGetLastError(), "vectorAdd launch", dev)) break;
        if (!cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize", dev)) break;

        // Copy result back to host
        if (!cudaCallOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(h_C <- d_C)", dev)) break;

        // Validate results against the host reference; print at most 10 mismatches.
        int errorCount = 0;
        for (int i = 0; i < N; ++i) {
            const float expected = h_A[i] + h_B[i];
            if (fabsf(h_C[i] - expected) > 1e-5f) {
                if (errorCount < 10) {
                    printf("Mismatch at index %d: %f (got) vs %f (expected)\n",
                           i, h_C[i], expected);
                }
                errorCount++;
            }
        }
        if (errorCount == 0) {
            printf("Result = PASS on device %d\n", dev);
            passed = true;
        } else {
            printf("Result = FAIL on device %d (errors: %d)\n", dev, errorCount);
        }
    } while (0);

    // Cleanup: only buffers that were actually allocated are released.
    float *deviceBuffers[3] = {d_A, d_B, d_C};
    for (int k = 0; k < 3; ++k) {
        if (deviceBuffers[k] != nullptr &&
            !cudaCallOk(cudaFree(deviceBuffers[k]), "cudaFree", dev)) {
            passed = false;
        }
    }
    free(h_A);
    free(h_B);
    free(h_C);
    return passed;
}

// Entry point: enumerates the CUDA devices and runs the vector-add check on
// each of them in turn.
// Returns 0 when every device passes, 1 when device enumeration fails or any
// device reports a CUDA error or a result mismatch.
int main() {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d: %s\n", (int)err, cudaGetErrorString(err));
        return 1;
    }
    printf("Detected %d CUDA capable device(s)\n", deviceCount);

    const int N = 1 << 20;  // Number of elements (1M)
    int failedDevices = 0;

    // Loop over each device; keep going after a failure so every device is reported.
    for (int dev = 0; dev < deviceCount; ++dev) {
        if (!runVectorAddOnDevice(dev, N)) {
            ++failedDevices;
        }
    }
    return failedDevices == 0 ? 0 : 1;
}
nvcc version (`nvcc --version`):
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0
Output:
$ nvcc simple_multi_gpu_test.cu -o test_multi_gpu
$ ./test_multi_gpu
cudaGetDeviceCount returned 3: initialization error