Why does GPU1 show C instead of M+C for its processes when MPS is running?

+-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:1A:00.0 Off |                  Off |
|  0%   48C    P2             102W / 450W |   1201MiB / 24564MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:68:00.0 Off |                  Off |
|  0%   46C    P2             103W / 450W |    853MiB / 24564MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                               |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory  |
|        ID   ID                                                               Usage       |
|===========================================================================================|
|    0   N/A  N/A      5728      G   /usr/lib/xorg/Xorg                              4MiB  |
|    0   N/A  N/A   2618901    M+C   ./mps_test                                    384MiB  |
|    0   N/A  N/A   2618904    M+C   ./mps_test                                    384MiB  |
|    0   N/A  N/A   2618905    M+C   ./mps_test                                    384MiB  |
|    0   N/A  N/A   2618906      C   nvidia-cuda-mps-server                         28MiB  |
|    1   N/A  N/A      5728      G   /usr/lib/xorg/Xorg                             40MiB  |
|    1   N/A  N/A   2618902      C   ./mps_test2                                   384MiB  |
|    1   N/A  N/A   2618903      C   ./mps_test2                                   384MiB  |
|    1   N/A  N/A   2618906      C   nvidia-cuda-mps-server                         28MiB  |
+-----------------------------------------------------------------------------------------+

My launch script:

#!/bin/bash

MPS=true
#sleep 3
if [ "$MPS" = true ]; then
    nvidia-smi -i 0 -c DEFAULT
    nvidia-smi -i 1 -c DEFAULT

    export CUDA_VISIBLE_DEVICES=0,1

    rm -rf /tmp/nvidia-mps

    #rm -rf /tmp/nvidia-log
    #rm -rf /tmp/nvidia-mps2
    #export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps

    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log

    nvidia-cuda-mps-control -d
    ./mps_test 0 &
    ./mps_test2 1 &
    ./mps_test2 1 &
    ./mps_test 0 &
    ./mps_test 0
    #./mps_test 1 &
    #./mps_test2 1

    #export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
    echo quit | nvidia-cuda-mps-control
    nvidia-smi -i 0 -c DEFAULT
    nvidia-smi -i 1 -c DEFAULT

else
    ./mps_test 0 &
    ./mps_test 0
fi
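
For reference, here is a small standalone check (separate from mps_test, just a sketch) that prints which physical GPU each CUDA device index maps to under the current CUDA_VISIBLE_DEVICES. Built with nvcc and run with the same exports as the script, it confirms what device 0 and 1 mean inside mps_test / mps_test2:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    // Show the environment this process actually sees
    const char *vis = getenv("CUDA_VISIBLE_DEVICES");
    printf("CUDA_VISIBLE_DEVICES=%s\n", vis ? vis : "(unset)");

    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        char busid[32];
        cudaGetDeviceProperties(&prop, i);
        cudaDeviceGetPCIBusId(busid, sizeof(busid), i);   // e.g. 0000:1A:00.0
        printf("device %d: %s, PCI %s\n", i, prop.name, busid);
    }
    return 0;
}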

The test program:

#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
    int gpuid = atoi(argv[1]);
    // cudaSetDevice(gpuid);
    printf("CUDA_VISIBLE_DEVICES=%s\n", getenv("CUDA_VISIBLE_DEVICES"));
    // When launched with argument 1, restrict this process to GPU 1 before the
    // first CUDA runtime call.
    if (gpuid == 1) setenv("CUDA_VISIBLE_DEVICES", argv[1], 1);
    printf("CUDA_VISIBLE_DEVICES=%s\n", getenv("CUDA_VISIBLE_DEVICES"));

    dim3 block(1, 1, 1);
    dim3 grid(1, 1, 1);
    float *gpu_mem;
    float cpu_mem[8];
    cudaMalloc((void **)&gpu_mem, 8 * sizeof(float));

    struct timeval tv;
    gettimeofday(&tv, NULL);
    time_t begin = ((time_t)tv.tv_sec * (time_t)1000000 + tv.tv_usec);

    cudaCheck(cudaMemcpy(gpu_mem, cpu_mem, 8 * sizeof(float), cudaMemcpyHostToDevice));
    testMaxFlopsKernel<<<grid, block>>>(gpu_mem, 1000000000, 1.0f, 2.0f);   // keeps the GPU busy
    cudaCheck(cudaMemcpy(cpu_mem, gpu_mem, 8 * sizeof(float), cudaMemcpyDeviceToHost));
    cudaCheck(cudaDeviceSynchronize());

    gettimeofday(&tv, NULL);
    time_t end = ((time_t)tv.tv_sec * (time_t)1000000 + tv.tv_usec);
    float latency = (end - begin) / 1000000.0f;
    printf("latency = %f s\n", latency);

    cudaFree(gpu_mem);
    return 0;
}
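
cudaCheck and testMaxFlopsKernel are not shown above. For completeness, a rough sketch of that kind of error-check helper and busy-loop kernel (not the exact originals) looks like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of an error-check helper matching the cudaCheck(...) calls above.
static void cudaCheck(cudaError_t err)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        exit(1);
    }
}

// Sketch of a FLOPS-style busy kernel: a long dependent FMA chain so the
// launched thread keeps a kernel resident on the GPU (hence the 100% GPU-Util above).
__global__ void testMaxFlopsKernel(float *out, int iters, float a, float b)
{
    float x = a;
    float y = b;
    for (int i = 0; i < iters; ++i) {
        x = x * y + b;
        y = y * x + a;
    }
    out[threadIdx.x] = x + y;   // write the result so the loop is not optimized away
}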

nobody?