P2p Bandwidth 150% higher than maximum achievable

Thank you for correcting me. I am not sure what SLI really is, or what the --sli="mosaic" option really does.
Anyway, after this day of investigation, I ran the simpleP2P cuda samples, and this happened:

[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 2

Checking GPU(s) for support of peer to peer memory access...
> Peer access from NVIDIA RTX A4500 (GPU0) -> NVIDIA RTX A4500 (GPU1) : Yes
> Peer access from NVIDIA RTX A4500 (GPU1) -> NVIDIA RTX A4500 (GPU0) : Yes
Enabling peer access between GPU0 and GPU1...
Allocating buffers (64MB on GPU0, GPU1 and CPU Host)...
Creating event handles...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 36.23GB/s
Preparing host buffer and memcpy to GPU0...
Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...
Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...
Copy data back to host from GPU0 and verify results...
Verification error @ element 1: val = 0.000000, ref = 4.000000
Verification error @ element 2: val = 0.000000, ref = 8.000000
Verification error @ element 3: val = 0.000000, ref = 12.000000
Verification error @ element 4: val = 0.000000, ref = 16.000000
Verification error @ element 5: val = 0.000000, ref = 20.000000
Verification error @ element 6: val = 0.000000, ref = 24.000000
Verification error @ element 7: val = 0.000000, ref = 28.000000
Verification error @ element 8: val = 0.000000, ref = 32.000000
Verification error @ element 9: val = 0.000000, ref = 36.000000
Verification error @ element 10: val = 0.000000, ref = 40.000000
Verification error @ element 11: val = 0.000000, ref = 44.000000
Verification error @ element 12: val = 0.000000, ref = 48.000000
Disabling peer access...
Shutting down...
Test failed!

Before finding this issue, I was trying to code something myself, and ended up with this:

#include <cuda_runtime.h>

#include <iostream>

// Macros to check cuda errors

// Evaluates a CUDA runtime call and aborts with a file:line diagnostic when it
// does not return cudaSuccess. Deliberately expands to a bare compound
// statement rather than the usual do { ... } while (0): every call site in
// this file invokes CUDA_CALL(...) without a trailing semicolon, which only
// compiles with the brace form.
#define CUDA_CALL(call)                                                                     \
  {                                                                                         \
    cudaError_t status_ = (call);                                                           \
    if (status_ != cudaSuccess) {                                                           \
      std::cerr << "CUDA error " << status_ << " in " << __FILE__ << ":" << __LINE__        \
                << ": " << cudaGetErrorString(status_) << " (" << #call << ")"              \
                << std::endl;                                                               \
      exit(EXIT_FAILURE);                                                                   \
    }                                                                                       \
  }

// Reads (and clears) the last pending CUDA error and aborts with a file:line
// diagnostic when one is set. Intended for use right after a kernel launch,
// which reports configuration errors through cudaGetLastError() rather than a
// return value. Same bare-brace expansion as CUDA_CALL so the trailing
// semicolon can be omitted at the call site.
#define CHK_CUDA_ERR                                                                        \
  {                                                                                         \
    cudaError_t last_ = cudaGetLastError();                                                 \
    if (last_ != cudaSuccess) {                                                             \
      std::cerr << "CUDA error " << last_ << " in " << __FILE__ << ":" << __LINE__ << ": "  \
                << cudaGetErrorString(last_) << std::endl;                                  \
      exit(EXIT_FAILURE);                                                                   \
    }                                                                                       \
  }

// Kernels to increment

// Increments the single int pointed to by m, in place.
// Every launched thread performs the same unguarded read-modify-write on *m
// (no threadIdx/blockIdx is used), so this is only well-defined for a
// one-thread launch — which is how main() invokes it (dimBlock = N = 1).
__global__ void plus_one(int *m){
    *m += 1;
}

// Writes (*in + 1) into *out.
// Fix: the original read in[1], which is one element past the end of the
// size-1 buffers (N = 1) allocated in main() — an out-of-bounds device read.
// Element 0 is the only valid element, so read in[0] instead.
// NOTE(review): this kernel is not called anywhere in the visible file;
// confirm no other translation unit relied on the in[1] offset.
__global__ void plus(int *in, int *out){
    *out = in[0] + 1;
}

// Minimal two-GPU peer-to-peer sanity test:
//   1. increment an int on device 0,
//   2. peer-copy it to device 1 and increment it again there,
//   3. copy both values back to the host and print them.
// Expected output: 1 on device 0 and 2 on device 1.
// Exits non-zero on any CUDA error, on an unsupported topology, or when the
// verification values are wrong (so scripts can detect the failure).
int main(){

    // Constants init

    const int N = 1;                 // ints per device buffer

    const int dimGrid = 1;
    const int dimBlock = N;          // single-thread launch (see plus_one)

    const int size = N * sizeof(int);

    // Value storages

    int *d0 = nullptr, *d1 = nullptr;  // device buffers on GPU0 / GPU1

    int verif[2];                      // host copies: verif[0] <- GPU0, verif[1] <- GPU1

    // Fail early and clearly when fewer than two devices are visible,
    // instead of erroring inside cudaSetDevice(1).

    int deviceCount = 0;
    CUDA_CALL(cudaGetDeviceCount(&deviceCount))
    if (deviceCount < 2) {
        std::cerr << "This test needs at least 2 CUDA devices, found "
                  << deviceCount << std::endl;
        return EXIT_FAILURE;
    }

    // Peer access must be queried before it is enabled; enabling it on a
    // topology that cannot support it is an error, and knowing the answer
    // here distinguishes "unsupported" from "supported but broken".

    int can01 = 0, can10 = 0;
    CUDA_CALL(cudaDeviceCanAccessPeer(&can01, 0, 1))
    CUDA_CALL(cudaDeviceCanAccessPeer(&can10, 1, 0))
    if (!can01 || !can10) {
        std::cerr << "Peer access is not supported between devices 0 and 1 "
                  << "(0->1: " << can01 << ", 1->0: " << can10 << ")" << std::endl;
        return EXIT_FAILURE;
    }

    // Initialization for device 0

    CUDA_CALL(cudaSetDevice(0))
    CUDA_CALL(cudaMalloc(&d0, size))
    CUDA_CALL(cudaMemset(d0, 0, size))
    CUDA_CALL(cudaDeviceEnablePeerAccess(1, 0))

    // Initialization for device 1

    CUDA_CALL(cudaSetDevice(1))
    CUDA_CALL(cudaMalloc(&d1, size))
    CUDA_CALL(cudaMemset(d1, 0, size))
    CUDA_CALL(cudaDeviceEnablePeerAccess(0, 0))

    // Incrementation on Device 0

    CUDA_CALL(cudaSetDevice(0))

    plus_one<<<dimGrid, dimBlock>>>(d0);
    CHK_CUDA_ERR
    // Synchronize so any in-kernel fault is reported here, not at some
    // unrelated later call — essential when debugging a flaky P2P setup.
    CUDA_CALL(cudaDeviceSynchronize())

    // Incrementation on top of the previous one on Device 1

    CUDA_CALL(cudaMemcpyPeer(d1, 1, d0, 0, size))

    CUDA_CALL(cudaSetDevice(1))
    plus_one<<<dimGrid, dimBlock>>>(d1);
    CHK_CUDA_ERR
    CUDA_CALL(cudaDeviceSynchronize())
    CUDA_CALL(cudaSetDevice(0))

    // Pull values for verification (blocking D2H copies, so no extra sync needed)

    CUDA_CALL(cudaMemcpy(verif, d0, size, cudaMemcpyDeviceToHost))

    CUDA_CALL(cudaMemcpy(verif + 1, d1, size, cudaMemcpyDeviceToHost))

    // Print in stdout

    std::cout << "On Device 0 (should be 1): " << verif[0] << std::endl;
    std::cout << "On Device 1 (should be 2): " << verif[1] << std::endl;

    // Frees + disable peer access

    CUDA_CALL(cudaSetDevice(0))
    CUDA_CALL(cudaFree(d0))
    CUDA_CALL(cudaDeviceDisablePeerAccess(1))

    CUDA_CALL(cudaSetDevice(1))
    CUDA_CALL(cudaFree(d1))
    CUDA_CALL(cudaDeviceDisablePeerAccess(0))

    // Report the verification result through the exit status.
    return (verif[0] == 1 && verif[1] == 2) ? EXIT_SUCCESS : EXIT_FAILURE;

}

And this output:

On Device 0 (should be 1): 1
On Device 1 (should be 2): 1

Important fact: Commenting out all the cudaDeviceEnablePeerAccess and cudaDeviceDisablePeerAccess calls changes the output to the expected one. So peer-to-peer is definitely broken here.

If you have any clues, that would help a lot. I will try on Tuesday to disable SLI, but I suspect the issue from my previous post will come back.

In any case, thank you for the time you invest helping me.