Wrong results when using CUDA functions on multiple GPUs

Summary
General questions about multi-GPU programming and communication between GPUs. Primary focus: cudaMalloc, cudaSetDevice, cudaMemcpy, cudaDeviceSynchronize

Relevant Area
Multi-GPU programming, multi-GPU matrix multiplication

Description
General understanding of how to use cudaSetDevice, the implications of switching devices, the correct usage of cudaMemcpy (cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice, …), and the correct usage of cudaDeviceSynchronize.

NVIDIA GPU or System
P5000

NVIDIA Software Version
NVIDIA-SMI 418.87.00 Driver Version: 418.87.00 CUDA Version: 10.1

OS
Linux Fedora 29

Other Details

// Setup: for the sake of completeness only

    int numGPU = 2;
    int primaryGPU = 0;
    checkCudaErrors(cudaSetDevice(primaryGPU));

    //each matrix is a 20x40 Eigen sparse matrix; the two are the halves of a bigger 40x40 matrix (~3 values per row)
    Eigen::SparseMatrix<double, Eigen::RowMajor> multi_A22[2] = {/* Matrix 0 */,
                                                                 /* Matrix 1 */};

    int separatorRow[3] = {0, 20, 40};      //defines at which row the matrix was cut
    int separatorVal[3] = {0, 60, 120};     //defines at which value the matrix was cut
    int multi_M[2] = {20, 20};              //number of rows per matrix
    int multi_N[2] = {40, 40};              //number of columns per matrix
    int multi_nz[2] = {60, 60};             //number of non-zeros per matrix

    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        multi_I[curGPU] = multi_A22[curGPU].outerIndexPtr();                  //rowPtr (size M+1)
        multi_J[curGPU] = multi_A22[curGPU].innerIndexPtr();                  //colPtr (size nz)
        multi_val[curGPU] = multi_A22[curGPU].valuePtr();                     //valuesPtr (size nz)
    }

    checkCudaErrors(cudaSetDevice(primaryGPU));
    cudaDeviceSynchronize();

    ///Code that is needed on the primary GPU
    checkCudaErrors(cudaMalloc((void **) &d_x, N * sizeof(double)));
    cudaMemcpy(d_x, x, N * sizeof(double), cudaMemcpyHostToDevice);


    ///Code that needs to be run on the individual GPUs
    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        checkCudaErrors(cudaSetDevice(curGPU));
        cudaDeviceSynchronize();
        checkCudaErrors(cudaMalloc((void **) &multi_d_row[curGPU], (multi_M[curGPU] + 1) * sizeof(int)));
        checkCudaErrors(cudaMalloc((void **) &multi_d_col[curGPU], multi_nz[curGPU] * sizeof(int)));
        checkCudaErrors(cudaMalloc((void **) &multi_d_val[curGPU], multi_nz[curGPU] * sizeof(double)));
        checkCudaErrors(cudaMalloc((void **) &multi_d_Ax[curGPU], multi_M[curGPU] * sizeof(double)));
        checkCudaErrors(cudaMalloc((void **) &multi_d_p[curGPU], multi_N[curGPU] * sizeof(double)));
    }

    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        checkCudaErrors(cudaSetDevice(curGPU));
        cudaDeviceSynchronize();
        cudaMemcpy(multi_d_row[curGPU], multi_I[curGPU], (multi_M[curGPU] + 1) * sizeof(int),
                   cudaMemcpyHostToDevice);
        cudaMemcpy(multi_d_col[curGPU], multi_J[curGPU], multi_nz[curGPU] * sizeof(int),
                   cudaMemcpyHostToDevice);
        cudaMemcpy(multi_d_val[curGPU], multi_val[curGPU], multi_nz[curGPU] * sizeof(double),
                   cudaMemcpyHostToDevice);
    }
    checkCudaErrors(cudaSetDevice(primaryGPU));
    cudaDeviceSynchronize();
///Questions:
/*
 * 1) When is cudaDeviceSynchronize necessary?
 * 2) Does cudaSetDevice before an operation result in the operation being executed on the given GPU?
 *     a) Is the setup above done correctly?
 *     b) Which operations are required to enable functions like cusparseDcsrmv to be run on the individual GPU?
 * 3) Is the cudaMemcpyDeviceToDevice handled correctly?
 */
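/*
 * For reference, this is the per-device pattern I am assuming cudaSetDevice implies
 * (sketch only, with hypothetical names such as buf/h_buf/bytes; not part of my actual code):
 *
 *     for (int dev = 0; dev < numGPU; ++dev) {
 *         checkCudaErrors(cudaSetDevice(dev));                      // dev becomes the current device for this host thread
 *         checkCudaErrors(cudaMalloc((void **) &buf[dev], bytes));  // allocation should land on dev
 *         checkCudaErrors(cudaMemcpy(buf[dev], h_buf, bytes, cudaMemcpyHostToDevice));
 *     }
 *     for (int dev = 0; dev < numGPU; ++dev) {
 *         checkCudaErrors(cudaSetDevice(dev));
 *         checkCudaErrors(cudaDeviceSynchronize());                 // waits only for work submitted to dev
 *     }
 */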

// the relevant part

///Code that is needed on the primary GPU
    checkCudaErrors(cudaMalloc((void **) &d_Ax, N * sizeof(double)));

    ///Code that needs to be run on the individual GPUs
    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        checkCudaErrors(cudaSetDevice(curGPU));
        cudaDeviceSynchronize();
        checkCudaErrors(cudaMalloc((void **) &multi_d_Ax[curGPU], multi_M[curGPU] * sizeof(double)));
    }
    checkCudaErrors(cudaSetDevice(primaryGPU));
    //1)
    cudaDeviceSynchronize();

    //2)
    double alpha = 1.0;
    double beta = 0.0;
    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        checkCudaErrors(cudaSetDevice(curGPU));
        cudaDeviceSynchronize();
        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, multi_M[curGPU], multi_N[curGPU],
                       multi_nz[curGPU], &alpha, descr, multi_d_val[curGPU], multi_d_row[curGPU],
                       multi_d_col[curGPU], d_x, &beta, multi_d_Ax[curGPU]);
    }
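    // Regarding question 2b: I am unsure whether the single cusparseHandle/descr used above is valid
    // on every device, or whether each device needs its own handle created while that device is
    // current, roughly like this (sketch only, multi_handle is a hypothetical name):
    //
    //     cusparseHandle_t multi_handle[2];
    //     for (int dev = 0; dev < numGPU; ++dev) {
    //         checkCudaErrors(cudaSetDevice(dev));
    //         cusparseCreate(&multi_handle[dev]);   // handle is tied to the device current at creation time
    //     }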
    // 3)
    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        checkCudaErrors(cudaSetDevice(curGPU));
        cudaDeviceSynchronize();
        double *dst = d_Ax + separatorRow[curGPU];
        double *src = multi_d_Ax[curGPU];
        size_t count = separatorRow[curGPU + 1] - separatorRow[curGPU];
        if (curGPU == primaryGPU) {
            cublasStatus = cublasDcopy(cublasHandle, (int) count, src, 1, dst, 1);
        } else {
            cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyDeviceToDevice);
        }
    }
}
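
Regarding question 3: as an alternative to cudaMemcpy with cudaMemcpyDeviceToDevice, I also considered copying each partial result back to the primary GPU with cudaMemcpyPeer, which takes the source and destination device explicitly. A sketch of that variant, untested, reusing the variables from the listing above:

    for (int curGPU = 0; curGPU < numGPU; ++curGPU) {
        if (curGPU == primaryGPU) continue;                 // primary GPU's part is copied with cublasDcopy above
        double *dst = d_Ax + separatorRow[curGPU];          // d_Ax lives on primaryGPU
        double *src = multi_d_Ax[curGPU];                   // multi_d_Ax[curGPU] lives on curGPU
        size_t count = separatorRow[curGPU + 1] - separatorRow[curGPU];
        checkCudaErrors(cudaMemcpyPeer(dst, primaryGPU, src, curGPU, count * sizeof(double)));
    }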