How to read from another GPU device?

Programming is not my area of expertise, and I am supposed to use multiple GPUs to solve a PDE. I would really appreciate any help with modifying my current code.

I want to run the following simple code on two GPUs simultaneously. Here I have an array A = [0 1 2 3 4 5 6 7 8 9] and want to calculate C[i] = A[i+1] + A[i] + A[i-1]. The output I get is C = [1 3 6 9 7 11 18 21 24 17]; the 7 and 11 (at i = 4 and i = 5) are wrong and should be 12 and 15. With the array split in half, computing C[4] on the first device requires A[5], which lives on the second device (and likewise C[5] needs A[4]). How can I do this in the simplest way?
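
For reference, here is a minimal single-threaded version of the same stencil; treating the out-of-range neighbours A[-1] and A[10] as zero, it gives the expected result [1 3 6 9 12 15 18 21 24 17]:

#include <stdio.h>

int main(void)
{
    float A[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    float C[10];

    for (int i = 0; i < 10; i++) {
        float left  = (i > 0) ? A[i - 1] : 0.0f;   // A[-1] treated as 0
        float right = (i < 9) ? A[i + 1] : 0.0f;   // A[10] treated as 0
        C[i] = left + A[i] + right;
        printf("%g ", C[i]);                       // 1 3 6 9 12 15 18 21 24 17
    }
    printf("\n");
    return 0;
}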

Thank you.

#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <time.h>

__global__ void iKernel(float *A, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;

    // Treat out-of-range neighbours as zero; without these guards the
    // reads A[i-1] at i == 0 and A[i+1] at i == N-1 are out of bounds.
    float left  = (i > 0)     ? A[i - 1] : 0.0f;
    float right = (i < N - 1) ? A[i + 1] : 0.0f;
    C[i] = left + A[i] + right;
}


int main(int argc, char **argv)
{
    int ngpus;

    printf("> starting %s", argv[0]);

    cudaGetDeviceCount(&ngpus);
    printf(" CUDA-capable devices: %i\n", ngpus);

    ngpus = 2;    // use exactly two devices for this example

    int size = 10;

    int    iSize = size / ngpus;
    size_t iBytes = iSize * sizeof(float);

    printf("> total array size %d M, using %d devices with each device "
        "handling %d M\n", size / 1024 / 1024, ngpus, iSize / 1024 / 1024);


    // allocate device memory
    float **d_A = (float **)malloc(sizeof(float *) * ngpus);
    float **d_C = (float **)malloc(sizeof(float *) * ngpus);

    float **h_A = (float **)malloc(sizeof(float *) * ngpus);
    float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
    cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * ngpus);

    for (int i = 0; i < ngpus; i++){
        // set current device
        cudaSetDevice(i);

        // allocate device memory
        cudaMalloc((void **)&d_A[i], iBytes);
        cudaMalloc((void **)&d_C[i], iBytes);

        // allocate page locked host memory for asynchronous data transfer
        cudaMallocHost((void **)&h_A[i], iBytes);
        cudaMallocHost((void **)&gpuRef[i], iBytes);

        // create streams for timing and synchronizing
        cudaStreamCreate(&stream[i]);
    }

    dim3 block(512);
    dim3 grid((iSize + block.x - 1) / block.x);

    //h_A[ngpus][index]
    for (int i = 0; i < ngpus; i++){
        cudaSetDevice(i);
        for (int j = 0; j < iSize; j++){
            h_A[i][j] = j + i*iSize;
            printf("%d %d %d %0.8f \n", i,j,iSize, h_A[i][j]);
        }
    }
    // record start time
    double iStart = clock();

    // distributing the workload across multiple devices
    for (int i = 0; i < ngpus; i++){
        cudaSetDevice(i);

        cudaMemcpyAsync(d_A[i], h_A[i], iBytes, cudaMemcpyHostToDevice, stream[i]);

        iKernel<<<grid, block, 0, stream[i]>>>(d_A[i], d_C[i], iSize);

        cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost,
            stream[i]);
    }

    // synchronize streams
    for (int i = 0; i < ngpus; i++){
        cudaSetDevice(i);
        cudaStreamSynchronize(stream[i]);
    }

    for (int i = 0; i < ngpus; i++){
        for (int j = 0; j < iSize; j++){
            printf("%d %d %0.8f \n", i,j,gpuRef[i][j]);
        }
    }
    return EXIT_SUCCESS;
}

Probably best to ask this question over on the CUDA Forum, https://forums.developer.nvidia.com/c/accelerated-computing/cuda/206, since this forum is more for the HPC Compilers.
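
That said, if you want to stay with plain CUDA, a common pattern for this type of stencil is to give each device's array one extra halo cell on each side and copy the boundary values between the devices before launching the kernel, e.g. with cudaMemcpyPeerAsync. A rough sketch of the exchange, assuming d_A[0] and d_A[1] were each allocated with iSize + 2 floats and the interior data starts at index 1 (these layout details are assumptions, not your current code):

#include <cuda_runtime.h>

// Sketch: exchange the two boundary values between device 0 and device 1.
// Assumes each d_A[i] holds [left halo | iSize interior values | right halo],
// i.e. iSize + 2 floats with the interior starting at index 1.
static void exchangeHalos(float **d_A, int iSize, cudaStream_t *stream)
{
    // right halo of device 0 <- first interior element of device 1 (A[5])
    cudaMemcpyPeerAsync(d_A[0] + iSize + 1, 0,   // dst pointer, dst device
                        d_A[1] + 1,         1,   // src pointer, src device
                        sizeof(float), stream[0]);

    // left halo of device 1 <- last interior element of device 0 (A[4])
    cudaMemcpyPeerAsync(d_A[1],             1,
                        d_A[0] + iSize,     0,
                        sizeof(float), stream[1]);
}

cudaMemcpyPeerAsync works whether or not direct peer access is enabled; if you call cudaDeviceEnablePeerAccess on each device first (and cudaDeviceCanAccessPeer reports support), the copy goes directly between the GPUs instead of staging through the host. The kernel then reads the shifted layout, so in local indices C[i] = A[i] + A[i+1] + A[i+2].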

Though, my one suggestion is to look at using NVSHMEM, https://docs.nvidia.com/hpc-sdk/nvshmem/developer-guide/index.html, which provides a partitioned global address space that can be accessed from multiple GPUs from within a CUDA kernel.
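
For example, here is a minimal, untested sketch of your stencil with NVSHMEM, assuming one PE per GPU launched with nvshmrun -np 2; each PE owns half of A on the symmetric heap, and the device-side nvshmem_float_g fetches the missing neighbour from the other PE:

#include <nvshmem.h>
#include <nvshmemx.h>
#include <stdio.h>

// Each PE (one per GPU) owns n elements of A; a missing neighbour at the
// local boundary is fetched from the neighbouring PE with a device-side get.
__global__ void stencil(float *A, float *C, int n, int mype, int npes)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float left = 0.0f, right = 0.0f;
    if (i > 0)                left  = A[i - 1];
    else if (mype > 0)        left  = nvshmem_float_g(&A[n - 1], mype - 1);
    if (i < n - 1)            right = A[i + 1];
    else if (mype < npes - 1) right = nvshmem_float_g(&A[0], mype + 1);

    C[i] = left + A[i] + right;
}

int main(void)
{
    nvshmem_init();
    int mype = nvshmem_my_pe();
    int npes = nvshmem_n_pes();
    cudaSetDevice(nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE));

    int n = 5;                                              // 10 elements over 2 PEs
    float *A = (float *)nvshmem_malloc(n * sizeof(float));  // symmetric heap
    float *C = (float *)nvshmem_malloc(n * sizeof(float));

    // fill this PE's half: A = [0..4] on PE 0, [5..9] on PE 1
    float h_A[5];
    for (int j = 0; j < n; j++) h_A[j] = (float)(j + mype * n);
    cudaMemcpy(A, h_A, n * sizeof(float), cudaMemcpyHostToDevice);
    nvshmem_barrier_all();            // make sure both halves are populated

    stencil<<<1, 32>>>(A, C, n, mype, npes);
    cudaDeviceSynchronize();

    float h_C[5];
    cudaMemcpy(h_C, C, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int j = 0; j < n; j++) printf("PE %d: C[%d] = %g\n", mype, j, h_C[j]);

    nvshmem_free(A); nvshmem_free(C);
    nvshmem_finalize();
    return 0;
}

NVSHMEM programs are built with nvcc -rdc=true and linked against the NVSHMEM library, and every PE must allocate A and C with the same sizes since they live on the symmetric heap.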