Hi, I hope you are doing well. I am pasting code here that I am using as a testbench to find out whether I can use the P2P memory-access routines to use 8 GPUs on a single machine to solve a problem. My code is called by a Fortran code, where I set the device according to the MPI rank issued to the node. The Fortran code then calls a C function that looks for GPUs in the system and finds out which peers each one can access. Each GPU (uniquely identified by its MPI rank) then initiates a transfer to the next neighbor to which peer access is possible. What I notice is that, instead of copying from the peer, the GPUs seem to access their own pointers. I guess this is because all GPUs run the same code and hence use the same pointer names. Is this true? If so, how can I write code that runs on each GPU (with the same variable names on each GPU) and still accesses peer memory?

[\code]
/* ------------------------- C function called from Fortran ------------------------- */

/*
 * Fortran-callable entry point.  Converts the Fortran communicator handle,
 * queries rank / size / Cartesian dimensionality, then runs the P2P test.
 * Always returns 0; errors are reported on stdout only (original behaviour).
 * Most parameters (A, b, x, eps, halo extents, periodicity flags, tol,
 * frtcsprshndl) are unused in this testbench version.
 */
int cgpu_dpcg__(double *A, double *b, double *x, double *eps, int *rank,
                int *iminx, int *iminy, int *iminz,
                int *imaxx, int *imaxy, int *imaxz,
                int *ihalo, int *jhalo, int *khalo,
                MPI_Fint *commobjt, MPI_Fint *ierr,
                int *xprd, int *yprd, int *zprd,
                double *tol, cusparseHandle_t *frtcsprshndl)
{
    int localrank, numprox, mpierr, mpi_ndims, i, gpu_n;
    MPI_Comm ccomm;
    MPI_Status status;
    MPI_Datatype datatype;

    localrank = *rank;
    printf("\n This is Scaling test on processor %d.", localrank);

    /* Convert the Fortran communicator handle into a C one. */
    ccomm = MPI_Comm_f2c(*commobjt);

    mpierr = MPI_Comm_rank(ccomm, &localrank);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting rank. exiting now");
        return 0;
    }

    mpierr = MPI_Comm_size(ccomm, &numprox);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting numprox. exiting now");
        return 0;
    }

    /* BUG FIX: the original discarded this call's return value, so the
     * following check re-tested the stale status of MPI_Comm_size. */
    mpierr = MPI_Cartdim_get(ccomm, &mpi_ndims);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting cartesian dims. exiting now");
        return 0;
    }

    p2ptester(localrank, numprox);

    printf("\n Proc%d: end of output", localrank);
    return 0;
}

/* ----------------------------- p2ptester.cu ----------------------------- */
#include "macros.h"
#include "cuda_definitions.h"
#include "cuda.h"
#include "cusparse_v2.h"
#include "cublas_v2.h"

/* True when the device can take part in peer-to-peer transfers:
 * on Windows the TCC driver is required, elsewhere SM 2.0+ (Fermi). */
inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    return (bool)(pProp->tccDriver ? true : false);
#else
    return (bool)(pProp->major >= 2);
#endif
}

/* True when compiled as a 64-bit target (P2P/UVA need 64-bit addressing). */
inline bool IsAppBuiltAs64()
{
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
    return 1;
#else
    return 0;
#endif
}

/*
 * P2P testbench body.  Enumerates devices, queries which peers this rank's
 * GPU can access, enables peer access to the next neighbour, and (in the
 * truncated part below) initialises a host array for the peer-copy check.
 *
 * localrank: this process's MPI rank == the CUDA device it has selected.
 * numprox:   number of MPI processes.
 */
void p2ptester(int localrank, int numprox)
{
    int gpu_n, i, *can_access_peer, size, nxt;
    double *hst_arr, *d_arr, *d_rcvarr, *checkn;
    FILE *fp;

    size = 20;
    char name[size];   /* NOTE(review): unused in the visible part of the code */

    if (!IsAppBuiltAs64()) {
        printf("is only supported with on 64-bit OSs and the application must be built as a 64-bit target.  Test is being waived.\n");
        exit(EXIT_SUCCESS);
    }

    /* initVector is a project helper; presumably zero-initialises — confirm. */
    can_access_peer = (int *)initVector(numprox, MYINT);

    cudaGetDeviceCount(&gpu_n);
    printf("CUDA-capable device count: %i\n", gpu_n);

    if (gpu_n < 2) {
        printf("Two or more GPUs with SM 2.0 or higher capability are required for.\n");
        printf("Waiving test.\n");
        exit(EXIT_SUCCESS);
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid[64];       // array of P2P-capable GPU ids
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
        // Only boards based on Fermi can support P2P
        if ((prop[i].major >= 2)
#ifdef _WIN32
            // on Windows (64-bit), the Tesla Compute Cluster driver must be enabled
            && prop[i].tccDriver
#endif
           ) {
            gpuid[gpu_count++] = i;
        }
        // printf("%d:> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n",
        //        localrank, i, prop[i].name,
        //        (IsGPUCapableP2P(&prop[i]) ? "IS " : "NOT"));
    }

    /* NOTE(review): the body of this loop was destroyed in the paste (every
     * '<' and the text after it was eaten); reconstructed from the printf
     * that survived, following the simpleP2P sample — verify against your
     * original source. */
    for (i = 0; i < gpu_n; i++) {
        if (i != localrank) {
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer[i],
                                                    localrank, i));
            printf("%d:> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   localrank, prop[localrank].name, localrank,
                   prop[i].name, i, can_access_peer[i] ? "Yes" : "No");
        }
    }

    /* BUG FIX: the neighbour index was computed modulo a hard-coded 8;
     * use the detected device count so the test also works with fewer
     * (or more) GPUs.  Identical behaviour when exactly 8 are present. */
    nxt = (localrank + 1) % gpu_n;
    printf("\n %d has nxt as %d", localrank, nxt);

    if (can_access_peer[nxt]) {
        printf("\n %d can access %d using P2P in CUDA", localrank, nxt);
        checkCudaErrors(cudaDeviceEnablePeerAccess(nxt, 0));
    }

    printf("%d:> %s (GPU%d) supports UVA: %s\n",
           localrank, prop[localrank].name, gpuid[localrank],
           (prop[localrank].unifiedAddressing ? "Yes" : "No"));

    // initialize an array locally
    hst_arr = (double *)initVector(size, MYDOUBLE);
    checkn  = (double *)initVector(size, MYDOUBLE);

    /* NOTE(review): the paste is TRUNCATED here — the remainder of
     * p2ptester() (host-array fill, device allocation, the actual peer
     * copy, and the result check) was cut off mid-statement at:
     *     for(i=0;i
     * The function is closed here only so the reconstructed file parses;
     * restore the missing tail from the original source. */
}