Hi, I hope you are doing well. I am pasting code here that I am using as a testbench to find out whether I can use the P2P memory-access routines to use 8 GPUs on a single machine to solve a problem. My code is called by a Fortran code, where I set the device according to the MPI rank issued to the node. The Fortran code then calls a C function that looks for GPUs in the system and finds out which peers each one can access. Each GPU (uniquely identified by its MPI rank) then initiates a transfer to the next neighbor to which peer access is possible. What I notice is that, instead of copying from the peer, the GPUs seem to access their own pointers. I guess this is because all GPUs run the same code and hence use the same pointer names. Is this true? If so, how can I write code that runs on each GPU (with the same variable names on each GPU) and still accesses peer memory?

[\code]
/* ------------------------- C function called from Fortran ------------------------- */

/*
 * Fortran-callable entry point.  Converts the Fortran communicator handle,
 * queries rank / size / Cartesian dimensionality, then runs the P2P test.
 * Always returns 0; errors are reported on stdout only (original behaviour).
 * Most parameters (A, b, x, eps, halo extents, periodicity flags, tol,
 * frtcsprshndl) are unused in this testbench version.
 */
int cgpu_dpcg__(double *A, double *b, double *x, double *eps, int *rank,
                int *iminx, int *iminy, int *iminz,
                int *imaxx, int *imaxy, int *imaxz,
                int *ihalo, int *jhalo, int *khalo,
                MPI_Fint *commobjt, MPI_Fint *ierr,
                int *xprd, int *yprd, int *zprd,
                double *tol, cusparseHandle_t *frtcsprshndl)
{
    int localrank, numprox, mpierr, mpi_ndims, i, gpu_n;
    MPI_Comm ccomm;
    MPI_Status status;
    MPI_Datatype datatype;

    localrank = *rank;
    printf("\n This is Scaling test on processor %d.", localrank);

    /* Convert the Fortran communicator handle into a C one. */
    ccomm = MPI_Comm_f2c(*commobjt);

    mpierr = MPI_Comm_rank(ccomm, &localrank);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting rank. exiting now");
        return 0;
    }

    mpierr = MPI_Comm_size(ccomm, &numprox);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting numprox. exiting now");
        return 0;
    }

    /* BUG FIX: the original discarded this call's return value, so the
     * following check re-tested the stale status of MPI_Comm_size. */
    mpierr = MPI_Cartdim_get(ccomm, &mpi_ndims);
    if (mpierr != MPI_SUCCESS) {
        printf("\n error getting cartesian dims. exiting now");
        return 0;
    }

    p2ptester(localrank, numprox);

    printf("\n Proc%d: end of output", localrank);
    return 0;
}

/* ----------------------------- p2ptester.cu ----------------------------- */
#include "macros.h"
#include "cuda_definitions.h"
#include "cuda.h"
#include "cusparse_v2.h"
#include "cublas_v2.h"

/* True when the device can take part in peer-to-peer transfers:
 * on Windows the TCC driver is required, elsewhere SM 2.0+ (Fermi). */
inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    return (bool)(pProp->tccDriver ? true : false);
#else
    return (bool)(pProp->major >= 2);
#endif
}

/* True when compiled as a 64-bit target (P2P/UVA need 64-bit addressing). */
inline bool IsAppBuiltAs64()
{
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
    return 1;
#else
    return 0;
#endif
}

/*
 * P2P testbench body.  Enumerates devices, queries which peers this rank's
 * GPU can access, enables peer access to the next neighbour, and (in the
 * truncated part below) initialises a host array for the peer-copy check.
 *
 * localrank: this process's MPI rank == the CUDA device it has selected.
 * numprox:   number of MPI processes.
 */
void p2ptester(int localrank, int numprox)
{
    int gpu_n, i, *can_access_peer, size, nxt;
    double *hst_arr, *d_arr, *d_rcvarr, *checkn;
    FILE *fp;

    size = 20;
    char name[size];   /* NOTE(review): unused in the visible part of the code */

    if (!IsAppBuiltAs64()) {
        printf("is only supported with on 64-bit OSs and the application must be built as a 64-bit target.  Test is being waived.\n");
        exit(EXIT_SUCCESS);
    }

    /* initVector is a project helper; presumably zero-initialises — confirm. */
    can_access_peer = (int *)initVector(numprox, MYINT);

    cudaGetDeviceCount(&gpu_n);
    printf("CUDA-capable device count: %i\n", gpu_n);

    if (gpu_n < 2) {
        printf("Two or more GPUs with SM 2.0 or higher capability are required for.\n");
        printf("Waiving test.\n");
        exit(EXIT_SUCCESS);
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid[64];       // array of P2P-capable GPU ids
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
        // Only boards based on Fermi can support P2P
        if ((prop[i].major >= 2)
#ifdef _WIN32
            // on Windows (64-bit), the Tesla Compute Cluster driver must be enabled
            && prop[i].tccDriver
#endif
           ) {
            gpuid[gpu_count++] = i;
        }
        // printf("%d:> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n",
        //        localrank, i, prop[i].name,
        //        (IsGPUCapableP2P(&prop[i]) ? "IS " : "NOT"));
    }

    /* NOTE(review): the body of this loop was destroyed in the paste (every
     * '<' and the text after it was eaten); reconstructed from the printf
     * that survived, following the simpleP2P sample — verify against your
     * original source. */
    for (i = 0; i < gpu_n; i++) {
        if (i != localrank) {
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer[i],
                                                    localrank, i));
            printf("%d:> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   localrank, prop[localrank].name, localrank,
                   prop[i].name, i, can_access_peer[i] ? "Yes" : "No");
        }
    }

    /* BUG FIX: the neighbour index was computed modulo a hard-coded 8;
     * use the detected device count so the test also works with fewer
     * (or more) GPUs.  Identical behaviour when exactly 8 are present. */
    nxt = (localrank + 1) % gpu_n;
    printf("\n %d has nxt as %d", localrank, nxt);

    if (can_access_peer[nxt]) {
        printf("\n %d can access %d using P2P in CUDA", localrank, nxt);
        checkCudaErrors(cudaDeviceEnablePeerAccess(nxt, 0));
    }

    printf("%d:> %s (GPU%d) supports UVA: %s\n",
           localrank, prop[localrank].name, gpuid[localrank],
           (prop[localrank].unifiedAddressing ? "Yes" : "No"));

    // initialize an array locally
    hst_arr = (double *)initVector(size, MYDOUBLE);
    checkn  = (double *)initVector(size, MYDOUBLE);

    /* NOTE(review): the paste is TRUNCATED here — the remainder of
     * p2ptester() (host-array fill, device allocation, the actual peer
     * copy, and the result check) was cut off mid-statement at:
     *     for(i=0;i
     * The function is closed here only so the reconstructed file parses;
     * restore the missing tail from the original source. */
}