gpudirect v2 and MPI


In this presentation

it is stated on slide 17 that, with GPUDirect v2, the user does not have to explicitly code the memory transfers from device memory to system memory in order to send/receive data with MPI over InfiniBand (“User sees direct transfer”). I assume this is thanks to UVA (Unified Virtual Addressing).

So I tried to write this simple program to validate this feature, but it does not work.

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <sys/time.h>

#include <mpi.h>

#define NREPEAT 1

#define NBYTES  10.e6

/*
 * Reports whether this translation unit was compiled for a 64-bit x86
 * target.
 *
 * Returns 1 when any of the macros __x86_64, AMD64 or _M_AMD64 is defined
 * at compile time, and 0 otherwise.  The decision is made by the
 * preprocessor, so the result is a compile-time constant.
 */
int IsAppBuiltAs64() {
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
    return 1;
#else
    return 0;
#endif
}



int main (int argc, char *argv[])


    int rank, size, n, len;

    int result;

    void *a_h, *a_d;

    struct timeval time[2];

    double bandwidth;

    char hostname[MPI_MAX_PROCESSOR_NAME];

    MPI_Status status;

    int device=0;

MPI_Init (&argc, &argv);

    MPI_Comm_rank (MPI_COMM_WORLD, &rank);

    MPI_Comm_size (MPI_COMM_WORLD, &size);

MPI_Get_processor_name(hostname, &len);

    printf("Process %d is on %s\n", rank, hostname);

	/* select device */

	if ( rank == 0 ) {



	else {



	/* display the unified capabilities of the device */

	struct cudaDeviceProp prop;

	result = cudaGetDeviceProperties(&prop,device);

	if ( result != cudaSuccess ) {

        printf("ERROR: %s: cudaGetDeviceProperties failed, error code: %d, which means: %s\n",

               hostname, result, cudaGetErrorString(result));


	printf( "rank %d prop.unifiedAddressing=%d\n", rank, prop.unifiedAddressing );

	/* device memory allocation */

    result = cudaMalloc( (void **) &a_d, NBYTES);

    if ( result != cudaSuccess ) {

        printf("ERROR: %s: cudaMalloc failed, error code: %d, which means: %s\n",

               hostname, result, cudaGetErrorString(result));



printf( "rank %d build as 64 %d\n", rank, IsAppBuiltAs64() );

/* Test MPI send/recv bandwidth. */


gettimeofday(&time[0], NULL);

    for (n=0; n<NREPEAT; n++)


        if (rank == 0) {

            MPI_Send(a_d, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);


        else {

            MPI_Recv(a_d, NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);



    gettimeofday(&time[1], NULL);

bandwidth  =        time[1].tv_sec  - time[0].tv_sec;

    bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);

    bandwidth  = NBYTES*NREPEAT/1.e6/bandwidth;

if (rank == 0)

        printf("MPI send/recv bandwidth: %f MB/sec\n", bandwidth);



    return 0;


Here is the console output I obtain:

[xxxx@dhcp1 test_mpi_gpudirect_v2]$ ./runib

Process 1 is on dhcp1

Process 0 is on dhcp2

rank 1 prop.unifiedAddressing=1

rank 0 prop.unifiedAddressing=1

rank 1 build as 64 1

rank 0 build as 64 1

[dhcp2:06536] *** Process received signal ***

[dhcp2:06536] Signal: Segmentation fault (11)

[dhcp2:06536] Signal code: Invalid permissions (2)

[dhcp2:06536] Failing at address: 0x200100000

[dhcp2:06536] [ 0] /lib64/ [0x3f8b80eb10]

[dhcp2:06536] [ 1] /lib64/ [0x3f8ac7c39b]

[dhcp2:06536] [ 2] /usr/mpi/gcc/openmpi-1.4.3/lib64/ [0x2aadcbc4e84c]

[dhcp2:06536] [ 3] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/ [0x2aadcf216c82]

[dhcp2:06536] [ 4] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/ [0x2aadce5d0935]

[dhcp2:06536] [ 5] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/ [0x2aadce5c6f90]

[dhcp2:06536] [ 6] /usr/mpi/gcc/openmpi-1.4.3/lib64/ [0x2aadcbc6495d]

[dhcp2:06536] [ 7] ./gpudirect_v2(main+0x199) [0x400e1c]

[dhcp2:06536] [ 8] /lib64/ [0x3f8ac1d994]

[dhcp2:06536] [ 9] ./gpudirect_v2 [0x400bc9]

[dhcp2:06536] *** End of error message ***


mpirun noticed that process rank 0 with PID 6536 on node exited on signal 11 (Segmentation fault).


So my question is: does the sentence “User sees direct transfer” mean that it is possible to pass a device pointer directly to MPI functions?

And if so, how is this done?

Best regards.