GPUDirect v2 and MPI

Hello,

In this presentation, http://www.cc.gatech.edu/~vetter/keeneland/tutorial-2011-04-14/13-cuda_advmpi_keeneland.pdf, slide 17 states that with GPUDirect v2 the user no longer has to express the memory transfers from device to system memory explicitly when sending/receiving data with MPI over InfiniBand (“User sees direct transfer”), I think thanks to UVA.
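To make concrete what I mean by “express the memory transfers”: without GPUDirect v2 I would stage the data through a host buffer myself, whereas my reading of slide 17 is that the device pointer can be handed to MPI directly. A minimal sketch of the two variants (a_h, a_d and NBYTES stand for the host buffer, device buffer and size used in the real program below):

    /* what I would normally write: stage through a host buffer first */
    cudaMemcpy(a_h, a_d, NBYTES, cudaMemcpyDeviceToHost);
    MPI_Send(a_h, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);

    /* what I understand GPUDirect v2 + UVA to allow: pass the device pointer directly */
    MPI_Send(a_d, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);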

So I tried to write the simple program below to validate the second, direct variant, but it does not work.

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>

#define NREPEAT 1
#define NBYTES  10.e6

int IsAppBuiltAs64() {
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
    return 1;
#else
    return 0;
#endif
}

int main (int argc, char *argv[])
{
    int rank, size, n, len;
    int result;
    void *a_h, *a_d;
    struct timeval time[2];
    double bandwidth;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;
    int device = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Get_processor_name(hostname, &len);
    printf("Process %d is on %s\n", rank, hostname);

    /* select device: both ranks use device 0 (the ranks run on different nodes) */
    if (rank == 0) {
        cudaSetDevice(0);
    }
    else {
        cudaSetDevice(0);
    }

    /* display the unified addressing capability of the device */
    struct cudaDeviceProp prop;
    result = cudaGetDeviceProperties(&prop, device);
    if (result != cudaSuccess) {
        printf("ERROR: %s: cudaGetDeviceProperties failed, error code: %d, which means: %s\n",
               hostname, result, cudaGetErrorString(result));
    }
    printf("rank %d prop.unifiedAddressing=%d\n", rank, prop.unifiedAddressing);

    /* device memory allocation */
    result = cudaMalloc((void **) &a_d, NBYTES);
    if (result != cudaSuccess) {
        printf("ERROR: %s: cudaMalloc failed, error code: %d, which means: %s\n",
               hostname, result, cudaGetErrorString(result));
        exit(1);
    }

    printf("rank %d build as 64 %d\n", rank, IsAppBuiltAs64());

    /* Test MPI send/recv bandwidth, passing the device pointer a_d directly to MPI. */
    MPI_Barrier(MPI_COMM_WORLD);
    gettimeofday(&time[0], NULL);
    for (n = 0; n < NREPEAT; n++)
    {
        if (rank == 0) {
            MPI_Send(a_d, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);
        }
        else {
            MPI_Recv(a_d, NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        }
    }
    gettimeofday(&time[1], NULL);

    bandwidth  =        time[1].tv_sec  - time[0].tv_sec;
    bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
    bandwidth  = NBYTES*NREPEAT/1.e6/bandwidth;

    if (rank == 0)
        printf("MPI send/recv bandwidth: %f MB/sec\n", bandwidth);

    cudaFree(a_d);

    MPI_Finalize();

    return 0;
}

Here is the console output I obtain:

[xxxx@dhcp1 test_mpi_gpudirect_v2]$ ./runib
Process 1 is on dhcp1
Process 0 is on dhcp2
rank 1 prop.unifiedAddressing=1
rank 0 prop.unifiedAddressing=1
rank 1 build as 64 1
rank 0 build as 64 1
[dhcp2:06536] *** Process received signal ***
[dhcp2:06536] Signal: Segmentation fault (11)
[dhcp2:06536] Signal code: Invalid permissions (2)
[dhcp2:06536] Failing at address: 0x200100000
[dhcp2:06536] [ 0] /lib64/libpthread.so.0 [0x3f8b80eb10]
[dhcp2:06536] [ 1] /lib64/libc.so.6(memcpy+0x15b) [0x3f8ac7c39b]
[dhcp2:06536] [ 2] /usr/mpi/gcc/openmpi-1.4.3/lib64/libmpi.so.0(ompi_convertor_pack+0x12c) [0x2aadcbc4e84c]
[dhcp2:06536] [ 3] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/mca_btl_openib.so [0x2aadcf216c82]
[dhcp2:06536] [ 4] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/mca_pml_ob1.so [0x2aadce5d0935]
[dhcp2:06536] [ 5] /usr/mpi/gcc/openmpi-1.4.3/lib64/openmpi/mca_pml_ob1.so [0x2aadce5c6f90]
[dhcp2:06536] [ 6] /usr/mpi/gcc/openmpi-1.4.3/lib64/libmpi.so.0(PMPI_Send+0x13d) [0x2aadcbc6495d]
[dhcp2:06536] [ 7] ./gpudirect_v2(main+0x199) [0x400e1c]
[dhcp2:06536] [ 8] /lib64/libc.so.6(__libc_start_main+0xf4) [0x3f8ac1d994]
[dhcp2:06536] [ 9] ./gpudirect_v2 [0x400bc9]
[dhcp2:06536] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 6536 on node 192.168.0.1 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

So, my question is: does the sentence “User sees direct transfer” mean that it is possible to pass a device pointer directly to MPI functions, and if so, how is it possible?
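My understanding of UVA, which may be wrong, is that device and host allocations share one address space, so a library can at least detect where a given buffer resides; that is what I imagine a CUDA-aware MPI would rely on. A sketch of such a check with the CUDA 4.0 runtime API, using the a_d buffer from the program above (this is just my own diagnostic idea, not something taken from the slides):

    struct cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, a_d) == cudaSuccess &&
        attr.memoryType == cudaMemoryTypeDevice) {
        /* the runtime knows a_d lives in device memory on device attr.device */
        printf("a_d is device memory on device %d\n", attr.device);
    }

So I would have expected the MPI library to be able to recognize the device pointer in a similar way, unless something more than UVA is needed on the MPI side.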

Best regards.