MPIFORT + CUDA FORTRAN - Passing pointer from Fortran (MPIFORT) to CUDA Fortran (PGIF90) and allocating memory on device

Hi All,

I have existing Fortran code (compiled with MPIFORT) which I want to add a CUDA-Fortran extension to (using PGF90). What I’m trying to do is pass a Fortran pointer to CUDA-Fortran, allocate memory on the device, and then pass this back to the original Fortran code.

I’ve used this query to do the same for Fortran/C-CUDA, and now I’m trying it for Fortran-CUDA.

Below is my code, which compiles but errors out with a segmentation fault at h2d:

memtest.f90

program memtest
use iso_c_binding

integer, allocatable     :: a(:)
type(c_ptr) :: pa(:)
integer(c_int)  :: n=2
integer i

allocate(a(n))
a=1

write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "init:"
call init_device(pa,n)

write (*,*) "h2d:"
call h2d(a,pa,n)

write (*,*) "increment:"
call increment(pa,10,n)

write (*,*) "d2h:"
call d2h(pa,a,n)
write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "increment again:"
call increment(pa,10,n)

write (*,*) "d2h:"
call d2h(pa,a,n)
write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "free:"
call free_device(pa)
deallocate(a)
end program

fcuda.cuf

module increment_m
contains
attributes(global) subroutine increment_d(a_d, inc, n)
	implicit none
	integer, value  :: inc, n
	integer, device :: a_d(n)
	integer id
	id=threadIdx%x
	if (id.le.n) then
		a_d(id) = a_d(id) + inc;
	endif
end subroutine increment_d
end module increment_m

subroutine increment(pa, inc, n)
use cudafor
use increment_m
	implicit none
	integer :: inc, n
	integer, device :: pa(n)
	call increment_d<<<1,n>>>(pa, inc, n);
end subroutine increment

subroutine init_device(pa, n)
use cudafor
use increment_m
	implicit none
	integer :: n
	integer, device, pointer :: pa(:)
	allocate(pa(n))
end subroutine init_device

subroutine free_device(pa, n)
use cudafor
use increment_m
	implicit none
	integer :: n
	integer, device, pointer :: pa(:)
	deallocate(pa)
end subroutine free_device

subroutine h2d(h, d, n)
use cudafor
use increment_m
	integer :: n
	integer :: h(n)
	integer, device, pointer :: d(:)
	d=h
end subroutine h2d

subroutine d2h(d, h, n)
use cudafor
use increment_m
	integer :: n
	integer :: h(n)
	integer, device, pointer :: d(:)
	h=d
end subroutine d2h

Makefile

CUDA_INSTALL_PATH=/[NVIDIA_PATH]/20.11/compilers/
MPI_INSTALL_PATH=/[NVIDIA_PATH}/20.11/comm_libs/mpi/

PGF90=$(CUDA_INSTALL_PATH)/bin/pgf90
IFORT90=$(MPI_INSTALL_PATH)/bin/mpifort

CUDALIB=-L/[NVIDIA PATH]/20.11/cuda/10.2/lib64 -lcudart

FFILES=memtest.f90
CUFILES=fcuda.cuf
OBJECTS=memtest.o fcuda.o
EXENAME= memtest

all:
	$(IFORT90) -c $(FFILES)
	$(PGF90)  $(CUDALIB) -c $(CUFILES)
	$(IFORT90) $(OBJECTS) -Mcuda $(CUDALIB) -o ${EXENAME}

clean:
	rm -f *.o
	rm -f *.mod
	rm -f ${EXENAME}

Thanks in advance for your help!

Hi GitPuller,

Since you’re pointing the mpifort we ship but using “IFORT90” as the label in the Makefile so I’m not 100% clear if the larger code is being compiled by Intel or NVHPC. The answer below works for both, but the Makefiles will be different. However, if your intent was to only use NVHPC, there’s probably easier ways to do this. Though, I’m answering as if you want to use Intel (below I’m using Intel’s OpenAPI ifx compiler)

This is a bit tricky since Intel as no notion of CUDA and F90 features such as modules and array’s with descriptors are not compatible between Fortran compilers. What I would suggest is to pass raw C pointers between the two and then use the Cuda API calls directly. For the kernel calls, you’ll need to associate the raw C pointer with a device array using an “c_f_pointer” call.

fcuda.cuf:

module increment_m

contains
attributes(global) subroutine increment_d(a_d, inc, n)
        implicit none
        integer, value  :: inc, n
        integer :: a_d(n)
        integer id
        id=threadIdx%x
        if (id.le.n) then
                a_d(id) = a_d(id) + inc;
        endif
end subroutine increment_d
end module increment_m

subroutine increment(pa, inc, n)
use cudafor
use, intrinsic :: iso_c_binding
use increment_m
        implicit none
        integer :: inc, n, err
        type(C_DEVPTR) :: pa
        integer, device, allocatable :: p(:)
        call c_f_pointer(pa,p,(/n/))
        call increment_d<<<1,n>>>(p, inc, n);
        err = cudaDeviceSynchronize()
end subroutine increment

subroutine init_device(pa, n)
use cudafor
use increment_m
        implicit none
        integer :: n, err
        type(C_DEVPTR) :: pa
        err = cudaMalloc(pa,4*n)
end subroutine init_device

subroutine free_device(pa)
use cudafor
use increment_m
        implicit none
        type(C_DEVPTR) :: pa
        integer err
        err = cudaFree(pa)
end subroutine free_device

subroutine h2d(h, d, n)
use cudafor
use iso_c_binding
use increment_m
        integer :: n, err
        integer, target :: h(*)
        type(C_DEVPTR) :: d
        type(C_PTR) :: hptr
        hptr = C_LOC(h)
        err = cudaMemcpy(d,hptr,n*4,cudaMemcpyHostToDevice)
end subroutine h2d

subroutine d2h(d, h, n)
use cudafor
use iso_c_binding
use increment_m
        integer :: n, err
        integer, target :: h(*)
        type(C_DEVPTR) :: d
        type(C_PTR) :: hptr
        hptr = C_LOC(h)
        err = cudaMemcpy(hptr,d,n*4,cudaMemcpyDeviceToHost)
end subroutine d2h

memtest.f90

program memtest
use iso_c_binding

integer, allocatable     :: a(:)
type(c_ptr) :: pa
integer(c_int)  :: n=2
integer i

allocate(a(n))
a=1

write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "init:"
call init_device(pa,n)

write (*,*) "h2d:"
call h2d(a,pa,n)

write (*,*) "increment:"
call increment(pa,10,n)

write (*,*) "d2h:"
call d2h(pa,a,n)
write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "increment again:"
call increment(pa,10,n)

write (*,*) "d2h:"
call d2h(pa,a,n)
write (*,*) "a = "
do i = 1,n
    print*,a(i)
enddo

write (*,*) "free:"
call free_device(pa)
deallocate(a)
end program

Makefile,ifx:

NVIDIA_PATH=/opt/nv  # << change this
NV_INSTALL_PATH=$(NVIDIA_PATH)/Linux_x86_64/21.5/compilers
NV_CUDA_PATH=$(NVIDIA_PATH)/Linux_x86_64/21.5/cuda/11.3/lib64

NVF90=nvfortran
IFORT90=ifx

FFILES=memtest.f90
CUFILES=fcuda.cuf
OBJECTS=memtest.o fcuda.o
EXENAME= memtest
NVFLAGS=-Mallocatable=95
NVLIBS=-L$(NV_INSTALL_PATH)/lib -L$(NV_CUDA_PATH) -lcudafor_113 -lcudafor -lcudadevrt -lcudart -lcudafor2 -lnvf -lnvomp  -lnvcpumath -lnsnvc -lnvc

all:
        $(IFORT90) -c $(FFILES)
        $(NVF90) $(NVFLAGS) -gpu=nordc -c $(CUFILES)
        $(IFORT90) $(OBJECTS) $(NVLIBS) -o ${EXENAME}

clean:
        rm -f *.o
        rm -f *.mod
        rm -f ${EXENAME}

Makefile.nv

NVIDIA_PATH=/proj/nv
NV_INSTALL_PATH=$(NVIDIA_PATH)/Linux_x86_64/21.5/compilers
NV_CUDA_PATH=$(NVIDIA_PATH)/Linux_x86_64/21.5/cuda/11.3/lib64

NVF90=nvfortran
IFORT90=mpifort

FFILES=memtest.f90
CUFILES=fcuda.cuf
OBJECTS=memtest.o fcuda.o
EXENAME= memtest
NVFLAGS=-Mallocatable=95
NVLIBS=-cuda

all:
        $(IFORT90) -c $(FFILES)
        $(NVF90) $(NVFLAGS) -c $(CUFILES)
        $(IFORT90) $(OBJECTS) $(NVLIBS) -o ${EXENAME}

clean:
        rm -f *.o
        rm -f *.mod
        rm -f ${EXENAME}

Output:

% make -f Makefile.ifx
ifx -c memtest.f90
nvfortran -Mallocatable=95 -gpu=nordc -c fcuda.cuf
ifx memtest.o fcuda.o -L/proj/nv/Linux_x86_64/21.5/compilers/lib -L/proj/nv/Linux_x86_64/21.5/cuda/11.3/lib64 -lcudafor_113 -lcudafor -lcudadevrt -lcudart -lcudafor2 -lnvf -lnvomp  -lnvcpumath -lnsnvc -lnvc  -o memtest
% ./memtest
 a =
           1
           1
 init:
 h2d:
 increment:
 d2h:
 a =
          11
          11
 increment again:
 d2h:
 a =
          21
          21
 free:
% make -f Makefile.nv
mpifort -c memtest.f90
nvfortran -Mallocatable=95 -c fcuda.cuf
mpifort memtest.o fcuda.o -cuda -o memtest
% ./memtest
 a =
            1
            1
 init:
 h2d:
 increment:
 d2h:
 a =
           11
           11
 increment again:
 d2h:
 a =
           21
           21
 free:

Hope this helps,
Mat

1 Like

Hi Mat,

That’s correct that the larger project is none-NVHPC mpifort. These solutions worked perfectly for me, thank you!