Do we have any example showing how to use the CUDA Fortran data (say 2D array) for visualization purpose, i.e. calling OpenGL interoperability
Thanks,
Tuan
Do we have any example showing how to use the CUDA Fortran data (say 2D array) for visualization purpose, i.e. calling OpenGL interoperability
Thanks,
Tuan
Hi Tuan,
I don’t know if it’s exactly what you want, but below is an example CUDA Fortran program that includes some simple OpenGL calls. You’ll need to build the F90gl library ( f90gl: Fortran interface for OpenGL and GLUT), see http://www.pgroup.com/resources/f90gl/f90gl129_pgi60.htm for details. Although I originally wrote this guide for use with the 6.0 compilers so may be dated, the basic steps should be the same.
Auxiliary files: delay.f90 and dclock_64.s
#ifndef DELAY
#define DELAY 1000
#endif
subroutine delay
double precision :: s,t, dclock
s=dclock()*1.0e+6
t=s+DELAY
!print *, s, t
do while (s<t)
s=dclock()*1.0e+6
!print *, s
enddo
end subroutine delay
.file "dclock-hammer.s"
.align 8
.data
.clock: .double 0.000000000376 # 2.66 GHz
# .clock: .double 0.000000000357 # 2.8 GHz
# .clock: .double 0.0000000003333 # 3.0 GHz
# .clock: .double 0.0000000003125 # 3.2 GHz
.low: .long 0x00000000
.high: .long 0x00000000
.text
.globl _DCLOCK, dclock, _dclock, _dclock_, dclock_
_DCLOCK:
dclock:
_dclock:
_dclock_:
dclock_:
.byte 0x0f, 0x31
movl %eax, .low(%RIP)
movl %edx, .high(%RIP)
fildll .low(%RIP)
fmull .clock(%RIP)
fstpl -24(%rsp)
movsd -24(%rsp), %xmm0
ret
Main code for the game of life “lifeD.F90”
#ifdef _CUDA
#define DEVICE device,
#else
#define DEVICE
#endif
module lifeD
integer :: NXSIZE, NYSIZE, SEED
parameter(NXSIZE=128, NYSIZE=128, SEED=2)
real :: initPercent=0.2
contains
#ifdef _CUDA
attributes(global) subroutine life1_kernel(dOld, dNew, Nx, Ny)
implicit none
integer, dimension(Nx,Ny), device :: dOld, dNew
integer, value :: Nx,Ny
integer :: i, j, ix, iy, neighbors, state
ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
iy = (blockIdx%y-1)*blockDim%y + threadIdx%y
if (ix .gt. 1 .and. iy.gt.1 .and. &
ix .lt. Nx .and. iy .lt. Ny) then
neighbors = dOld(ix,iy-1) + &
dOld(ix,iy+1) + &
dOld(ix+1,iy-1) + &
dOld(ix+1,iy+1) + &
dOld(ix-1,iy-1) + &
dOld(ix-1,iy+1) + &
dOld(ix-1,iy) + &
dOld(ix+1,iy)
state = dOld(ix,iy)
if (state .eq. 0 .and. neighbors .eq. 3) then
dNew(ix,iy) = 1 ! birth
else if (state.eq.1.and.neighbors.ne.2.and.neighbors.ne.3) then
dNew(ix,iy) = 0 ! death
else
dNew(ix,iy) = state ! no change
end if
end if
end subroutine life1_kernel
#else
subroutine life1(dOld, dNew, Nx, Ny)
implicit none
integer, dimension(Nx,Ny) :: dOld, dNew
integer, value :: Nx,Ny,tid
integer :: i, j,ix,iy, neighbors, state
do ix=2,Nx-1
do iy=2,Ny-1
neighbors = dOld(ix,iy-1) + &
dOld(ix,iy+1) + &
dOld(ix+1,iy-1) + &
dOld(ix+1,iy+1) + &
dOld(ix-1,iy-1) + &
dOld(ix-1,iy+1) + &
dOld(ix-1,iy) + &
dOld(ix+1,iy)
state = dOld(ix,iy)
if (state .eq. 0 .and. neighbors .eq. 3) then
dNew(ix,iy) = 1 ! birth
else if (state.gt.0.and.neighbors.ne.2.and.neighbors.ne.3) then
dNew(ix,iy) = 0 ! death 0
else
dNew(ix,iy) = state ! no change
end if
end do
end do
end subroutine life1
#endif
subroutine lifeDisplay ()
use opengl_gl
use opengl_glut
#ifdef _CUDA
use cudafor
#endif
use omp_lib
implicit none
integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
integer :: numdev, devnum, istat, myseed, blocksize
integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
integer, allocatable, dimension(:,:) :: A, B
integer :: start, end
real, allocatable, dimension(:,:) :: rand
character(len=80) :: rfile
integer :: alive
#ifdef _CUDA
type(dim3) :: dimGrid, dimBlock
#else
integer,dimension(3) :: dimGrid,dimBlock
#endif
real :: x,y
real :: wd,hg,rad
! account for the halo
Nx = NXSIZE+2
Ny = NYSIZE+2
myseed = SEED
allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
call random_seed(myseed)
call random_number(rand)
A=0
do i=2,Nx-1
do j=2,Ny-1
if (rand(i,j) < initPercent) then
A(i,j) = 1
else
A(i,j) = 0
endif
enddo
enddo
B=A
! set the width and height of each box
wd = 1.0/real(Ny)
hg = 1.0/real(Nx)
rad= wd/2.0;
!print *, 'Nx:', Nx, 'Ny:', Ny, 'WD:', wd, 'HG:', hg
! Set the inital conditions
count = 0
steps = 0
temp = 0
alive = 1
#ifdef _CUDA
istat = cudaGetDeviceCount(numdev)
#endif
!$omp parallel &
!$omp shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt,x,y, &
!$omp angle,i,j,blocksize,start,end)
#ifdef _OMP
nthds = omp_get_num_threads()
tnum = omp_get_thread_num()
#else
nthds = 1
tnum=0
#endif
!$omp master
!$omp end master
!$omp barrier
blocksize = NXSIZE/nthds
Nt = blocksize+2
start = (tnum*blocksize)+1
end = start + blocksize + 1
! if (tnum .eq. nthds-1) then
! Nt = Nt + mod(blocksize,nthds)
! end = Nx
! endif
#ifdef _CUDA
devnum = mod(tnum,numdev)
istat = cudaSetDevice(devnum)
#endif
allocate(dNew(Nt,Ny), dOld(Nt,Ny))
dOld=0
dNew=0
#ifdef _CUDA
dimBlock = dim3(16,16,1)
dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif
do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)
dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
dNew=dOld
#ifdef _CUDA
call life1_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
call life1(dOld,dNew,Nt,Ny)
#endif
B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier
!$omp master
A=B
alive = sum(A)
print *, 'Step:', steps, ' Alive:', alive
call glclear(GL_COLOR_BUFFER_BIT)
call glcolor3f(1.0_glfloat, 1.0_glfloat, 1.0_glfloat)
do i=1,Nx
do j=Ny,1,-1
x=(i-1)*wd
y=(j-1)*hg
if (A(i,j).gt.0) then
call glBegin(GL_TRIANGLE_FAN)
call glVertex2f(x, y)
do angle=1,360,5
call glVertex2f(x+sin(real(angle))*rad,y+cos(real(angle))*rad)
enddo
call glEnd()
endif
enddo
enddo
call glflush()
call delay()
if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
count = count + 1
else
count = 0
endif
temp = alive
steps = steps + 1
! pause
!$omp end master
!$omp barrier
end do
deallocate(dOld)
deallocate(dNew)
#ifdef _CUDA
istat=cudaThreadExit()
#endif
!$omp end parallel
alive = sum(A)
print *, 'Step:', steps, ' Alive:',alive
print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
close(21)
deallocate(A,B)
! stop
end subroutine lifeDisplay
end module lifeD
program life
use opengl_gl
use opengl_glut
use lifeD
implicit none
real(kind=glfloat), parameter :: zero = 0.0_glfloat
real(kind=gldouble), parameter :: dzero = 0.0_gldouble, &
one = 1.0_gldouble
integer(kind=glcint) :: iwin,windW,windH
integer(kind=glenum) :: itemp
windW=1024
windH=1024
! Declare initial window size, position, and display mode
! (single buffer and RGBA). Open window with "LIFE"
! in its title bar.
call glutInitWindowSize(windW,windH)
call glutinit()
call glutinitdisplaymode(ior(GLUT_SINGLE,GLUT_RGB))
iwin = glutcreatewindow("LIFE")
! select clearing color
call glclearcolor(zero, zero, zero, zero)
! initialize viewing values
itemp = GL_PROJECTION
call glmatrixmode(itemp)
call glloadidentity()
call glortho(dzero, one, dzero, one, -one, one)
! Register callback function to display graphics.
! Enter main loop and process events.
call glutdisplayfunc(lifeDisplay)
call glutmainloop()
end program life
Compilation commands:
pgfortran -I./f90gl-1.2.15/include/GL -mp -fast -c -Mpreprocess -Mcuda=cuda3.1 ./lifeD.F90 -o .//lifeD.o
pgfortran -g -c -Mpreprocess -DDELAY=10000 ./delay.f90 -o ./delay.o
pgfortran -c ./dclock_64.s -o ./dclock_64.o
pgfortran -mp -fast -Mcuda=cuda3.1 ./lifeD.o ./delay.o ./dclock_64.o -L./f90gl-1.2.15/lib -lglut1 -L/lib -lGLU -lGL -lf90glut -lf90GLU -lf90GL -lglut1 -L/usr/X11R6/lib -lXaw -lXt -lXmu -lXi -lXext -lX11 -o lifeD.out
Also, the code can be compiled with or without OpenMP (add or remove -mp) and with or without CUDA (add remove -Mcuda).
Hope this helps,
Mat
Thanks a lot Mat.
I’ll check it.
Tuan
Hi Mat.
I don’t think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions. It would be great if you have a similar one that use CUDA-OpenGL interop in Fortran.
Also, when I try to write one by myself. I use the cudaGLSetGLDevice(devID), rather than cudaSetDevice(devID) and the sample code crash, with “segmentation fault”, when I try to allocate data on device memory.
Here, I created an explicit interface to cudaGLSetGLDevice as CUDA Fortran has no binding for that.
PROGRAM ABC
USE CUDAFOR
USE CUDA_INTEROP
REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
integer :: gpuID
gpuID = 0
IF (cudaGLSetGLDevice( gpuID ) ) THEN
PRINT *, "Cannot set GPU device: ", gpuID
WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
CALL cleanup()
STOP
ELSE
PRINT *, "We use device ID ", gpuID
ENDIF
allocate(mat_dev(100, 100))
END PROGRAM
MODULE CUDA_INTEROP
USE CUDAFOR
USE ISO_C_BINDING
INTERFACE
FUNCTION cudaGLSetGLDevice (device) &
BIND(C, name="cudaGLSetGLDevice") RESULT(res)
USE iso_c_binding
INTEGER(c_int), VALUE :: device
INTEGER(KIND(cudaSuccess)) :: res
END FUNCTION cudaGLSetGLDevice
END INTERFACE
CONTAINS
END MODULE
What is your suggestion? I’m using Fortran 10.9, CUDA 3.1.
Tuan
I don’t think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions
Correct and why I didn’t think it was what you wanted, it’s just what I had on hand.
What is your suggestion? I’m using Fortran 10.9, CUDA 3.1.
This is new for me so don’t have much for you here. I’ll ask my contacts at NVIDIA to see if they can help.
I was able to work around the seg fault by allocating the array before calling cudaGLSetGLDevice. I’m sure why the segv occurred but hopefully it will get you farther along as you blaze trail here.
% cat test.cuf
MODULE CUDA_INTEROP
USE CUDAFOR
USE ISO_C_BINDING
INTERFACE
FUNCTION cudaGLSetGLDevice (device) &
BIND(C, name="cudaGLSetGLDevice") RESULT(res)
USE iso_c_binding
INTEGER(c_int), VALUE :: device
INTEGER(KIND(cudaSuccess)) :: res
END FUNCTION cudaGLSetGLDevice
END INTERFACE
CONTAINS
END MODULE
PROGRAM ABC
USE CUDAFOR
USE CUDA_INTEROP
REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
REAL, DIMENSION(:,:), ALLOCATABLE :: mat
integer :: gpuID
gpuID = 0
istat = cudaSetDevice(gpuID)
allocate(mat(100, 100), mat_dev(100, 100))
IF (cudaGLSetGLDevice( gpuID ) ) THEN
PRINT *, "Cannot set GPU device: ", gpuID
WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
! CALL cleanup()
STOP
ELSE
PRINT *, "We use device ID ", gpuID
ENDIF
mat_dev=1.0
mat = mat_dev
print *, mat(1,1)
END PROGRAM
Hi Mat.
CUDA C Prog. guide mentioned that cudaSetDeivce() and cudaGLSetGLDevice() are mutually exclusive. So, is it okay to use both in the same code?
Is this a CUDA bug or something from the compiler?
Regards,
Tuan
Hi Mat,
Actually, calling both cudaSetDevice and cudaGLSetGLDevice() doesn’t work.
I got the error message
IF (cudaSetDevice( gpuID ) .NE. cudaSuccess) THEN
PRINT *, "Cannot set GPU device: ", gpuID
WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
CALL cleanup()
STOP
ELSE
PRINT *, "We use device ID ", gpuID
ENDIF
IF (cudaGLSetGLDevice( gpuID ) .NE. cudaSuccess) THEN
PRINT *, "Cannot set GPU device: ", gpuID
WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
CALL cleanup()
STOP
ELSE
PRINT *, "We use device ID ", gpuID, " for OpenGL"
ENDIF
“setting the device when a process is active is not allowed”
Is there a solution to work-around for now?
Thanks,
Tuan
Hi Tuan,
This is new for me as well so don’t know the answer. I’m waiting for a reply from my contacts at NVIDIA for advice.
Thanks Mat. Please keep me updated when you get the response.
Tuan
Hi Mat,
May I ask you any update?
Thanks,
Tuan
Hi Tuan,
I doesn’t look like something we’ll be able to support, at least in the short term. As part of our CUDA C compiler, it appears that we’ll need to add OpenGL support so we may be able to fold this work back into CUDA Fortran. In the meantime, you’ll need to use the method I showed in my Game of Life program posted about.
Thanks,
Mat
Thanks Mat.
Sadly, this will make CUDA Fortran less competitive than CUDA C. Many scientific problems should be able to take advantages of both computational and graphical power of Fermi.
As mentioned in the CUDA Fortran manual, I thought any non-implemented runtime CUDA C API, including CUDA-OpenGL interop, can also be invoked from Fortran.
So, what if I write the main function from C, where I’ll call CUDA C-OpenGL initialization functions; and then use C-Fortran interop to call my existing function in Fortran? Can I use CUDA Fortran code as shared library or at least with static library?
Thank you,
Tuan