Data visualization

Do we have any example showing how to use the CUDA Fortran data (say 2D array) for visualization purpose, i.e. calling OpenGL interoperability

Thanks,
Tuan

Hi Tuan,

I don’t know if it’s exactly what you want, but below is an example CUDA Fortran program that includes some simple OpenGL calls. You’ll need to build the F90gl library ( http://math.nist.gov/f90gl/), see http://www.pgroup.com/resources/f90gl/f90gl129_pgi60.htm for details. Although I originally wrote this guide for use with the 6.0 compilers so may be dated, the basic steps should be the same.

Auxiliary files: delay.f90 and dclock_64.s

#ifndef DELAY
#define DELAY 1000
#endif

subroutine delay

    double precision :: s,t, dclock
    s=dclock()*1.0e+6
    t=s+DELAY
    !print *, s, t
    do while (s<t) 
       s=dclock()*1.0e+6
       !print *, s
    enddo
    
end subroutine delay



        .file   "dclock-hammer.s"
        .align    8
        .data
 .clock:  .double 0.000000000376        # 2.66 GHz
# .clock:  .double 0.000000000357        # 2.8 GHz
# .clock:  .double 0.0000000003333       # 3.0 GHz
# .clock:  .double 0.0000000003125       # 3.2 GHz

.low:   .long 0x00000000
.high:  .long 0x00000000
        .text
        
        .globl   _DCLOCK, dclock, _dclock, _dclock_, dclock_
_DCLOCK:
dclock:
_dclock:
_dclock_:
dclock_:
        .byte   0x0f, 0x31

        movl    %eax, .low(%RIP)
        movl    %edx, .high(%RIP)

        fildll  .low(%RIP)
        fmull   .clock(%RIP)
        fstpl   -24(%rsp)
        movsd   -24(%rsp), %xmm0
        ret

Main code for the game of life “lifeD.F90”

#ifdef _CUDA
#define DEVICE device,
#else
#define DEVICE 
#endif

module lifeD
 
    integer :: NXSIZE, NYSIZE, SEED
    parameter(NXSIZE=128, NYSIZE=128, SEED=2)
    real :: initPercent=0.2

contains

#ifdef _CUDA
attributes(global) subroutine life1_kernel(dOld, dNew, Nx, Ny)

	implicit none
	integer, dimension(Nx,Ny), device :: dOld, dNew
	integer, value :: Nx,Ny
	integer :: i, j, ix, iy, neighbors, state

	ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
	iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

	if (ix .gt. 1 .and. iy.gt.1 .and. &
            ix .lt. Nx .and. iy .lt. Ny) then
	
            neighbors = dOld(ix,iy-1) + &
	                dOld(ix,iy+1) +	&
	                dOld(ix+1,iy-1) + &	
	                dOld(ix+1,iy+1) + &	
	                dOld(ix-1,iy-1) + &	
	                dOld(ix-1,iy+1) + &	
	                dOld(ix-1,iy) +	&
	                dOld(ix+1,iy)
	state = dOld(ix,iy)
	
	if (state .eq. 0 .and. neighbors .eq. 3) then
	   dNew(ix,iy) = 1  ! birth
	else if (state.eq.1.and.neighbors.ne.2.and.neighbors.ne.3) then
	   dNew(ix,iy) = 0  ! death
	else
	   dNew(ix,iy) = state ! no change
        end if
	end if
	
end subroutine life1_kernel

#else
subroutine life1(dOld, dNew, Nx, Ny)

	implicit none
	integer, dimension(Nx,Ny) :: dOld, dNew
	integer, value :: Nx,Ny,tid
	integer :: i, j,ix,iy, neighbors, state

        do ix=2,Nx-1
	do iy=2,Ny-1
      
            neighbors = dOld(ix,iy-1) + &
	                dOld(ix,iy+1) +	&
	                dOld(ix+1,iy-1) + &	
	                dOld(ix+1,iy+1) + &	
	                dOld(ix-1,iy-1) + &	
	                dOld(ix-1,iy+1) + &	
	                dOld(ix-1,iy) +	&
	                dOld(ix+1,iy) 

	    state = dOld(ix,iy)

	    if (state .eq. 0 .and. neighbors .eq. 3) then
	      dNew(ix,iy) = 1  ! birth
	    else if (state.gt.0.and.neighbors.ne.2.and.neighbors.ne.3) then
	      dNew(ix,iy) = 0  ! death 0
	    else
              dNew(ix,iy) = state ! no change 
             end if

      end do
   end do

end subroutine life1

#endif

subroutine lifeDisplay ()
	
        use opengl_gl
        use opengl_glut
#ifdef _CUDA
	use cudafor
#endif
	use omp_lib
	implicit none

	integer :: Nx, Ny, Nt, temp, count, steps, i, j, angle, nthds, tnum
        integer :: numdev, devnum, istat, myseed, blocksize
	integer, allocatable, DEVICE dimension(:,:) :: dOld, dNew
	integer, allocatable, dimension(:,:) :: A, B
	integer :: start, end
	real, allocatable, dimension(:,:) :: rand
	character(len=80) :: rfile
	integer :: alive
#ifdef _CUDA
    	type(dim3) :: dimGrid, dimBlock
#else
        integer,dimension(3) :: dimGrid,dimBlock
#endif
        real :: x,y
        real :: wd,hg,rad

	! account for the halo
	Nx = NXSIZE+2
	Ny = NYSIZE+2
	myseed = SEED
	allocate(A(Nx,Ny),B(Nx,Ny),rand(Nx,Ny))
	call random_seed(myseed)
	call random_number(rand)
	A=0
	do i=2,Nx-1
	   do j=2,Ny-1
	     if (rand(i,j) < initPercent) then
	        A(i,j) = 1
	     else
                A(i,j) = 0
             endif
           enddo
        enddo
        B=A

       ! set the width and height of each box
        wd = 1.0/real(Ny)
        hg = 1.0/real(Nx)
        rad= wd/2.0;
        !print *, 'Nx:', Nx, 'Ny:', Ny, 'WD:', wd, 'HG:', hg
	! Set the inital conditions
	count = 0
	steps = 0
	temp = 0
	alive = 1		
#ifdef _CUDA
        istat = cudaGetDeviceCount(numdev)
#endif

!$omp parallel &
!$omp   shared(A,B,count,Nx,Ny,numdev,alive,steps), &
!$omp   private(tnum,dOld,dNew,dimGrid,dimBlock,devnum,istat,Nt,x,y, &
!$omp           angle,i,j,blocksize,start,end)

#ifdef _OMP
	nthds = omp_get_num_threads()	
	tnum = omp_get_thread_num()
#else
        nthds = 1
        tnum=0
#endif

!$omp master
	
!$omp end master
!$omp barrier

	blocksize =  NXSIZE/nthds
	Nt = blocksize+2
	start = (tnum*blocksize)+1
	end = start + blocksize + 1

!	if (tnum .eq. nthds-1) then
!	   Nt = Nt + mod(blocksize,nthds)
!	   end = Nx
!        endif

#ifdef _CUDA
        devnum = mod(tnum,numdev)
        istat = cudaSetDevice(devnum)
#endif
   	allocate(dNew(Nt,Ny), dOld(Nt,Ny))	

        dOld=0
	dNew=0
#ifdef _CUDA
	dimBlock = dim3(16,16,1)
        dimGrid = dim3((Nt+15)/16,(ny+15)/16,1)
#endif

	do, while (steps.lt.16384.and.count.lt.10.and.alive.gt.0)

	    dOld(1:Nt,1:Ny) = A(start:end,1:Ny)
            dNew=dOld
#ifdef _CUDA
            call life1_kernel<<<dimGrid,dimBlock>>>(dOld,dNew,Nt,Ny)
#else
            call life1(dOld,dNew,Nt,Ny)
#endif
	    B(start+1:end-1,1:Ny) = dNew(2:Nt-1,1:Ny)
!$omp barrier


!$omp master 
            A=B
	    alive = sum(A)
            print *, 'Step:', steps, ' Alive:', alive
	    call glclear(GL_COLOR_BUFFER_BIT)
            call glcolor3f(1.0_glfloat, 1.0_glfloat, 1.0_glfloat)
            do i=1,Nx
	       do j=Ny,1,-1
                   x=(i-1)*wd
                   y=(j-1)*hg                  
                   
	         if (A(i,j).gt.0) then 
                    call glBegin(GL_TRIANGLE_FAN)
                    call glVertex2f(x, y)
                    do angle=1,360,5
                        call glVertex2f(x+sin(real(angle))*rad,y+cos(real(angle))*rad)
                    enddo
                   call glEnd()
                 endif
              enddo
            enddo

            call glflush()
            call delay()  

            if (temp.lt.(alive+2).and.temp.gt.(alive-2)) then
               count = count + 1
	    else
	       count = 0
            endif
            temp = alive
	    steps = steps + 1

!	pause	

!$omp end master
!$omp barrier
 	
	end do
	
	deallocate(dOld)
	deallocate(dNew)
#ifdef _CUDA
	istat=cudaThreadExit()
#endif

!$omp end parallel
  
	alive = sum(A)
	print *, 'Step:', steps, ' Alive:',alive
	print *, 'Init%:', initPercent, ' Actual%:', real(alive)/real(NXSIZE*NYSIZE)
	close(21)
	deallocate(A,B)
		
!	stop

end subroutine lifeDisplay


end module lifeD


		
program life
 use opengl_gl
use opengl_glut
use lifeD
implicit none

real(kind=glfloat),  parameter :: zero  = 0.0_glfloat
real(kind=gldouble), parameter :: dzero = 0.0_gldouble, &
                                  one   = 1.0_gldouble
integer(kind=glcint) :: iwin,windW,windH
integer(kind=glenum) :: itemp

windW=1024
windH=1024

! Declare initial window size, position, and display mode
! (single buffer and RGBA).  Open window with "LIFE"
! in its title bar.
call glutInitWindowSize(windW,windH)
call glutinit()
call glutinitdisplaymode(ior(GLUT_SINGLE,GLUT_RGB))
iwin = glutcreatewindow("LIFE")

! select clearing color
call glclearcolor(zero, zero, zero, zero)

! initialize viewing values
itemp = GL_PROJECTION
call glmatrixmode(itemp)
call glloadidentity()
call glortho(dzero, one, dzero, one, -one, one)

! Register callback function to display graphics.
! Enter main loop and process events.
call glutdisplayfunc(lifeDisplay)
call glutmainloop()
end program life

Compilation commands:

pgfortran -I./f90gl-1.2.15/include/GL  -mp -fast -c -Mpreprocess   -Mcuda=cuda3.1  ./lifeD.F90 -o .//lifeD.o
pgfortran -g -c -Mpreprocess -DDELAY=10000 ./delay.f90 -o ./delay.o
pgfortran -c ./dclock_64.s -o ./dclock_64.o
pgfortran -mp -fast   -Mcuda=cuda3.1  ./lifeD.o ./delay.o ./dclock_64.o -L./f90gl-1.2.15/lib  -lglut1 -L/lib -lGLU -lGL -lf90glut -lf90GLU -lf90GL -lglut1 -L/usr/X11R6/lib -lXaw -lXt -lXmu -lXi -lXext -lX11 -o lifeD.out

Also, the code can be compiled with or without OpenMP (add or remove -mp) and with or without CUDA (add remove -Mcuda).

Hope this helps,
Mat

Thanks a lot Mat.
I’ll check it.

Tuan

Hi Mat.

I don’t think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions. It would be great if you have a similar one that use CUDA-OpenGL interop in Fortran.

Also, when I try to write one by myself. I use the cudaGLSetGLDevice(devID), rather than cudaSetDevice(devID) and the sample code crash, with “segmentation fault”, when I try to allocate data on device memory.

Here, I created an explicit interface to cudaGLSetGLDevice as CUDA Fortran has no binding for that.

PROGRAM ABC
  USE CUDAFOR
  USE CUDA_INTEROP

  REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
  integer :: gpuID

  gpuID = 0
  IF (cudaGLSetGLDevice( gpuID ) ) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
    CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID
  ENDIF

   allocate(mat_dev(100, 100))
   
END PROGRAM

MODULE CUDA_INTEROP
 USE CUDAFOR
 USE ISO_C_BINDING

 INTERFACE
    FUNCTION cudaGLSetGLDevice (device)  &
          BIND(C, name="cudaGLSetGLDevice") RESULT(res)
      USE iso_c_binding
      INTEGER(c_int), VALUE :: device
      INTEGER(KIND(cudaSuccess)) :: res
    END FUNCTION cudaGLSetGLDevice

  END INTERFACE
 
CONTAINS
END MODULE

What is your suggestion? I’m using Fortran 10.9, CUDA 3.1.

Tuan

I don’t think this sample code use CUDA-OpenGL interop. Indeed, what the it does is to copy the data from device back to host before calling the rendering functions

Correct and why I didn’t think it was what you wanted, it’s just what I had on hand.

What is your suggestion? I’m using Fortran 10.9, CUDA 3.1.

This is new for me so don’t have much for you here. I’ll ask my contacts at NVIDIA to see if they can help.

I was able to work around the seg fault by allocating the array before calling cudaGLSetGLDevice. I’m sure why the segv occurred but hopefully it will get you farther along as you blaze trail here.

% cat test.cuf 

MODULE CUDA_INTEROP
 USE CUDAFOR
 USE ISO_C_BINDING

 INTERFACE
    FUNCTION cudaGLSetGLDevice (device)  &
          BIND(C, name="cudaGLSetGLDevice") RESULT(res)
      USE iso_c_binding
      INTEGER(c_int), VALUE :: device
      INTEGER(KIND(cudaSuccess)) :: res
    END FUNCTION cudaGLSetGLDevice

  END INTERFACE
 
CONTAINS
END MODULE 

  PROGRAM ABC
  USE CUDAFOR
  USE CUDA_INTEROP

  REAL, DIMENSION(:,:), ALLOCATABLE, device :: mat_dev
  REAL, DIMENSION(:,:), ALLOCATABLE :: mat
  integer :: gpuID
 

  gpuID = 0
  istat = cudaSetDevice(gpuID)  
  allocate(mat(100, 100), mat_dev(100, 100))

  IF (cudaGLSetGLDevice( gpuID ) ) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
 !   CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID
  ENDIF
  mat_dev=1.0
  mat = mat_dev

  print *, mat(1,1) 
END PROGRAM

Hi Mat.
CUDA C Prog. guide mentioned that cudaSetDeivce() and cudaGLSetGLDevice() are mutually exclusive. So, is it okay to use both in the same code?


Is this a CUDA bug or something from the compiler?

Regards,
Tuan

Hi Mat,
Actually, calling both cudaSetDevice and cudaGLSetGLDevice() doesn’t work.
I got the error message

IF (cudaSetDevice( gpuID ) .NE. cudaSuccess) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
    CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID
  ENDIF

 IF (cudaGLSetGLDevice( gpuID ) .NE. cudaSuccess) THEN
    PRINT *, "Cannot set GPU device: ", gpuID
    WRITE(*,*) cudaGetErrorString(cudaSetDevice(gpuID))
    CALL cleanup()
    STOP
  ELSE
    PRINT *, "We use device ID ", gpuID, " for OpenGL"
  ENDIF



“setting the device when a process is active is not allowed”

Is there a solution to work-around for now?

Thanks,
Tuan

Hi Tuan,

This is new for me as well so don’t know the answer. I’m waiting for a reply from my contacts at NVIDIA for advice.

  • Mat

Thanks Mat. Please keep me updated when you get the response.

Tuan

Hi Mat,
May I ask you any update?

Thanks,

Tuan

Hi Tuan,

I doesn’t look like something we’ll be able to support, at least in the short term. As part of our CUDA C compiler, it appears that we’ll need to add OpenGL support so we may be able to fold this work back into CUDA Fortran. In the meantime, you’ll need to use the method I showed in my Game of Life program posted about.

Thanks,
Mat

Thanks Mat.

Sadly, this will make CUDA Fortran less competitive than CUDA C. Many scientific problems should be able to take advantages of both computational and graphical power of Fermi.

As mentioned in the CUDA Fortran manual, I thought any non-implemented runtime CUDA C API, including CUDA-OpenGL interop, can also be invoked from Fortran.

So, what if I write the main function from C, where I’ll call CUDA C-OpenGL initialization functions; and then use C-Fortran interop to call my existing function in Fortran? Can I use CUDA Fortran code as shared library or at least with static library?

Thank you,
Tuan