Finding minval and maxval using CUF kernels

Hi,

I am having trouble with this bit of code. The idea is to go through a vector and find the min and max values in order to determine the bounds of a simulation domain.

It works well on the CPU. When I try to do it with a CUF kernel, I get an incorrect answer. I am looking for an easy way to do this without getting into really complicated CUDA kernels…

I have set up a little test:

Program Bounds

Use cudafor

Implicit None

! Host Variables
Integer:: d, i, im
Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
Integer:: nCell(3)                      ! Number of cells in the domain
Real:: Bound(3,2)                       ! Bounds for the domain
Real:: x(nTotal,3)                      ! Position vector for nodes

! Device Variables
Integer, Device:: nCell_d(3)            ! Number of cells in the domain
Real, Device:: Bound_d(3,2)             ! Bounds for the domain
Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

! Initialize the position vector to zero, except for some seeded values
x = 0
x(100,1) = 1.2
x(200,2) = 1.6
x(300,3) = 1.4
x(400,1) = -1.2
x(500,2) = -1.6
x(600,3) = -1.4
nCell = 0

! Perform the algorithm on the CPU
Write(*,*) ""
Write(*,*) "CPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10	

Do d = 1, 3
	Bound(d,1) = maxval(x(:,d)) 
	Bound(d,2) = minval(x(:,d)) 		
	nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
End Do

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

! Perform the algorithm on the GPU
Write(*,*) ""
Write(*,*) "GPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10	
Bound_d = Bound
nCell_d = nCell
x_d = x

!$CUF Kernel Do <<<*,128>>>
Do i = 1, nTotal
	Do d = 1, 3
		Bound_d(d,1) = max(Bound_d(d,1),x_d(i,d)) 
		Bound_d(d,2) = min(Bound_d(d,2),x_d(i,d))	
		nCell_d(d) = ceiling(abs(Bound_d(d,1) - Bound_d(d,2))/(0.072))
	End Do
End Do

Bound = Bound_d
nCell = nCell_d

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

End Program Bounds

The above code is of course useless (normally the position vector would not be mainly zero), but the general idea is there.

Thank you,

Kirk

Hi Kirk,

If you use scalars, the compiler will generate parallel max and min reductions.

% cat test_3_4_14.cuf
Program Bounds

 Use cudafor

 Implicit None

 ! Host Variables
 Integer:: d, i, im
 Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
 Integer:: nCell(3)                      ! Number of cells in the domain
 Real:: Bound(3,2)                       ! Bounds for the domain
 Real:: x(nTotal,3)                      ! Position vector for nodes
 Real:: max1,max2,max3
 Real:: min1,min2,min3

 ! Device Variables
 Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

 ! Initialize the position vector to zero, except for some seeded values
 x = 0
 x(100,1) = 1.2
 x(200,2) = 1.6
 x(300,3) = 1.4
 x(400,1) = -1.2
 x(500,2) = -1.6
 x(600,3) = -1.4
 nCell = 0

 ! Perform the algorithm on the CPU
 Write(*,*) ""
 Write(*,*) "CPU Implementation"
 Bound(:,1) = -1.0E+10
 Bound(:,2) = 1.0E+10

 Do d = 1, 3
    Bound(d,1) = maxval(x(:,d))
    Bound(d,2) = minval(x(:,d))
    nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
 End Do

 Write(*,*) "xBoundMax = ",Bound(1,1)
 Write(*,*) "xBoundMin = ",Bound(1,2)
 Write(*,*) "yBoundMax = ",Bound(2,1)
 Write(*,*) "yBoundMin = ",Bound(2,2)
 Write(*,*) "zBoundMax = ",Bound(3,1)
 Write(*,*) "zBoundMin = ",Bound(3,2)
 Write(*,*) "nCellx    = ",nCell(1)
 Write(*,*) "nCelly    = ",nCell(2)
 Write(*,*) "nCellz    = ",nCell(3)

 ! Perform the algorithm on the GPU
 Write(*,*) ""
 Write(*,*) "GPU Implementation"
 Bound(:,1) = -1.0E+10
 Bound(:,2) = 1.0E+10
 x_d = x
 max1 = 0.0
 max2 = 0.0
 max3 = 0.0
 min1 = 999999.0
 min2 = 999999.0
 min3 = 999999.0

 !$CUF Kernel Do <<<*,128>>>
 Do i = 1, nTotal
    max1 = max(max1,x_d(i,1))
    min1 = min(min1,x_d(i,1))
    max2 = max(max2,x_d(i,2))
    min2 = min(min2,x_d(i,2))
    max3 = max(max3,x_d(i,3))
    min3 = min(min3,x_d(i,3))
 End Do

 Bound(1,1)=max1
 Bound(1,2)=min1
 Bound(2,1)=max2
 Bound(2,2)=min2
 Bound(3,1)=max3
 Bound(3,2)=min3
 Do d = 1, 3
    nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
 enddo

 Write(*,*) "xBoundMax = ",Bound(1,1)
 Write(*,*) "xBoundMin = ",Bound(1,2)
 Write(*,*) "yBoundMax = ",Bound(2,1)
 Write(*,*) "yBoundMin = ",Bound(2,2)
 Write(*,*) "zBoundMax = ",Bound(3,1)
 Write(*,*) "zBoundMin = ",Bound(3,2)
 Write(*,*) "nCellx    = ",nCell(1)
 Write(*,*) "nCelly    = ",nCell(2)
 Write(*,*) "nCellz    = ",nCell(3)

 End Program Bounds

% pgf90 test_3_4_14.cuf -Minfo; a.out
bounds:
     36, maxval reduction inlined
     37, minval reduction inlined
     65, CUDA kernel generated
         65, !$cuf kernel do <<< (*), (128) >>>
             Cached references to size [(x)x3] block of 'x_d'
         66, Max reduction generated for max1
         67, Min reduction generated for min1
         68, Max reduction generated for max2
         69, Min reduction generated for min2
         70, Max reduction generated for max3
         71, Min reduction generated for min3

 CPU Implementation
 xBoundMax =     1.200000
 xBoundMin =    -1.200000
 yBoundMax =     1.600000
 yBoundMin =    -1.600000
 zBoundMax =     1.400000
 zBoundMin =    -1.400000
 nCellx    =            34
 nCelly    =            45
 nCellz    =            39

 GPU Implementation
 xBoundMax =     1.200000
 xBoundMin =    -1.200000
 yBoundMax =     1.600000
 yBoundMin =    -1.600000
 zBoundMax =     1.400000
 zBoundMin =    -1.400000
 nCellx    =            34
 nCelly    =            45
 nCellz    =            39
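
One caveat: seeding max1/max2/max3 with 0.0 and min1/min2/min3 with 999999.0 only works because this test data happens to stay inside those ranges. For arbitrary data the safe seeds would be:

 max1 = -huge(1.0)   ! lowest finite real, safe seed for a max reduction
 min1 =  huge(1.0)   ! largest finite real, safe seed for a min reduction
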
  • Mat

Thank you Mat,

Is there an easy way to do what you did but keep the max, min, and nCell values as device variables? Otherwise I will need to copy back and forth every time I calculate the bounds (this is done every time step).

Kirk

Hi Kirk,

How about something like the following? I added the device attribute to all the max/min variables and created device versions of Bound and nCell. Then I use an OpenACC “parallel” construct to create a scalar kernel that updates the bounds and nCell on the device. You could also write a scalar CUDA Fortran kernel (a sketch of that appears after the output below).

% cat test_3_5_14.cuf
Program Bounds

 Use cudafor

 Implicit None

 ! Host Variables
 Integer:: d, i, im
 Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
 Integer:: nCell(3)                      ! Number of cells in the domain
 Real:: Bound(3,2)                       ! Bounds for the domain
 Integer,device:: nCell_d(3)                      ! Number of cells in the domain
 Real,device:: Bound_d(3,2)                       ! Bounds for the domain
 Real:: x(nTotal,3)                      ! Position vector for nodes
 Real, device :: max1,max2,max3
 Real, device :: min1,min2,min3

 ! Device Variables
 Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

 ! Initialize the position vector to zero, except for some seeded values
 x = 0
 x(100,1) = 1.2
 x(200,2) = 1.6
 x(300,3) = 1.4
 x(400,1) = -1.2
 x(500,2) = -1.6
 x(600,3) = -1.4
 nCell = 0

 ! Perform the algorithm on the CPU
 Write(*,*) ""
 Write(*,*) "CPU Implementation"
 Bound(:,1) = -1.0E+10
 Bound(:,2) = 1.0E+10

 Do d = 1, 3
    Bound(d,1) = maxval(x(:,d))
    Bound(d,2) = minval(x(:,d))
    nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
 End Do

 Write(*,*) "xBoundMax = ",Bound(1,1)
 Write(*,*) "xBoundMin = ",Bound(1,2)
 Write(*,*) "yBoundMax = ",Bound(2,1)
 Write(*,*) "yBoundMin = ",Bound(2,2)
 Write(*,*) "zBoundMax = ",Bound(3,1)
 Write(*,*) "zBoundMin = ",Bound(3,2)
 Write(*,*) "nCellx    = ",nCell(1)
 Write(*,*) "nCelly    = ",nCell(2)
 Write(*,*) "nCellz    = ",nCell(3)

 ! Perform the algorithm on the GPU
 Write(*,*) ""
 Write(*,*) "GPU Implementation"
 Bound(:,1) = -1.0E+10
 Bound(:,2) = 1.0E+10
 x_d = x
 max1 = 0.0
 max2 = 0.0
 max3 = 0.0
 min1 = 999999.0
 min2 = 999999.0
 min3 = 999999.0

 !$CUF Kernel Do <<<*,128>>>
 Do i = 1, nTotal
    max1 = max(max1,x_d(i,1))
    min1 = min(min1,x_d(i,1))
    max2 = max(max2,x_d(i,2))
    min2 = min(min2,x_d(i,2))
    max3 = max(max3,x_d(i,3))
    min3 = min(min3,x_d(i,3))
 End Do

 !$acc parallel
 Bound_d(1,1)=max1
 Bound_d(1,2)=min1
 Bound_d(2,1)=max2
 Bound_d(2,2)=min2
 Bound_d(3,1)=max3
 Bound_d(3,2)=min3
 !$acc loop seq
 Do d = 1, 3
    nCell_d(d) = ceiling(abs(Bound_d(d,1) - Bound_d(d,2))/(0.072))
 enddo
 !$acc end parallel

 Bound=Bound_d
 nCell=nCell_d
 Write(*,*) "xBoundMax = ",Bound(1,1)
 Write(*,*) "xBoundMin = ",Bound(1,2)
 Write(*,*) "yBoundMax = ",Bound(2,1)
 Write(*,*) "yBoundMin = ",Bound(2,2)
 Write(*,*) "zBoundMax = ",Bound(3,1)
 Write(*,*) "zBoundMin = ",Bound(3,2)
 Write(*,*) "nCellx    = ",nCell(1)
 Write(*,*) "nCelly    = ",nCell(2)
 Write(*,*) "nCellz    = ",nCell(3)

 End Program Bounds

% pgf90 -V14.2 -acc -Mcuda -Minfo test_3_5_14.cuf ; a.out
bounds:
     22, Memory zero idiom, array assignment replaced by call to pgf90_mzero4
     38, maxval reduction inlined
     39, minval reduction inlined
     67, CUDA kernel generated
         67, !$cuf kernel do <<< (*), (128) >>>
         68, Max reduction generated for max1
         69, Min reduction generated for min1
         70, Max reduction generated for max2
         71, Min reduction generated for min2
         72, Max reduction generated for max3
         73, Min reduction generated for min3
     76, Accelerator kernel generated
     84, Loop is parallelizable

 CPU Implementation
 xBoundMax =     1.200000
 xBoundMin =    -1.200000
 yBoundMax =     1.600000
 yBoundMin =    -1.600000
 zBoundMax =     1.400000
 zBoundMin =    -1.400000
 nCellx    =            34
 nCelly    =            45
 nCellz    =            39

 GPU Implementation
 xBoundMax =     1.200000
 xBoundMin =    -1.200000
 yBoundMax =     1.600000
 yBoundMin =    -1.600000
 zBoundMax =     1.400000
 zBoundMin =    -1.400000
 nCellx    =            34
 nCelly    =            45
 nCellz    =            39
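
The scalar CUDA Fortran kernel I mentioned could look something like this (an untested sketch; it would live in a module and be launched with <<<1,1>>> in place of the !$acc parallel region):

 Attributes(Global) Subroutine UpdateBounds(max1,max2,max3,min1,min2,min3,Bound,nCell)
    Implicit None
    Real, Device:: max1,max2,max3,min1,min2,min3   ! device-resident reduction results
    Real, Device, Intent(OUT):: Bound(3,2)
    Integer, Device, Intent(OUT):: nCell(3)
    Integer:: d
    ! One thread copies the reduction results and derives the cell counts,
    ! so nothing needs to come back to the host.
    Bound(1,1) = max1 ; Bound(1,2) = min1
    Bound(2,1) = max2 ; Bound(2,2) = min2
    Bound(3,1) = max3 ; Bound(3,2) = min3
    Do d = 1, 3
       nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
    End Do
 End Subroutine UpdateBounds

 ! launched as: Call UpdateBounds<<<1,1>>>(max1,max2,max3,min1,min2,min3,Bound_d,nCell_d)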

Thanks Mat, I have never used acc, so I didn’t think of that.

Maybe I could use acc for this bit of code too? CellIndex is a two-column array that I want to pack into CellList (a 2D array giving the nodes in each cell). NodesInCell is a vector that stores the number of nodes in each cell.

I can't think of a straightforward way to parallelize this, since NodesInCell would be accessed by multiple threads at once.

Attributes(Global) Subroutine BuildCellList(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

	Implicit None

	Integer:: i, j, d
	Integer, Value:: nTotal, maxIndex	
	Integer, Device, Intent(IN):: CellIndex(nTotal,2)
	Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

	CellList      = 0
	NodesInCell   = 0

	Do i = 1, nTotal
		j = CellIndex(i,2)				
		NodesInCell(j) = NodesInCell(j) + 1
		CellList(j,NodesInCell(j)+1) = CellIndex(i,1)		
		CellList(j,1) = j	
	End Do			
			
End Subroutine BuildCellList

As you can see, I have taken a quick stab at it (the first attempt being a single-thread kernel… not great…). I know that there are really good ways to parallelize this, but maybe there is an easy way for a beginner like me.

Thanks,

Kirk

Are all the values in CellIndex unique? If so, then you could just put an OpenACC “!$ACC kernels loop independent” directive above the DO i loop (from host code, not within device code).

Though, given you’re counting the number of nodes in a cell, I’m guessing J is not unique. In that case, you probably could use atomic operations. They’re a bit slow so you’ll want to test performance versus other methods, but it may be ok.

OpenACC atomics are still being implemented, but you can use CUDA Fortran atomics in OpenACC in 14.1. Though, since you’re already using CUDA Fortran, it may be easier to just remove the do loop, create “nTotal” threads, assign a unique “i” to each thread, and then add an atomic around the update of “NodesInCell”.

The updates to CellList probably don’t need an atomic operation, given the first update looks to assign values to unique CellList elements and the second would update the entry to the same value no matter the i.

One potential problem is the use of “NodesInCell” in the CellList index. At this point we have to assume another thread has updated it and its value has changed. Instead, you’ll need to use a local variable here. Though, I’ve forgotten offhand how to atomically increment NodesInCell and return the value to a scalar. I’ll look it up when I get into the office tomorrow.
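
In the meantime, the pattern could look something like this untested sketch, using atomicadd (CUDA Fortran's atomic functions return the value the memory location held before the update). CellList and NodesInCell would need to be zeroed from the host before the launch:

 Attributes(Global) Subroutine BuildCellListSketch(nTotal,CellIndex,CellList,maxIndex,NodesInCell)
    Implicit None
    Integer, Value:: nTotal, maxIndex
    Integer, Device, Intent(IN):: CellIndex(nTotal,2)
    Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)
    Integer:: i, j, old
    i = (blockIdx%x-1)*blockDim%x + threadIdx%x   ! one thread per node
    If (i <= nTotal) Then
       j = CellIndex(i,2)
       old = atomicadd(NodesInCell(j), 1)   ! old = count before this thread's add
       CellList(j,old+2) = CellIndex(i,1)   ! old+2: column 1 holds the cell id
       CellList(j,1) = j
    End If
 End Subroutine BuildCellListSketch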

  • Mat

Hi Mat,

Although this is probably not optimal, it works pretty well; I get an ~8x speedup. I will have to compare it to a CUDA kernel with atomics.

Subroutine BuildCellListAcc(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)

	Implicit None

	Integer:: i, j, d
	Integer:: nTotal, maxIndex	
	Integer, Device, Intent(IN):: CellIndex_d(nTotal,2)
	Integer, Device, Intent(OUT):: CellList_d(maxIndex,27), NodesInCell_d(maxIndex)

	!$acc parallel
	CellList_d      = 0
	NodesInCell_d   = 0
	
	!$acc loop
	Do i = 1, nTotal
		j = CellIndex_d(i,2)				
		NodesInCell_d(j) = NodesInCell_d(j) + 1
		CellList_d(j,NodesInCell_d(j)+1) = CellIndex_d(i,1)		
		CellList_d(j,1) = j	
	End Do		
	!$acc end parallel	
			
	End Subroutine BuildCellListAcc

You are right, j is not unique; each cell holds between 9 and 27 nodes.

Kirk

Mat,

Hmmmm, on second thought, the acc version does not work after all. I was hoping that acc magically did all the hard stuff for me… Guess not though, hahaha.

I guess the easiest way will be to create a CUDA kernel and use an atomic to guard NodesInCell.

Let me know if you find the atomic operation to do this. Maybe I need to use atomicinc or atomicadd!?

I imagine that the best way to do this is to write a CUDA kernel and use shared memory for NodesInCell and operate on blocks of data?!

Kirk

Hi Kirk,

Give this a try:

Subroutine BuildCellListAcc(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d) 
    Use cudadevice
    Implicit None 
  
    Integer:: i, j, d 
    Integer:: nTotal, maxIndex, idx    
    Integer, Device, Intent(IN):: CellIndex_d(nTotal,2) 
    Integer, Device, Intent(OUT):: CellList_d(maxIndex,27), NodesInCell_d(maxIndex) 
    CellList_d      = 0 
    NodesInCell_d   = 0 
     
    !$acc parallel loop 
    Do i = 1, nTotal 
       j = CellIndex_d(i,2)    
       idx = atomicinc(NodesInCell_d(j), 27)
       idx = idx+2  ! +1 because atomicinc returns the old value, plus 1 more because column 1 holds the cell id
       CellList_d(j,idx) = CellIndex_d(i,1)       
       CellList_d(j,1) = j    
    End Do       
    !$acc end parallel    
           
    End Subroutine BuildCellListAcc

Here, OpenACC is really just doing the bookkeeping, so you could also turn this into a CUDA Fortran kernel. Atomicinc returns the “old” value from NodesInCell, which is why I increment idx by the extra 1.
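
(For reference: atomicinc(mem,imax) stores zero when the old value is greater than or equal to imax and old+1 otherwise, and returns the old value, so with imax = 27 the count only wraps if a cell ever exceeds 27 nodes.)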

Mat,

What version of PGI do I need for atomicinc? I am using 13.10. It doesn't seem to be incrementing NodesInCell as hoped.

Attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

	Implicit None

	Integer:: i, j, idx
	Integer, Value:: nTotal, maxIndex	
	Integer, Device, Intent(IN):: CellIndex(nTotal,2)
	Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

	CellList      = 0
	NodesInCell   = 0

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTotal) Then
		j = CellIndex(i,2)	
		idx = atomicinc(NodesInCell(j),27)
		idx = idx + 2			
		CellList(j,idx) = CellIndex(i,1)		
		CellList(j,1) = j	
	End If  
			
End Subroutine BuildCellListAtomic

My kernel launch is something like this

Integer, Parameter:: BlockX = 128, BlockY = 1
tBlock = Dim3(BlockX,BlockY,1)
grid = Dim3(ceiling(Real(nTotal)/tBlock%x),1,1)	

Call BuildCellListAtomic<<<grid,tBlock>>>(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)

Kirk

What version of PGI do I need for atomicinc? I am using 13.10. It doesn't seem to be incrementing NodesInCell as hoped.

We just added them to OpenACC in 14.1, but they’ve been in CUDA Fortran for quite a while. So my example won’t work for you, but you should be able to write your own CUDA Fortran kernel and use them there.

  • Mat

I tried this, but NodesInCell is not being incremented. Each entry of NodesInCell is zero.

Attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

	Implicit None

	Integer:: i, j
	Integer, Device:: idx
	Integer, Value:: nTotal, maxIndex	
	Integer, Device, Intent(IN):: CellIndex(nTotal,2)
	Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

	CellList      = 0
	NodesInCell   = 0

	i = (blockIdx%x-1)*blockDim%x + threadIdx%x

	If (i >= 1 .and. i <= nTotal) Then
		j = CellIndex(i,2)	
		idx = atomicinc(NodesInCell(j),27)
		idx = idx + 2			
		CellList(j,idx) = CellIndex(i,1)		
		CellList(j,1) = j	
	End If  
			
End Subroutine BuildCellListAtomic

Have I done something wrong?

Kirk

You shouldn’t clear “NodesInCell” (or “CellList”) in the kernel itself. Those whole-array assignments get run by every thread, in any order, so a thread that executes them late will wipe out counts other threads have already accumulated, most likely leaving zeros in the result. Do the initialization on the host (or in a separate kernel) before the launch.

Here’s a small example:

% cat test_3_7_14.cuf
module cells

contains

attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

    Implicit None

    Integer:: i, j
    Integer, Device:: idx
    Integer, Value:: nTotal, maxIndex
    Integer, Device, Intent(IN):: CellIndex(nTotal,2)
    Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

    i = (blockIdx%x-1)*blockDim%x + threadIdx%x

    If (i >= 1 .and. i <= nTotal) Then
       j = CellIndex(i,2)
       idx = atomicinc(NodesInCell(j),27)
       idx = idx + 2
       CellList(j,idx) = CellIndex(i,1)
       CellList(j,1) = j
    End If

 End Subroutine BuildCellListAtomic


end module cells


Program Bounds

 Use cudafor
 use cells
 Implicit None

    Integer, parameter :: nTotal=1024
    Integer, parameter :: maxIndex=64
    Integer :: CellIndex(nTotal,2)
    Integer :: CellList(maxIndex,27), NodesInCell(maxIndex)
    Integer, Device :: CellIndex_d(nTotal,2)
    Integer, Device :: CellList_d(maxIndex,27), NodesInCell_d(maxIndex)
    Integer, Parameter:: BlockX = 128, BlockY = 1
    Integer :: i
    type(dim3) :: tBlock, grid

    NodesInCell=0
    CellIndex=0
    CellList=0
    do i=1,nTotal
        CellIndex(i,2) = mod(i,maxIndex-1)+1
        CellIndex(i,1) = i
    enddo
    NodesInCell_d=NodesInCell
    CellIndex_d=CellIndex
    CellList_d=CellList
    tBlock = Dim3(BlockX,BlockY,1)
    grid = Dim3(ceiling(Real(nTotal)/tBlock%x),1,1)
    Call BuildCellListAtomic<<<grid,tBlock>>>(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)
    NodesInCell=NodesInCell_d
    CellList=CellList_d
    print *, NodesInCell
 End Program Bounds

% pgf90 test_3_7_14.cuf -V13.10; a.out
           16           17           17           17           17           17
           17           17           17           17           17           17
           17           17           17           17           17           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16           16           16           16
           16           16           16            0
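
The counts are what you'd expect: mod(i,maxIndex-1)+1 only produces cell indices 1 through 63, so cell 64 stays empty, and 1024 nodes spread over 63 cells leaves 16 cells with 17 nodes and the remaining 47 with 16.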

Fantastic!

That is perfect, makes a big difference.

Thank you, your help is very appreciated.

Kirk