# Finding minval and maxval using CUF kernels

Hi,

I am having trouble with this bit of code. The idea is to go through a vector and find the min and max values in the vector to determine the bounds of a simulation domain.

It works well on the CPU. When I try to do it with a CUF kernel, I get an incorrect answer. I am looking for an easy way to do this without getting into really complicated CUDA kernels…

I have set up a little test:

``````Program Bounds

! Test program: find per-axis min/max of a position array on the CPU, then
! attempt the same with a CUF kernel (the GPU result is wrong -- see NOTE below).

Use cudafor

Implicit None

! Host Variables
Integer:: d, i, im
Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
Integer:: nCell(3)                      ! Number of cells in the domain
Real:: Bound(3,2)                       ! Bounds for the domain
Real:: x(nTotal,3)                      ! Position vector for nodes

! Device Variables
Integer, Device:: nCell_d(3)            ! Number of cells in the domain
Real, Device:: Bound_d(3,2)             ! Bounds for the domain
Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

! Initialize the position vector for zeroes except for some seeded values
x = 0
x(100,1) = 1.2
x(200,2) = 1.6
x(300,3) = 1.4
x(400,1) = -1.2
x(500,2) = -1.6
x(600,3) = -1.4
nCell = 0

! Perform the algorithm on the CPU
Write(*,*) ""
Write(*,*) "CPU Implementation"
! Column 1 holds the max (seeded with a large negative sentinel);
! column 2 holds the min (seeded with a large positive sentinel).
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10

Do d = 1, 3
Bound(d,1) = maxval(x(:,d))
Bound(d,2) = minval(x(:,d))
nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
End Do

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

! Perform the algorithm on the GPU
Write(*,*) ""
Write(*,*) "GPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10
Bound_d = Bound
nCell_d = nCell
x_d = x

! NOTE(review): this kernel has a data race. Every thread performs an
! unguarded read-modify-write of the shared device arrays Bound_d(d,1) and
! Bound_d(d,2), with no reduction or atomic protection, so the GPU answer is
! incorrect. The compiler only generates parallel min/max reductions when the
! accumulators are scalars (see the follow-up reply in this thread).
!\$CUF Kernel Do <<<*,128>>>
Do i = 1, nTotal
Do d = 1, 3
Bound_d(d,1) = max(Bound_d(d,1),x_d(i,d))
Bound_d(d,2) = min(Bound_d(d,2),x_d(i,d))
nCell_d(d) = ceiling(abs(Bound_d(d,1) - Bound_d(d,2))/(0.072))
End Do
End Do

Bound = Bound_d
nCell = nCell_d

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

End Program Bounds
``````

The above code is of course useless (normally the position vector would not be mainly zero), but the general idea is there.

Thank you,

Kirk

Hi Kirk,

If you use scalars, the compiler will generate parallel max and min reductions.

``````% cat test_3_4_14.cuf
Program Bounds

! Corrected version: accumulate the per-axis min/max into SCALAR variables so
! the compiler can generate parallel reductions for the CUF kernel
! (confirmed by the "Max/Min reduction generated" lines in the -Minfo output).

Use cudafor

Implicit None

! Host Variables
Integer:: d, i, im
Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
Integer:: nCell(3)                      ! Number of cells in the domain
Real:: Bound(3,2)                       ! Bounds for the domain
Real:: x(nTotal,3)                      ! Position vector for nodes
! Scalar reduction accumulators, one max/min pair per axis
Real:: max1,max2,max3
Real:: min1,min2,min3

! Device Variables
Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

! Initialize the position vector for zeroes except for some seeded values
x = 0
x(100,1) = 1.2
x(200,2) = 1.6
x(300,3) = 1.4
x(400,1) = -1.2
x(500,2) = -1.6
x(600,3) = -1.4
nCell = 0

! Perform the algorithm on the CPU
Write(*,*) ""
Write(*,*) "CPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10

Do d = 1, 3
Bound(d,1) = maxval(x(:,d))
Bound(d,2) = minval(x(:,d))
nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
End Do

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

! Perform the algorithm on the GPU
Write(*,*) ""
Write(*,*) "GPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10
x_d = x
! NOTE(review): these seeds assume the data contains values in [0, 999999);
! for general data use -huge(0.0)/+huge(0.0) sentinels instead -- TODO confirm
! the real data range in the simulation.
max1 = 0.0
max2 = 0.0
max3 = 0.0
min1 = 999999.0
min2 = 999999.0
min3 = 999999.0

! Scalar max()/min() updates inside the CUF kernel are turned into
! parallel reductions by the compiler.
!\$CUF Kernel Do <<<*,128>>>
Do i = 1, nTotal
max1 = max(max1,x_d(i,1))
min1 = min(min1,x_d(i,1))
max2 = max(max2,x_d(i,2))
min2 = min(min2,x_d(i,2))
max3 = max(max3,x_d(i,3))
min3 = min(min3,x_d(i,3))
End Do

Bound(1,1)=max1
Bound(1,2)=min1
Bound(2,1)=max2
Bound(2,2)=min2
Bound(3,1)=max3
Bound(3,2)=min3
Do d = 1, 3
nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
enddo

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

End Program Bounds

% pgf90 test_3_4_14.cuf -Minfo; a.out
bounds:
36, maxval reduction inlined
37, minval reduction inlined
65, CUDA kernel generated
65, !\$cuf kernel do <<< (*), (128) >>>
Cached references to size [(x)x3] block of 'x_d'
66, Max reduction generated for max1
67, Min reduction generated for min1
68, Max reduction generated for max2
69, Min reduction generated for min2
70, Max reduction generated for max3
71, Min reduction generated for min3

CPU Implementation
xBoundMax =     1.200000
xBoundMin =    -1.200000
yBoundMax =     1.600000
yBoundMin =    -1.600000
zBoundMax =     1.400000
zBoundMin =    -1.400000
nCellx    =            34
nCelly    =            45
nCellz    =            39

GPU Implementation
xBoundMax =     1.200000
xBoundMin =    -1.200000
yBoundMax =     1.600000
yBoundMin =    -1.600000
zBoundMax =     1.400000
zBoundMin =    -1.400000
nCellx    =            34
nCelly    =            45
nCellz    =            39
``````
• Mat

Thank you Mat,

Is there an easy way to do what you did but keep the max, min and nCell as device variables. Otherwise I will need to copy back and forth every time I calculate the bounds (this is done every time step).

Kirk

Hi Kirk,

How about something like the following. I added the device attribute to all the max/min variables, and created device versions of Bounds and NCells. Then use OpenACC “parallel” construct to create a scalar kernel to update the bounds and ncell on the device. You could also write a scalar CUDA Fortran kernel as well.

``````% cat test_3_5_14.cuf
Program Bounds

! Variant that keeps the reduction results resident on the GPU: the max/min
! scalars and Bound/nCell copies carry the device attribute, and a small
! OpenACC parallel region updates Bound_d/nCell_d on the device so no
! per-timestep host round-trip is needed.

Use cudafor

Implicit None

! Host Variables
Integer:: d, i, im
Integer, Parameter:: nTotal = 20000     ! Total number of nodes in the domain
Integer:: nCell(3)                      ! Number of cells in the domain
Real:: Bound(3,2)                       ! Bounds for the domain
Integer,device:: nCell_d(3)                      ! Number of cells in the domain
Real,device:: Bound_d(3,2)                       ! Bounds for the domain
Real:: x(nTotal,3)                      ! Position vector for nodes
! Device-resident reduction scalars (results never leave the GPU)
Real, device :: max1,max2,max3
Real, device :: min1,min2,min3

! Device Variables
Real, Device:: x_d(nTotal,3)            ! Position vector for nodes

! Initialize the position vector for zeroes except for some seeded values
x = 0
x(100,1) = 1.2
x(200,2) = 1.6
x(300,3) = 1.4
x(400,1) = -1.2
x(500,2) = -1.6
x(600,3) = -1.4
nCell = 0

! Perform the algorithm on the CPU
Write(*,*) ""
Write(*,*) "CPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10

Do d = 1, 3
Bound(d,1) = maxval(x(:,d))
Bound(d,2) = minval(x(:,d))
nCell(d) = ceiling(abs(Bound(d,1) - Bound(d,2))/(0.072))
End Do

Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

! Perform the algorithm on the GPU
Write(*,*) ""
Write(*,*) "GPU Implementation"
Bound(:,1) = -1.0E+10
Bound(:,2) = 1.0E+10
x_d = x
! Host-to-device scalar assignments (implicit cudaMemcpy of each seed value).
! NOTE(review): seeds assume data in [0, 999999) -- see previous example.
max1 = 0.0
max2 = 0.0
max3 = 0.0
min1 = 999999.0
min2 = 999999.0
min3 = 999999.0

!\$CUF Kernel Do <<<*,128>>>
Do i = 1, nTotal
max1 = max(max1,x_d(i,1))
min1 = min(min1,x_d(i,1))
max2 = max(max2,x_d(i,2))
min2 = min(min2,x_d(i,2))
max3 = max(max3,x_d(i,3))
min3 = min(min3,x_d(i,3))
End Do

! Scalar kernel on the device: copy the reduction results into Bound_d and
! derive nCell_d without transferring anything back to the host first.
!\$acc parallel
Bound_d(1,1)=max1
Bound_d(1,2)=min1
Bound_d(2,1)=max2
Bound_d(2,2)=min2
Bound_d(3,1)=max3
Bound_d(3,2)=min3
! Three trip-count loop; seq keeps it serial within the parallel region.
!\$acc loop seq
Do d = 1, 3
nCell_d(d) = ceiling(abs(Bound_d(d,1) - Bound_d(d,2))/(0.072))
enddo
!\$acc end parallel

! Copies back to the host here are only for printing/verification.
Bound=Bound_d
nCell=nCell_d
Write(*,*) "xBoundMax = ",Bound(1,1)
Write(*,*) "xBoundMin = ",Bound(1,2)
Write(*,*) "yBoundMax = ",Bound(2,1)
Write(*,*) "yBoundMin = ",Bound(2,2)
Write(*,*) "zBoundMax = ",Bound(3,1)
Write(*,*) "zBoundMin = ",Bound(3,2)
Write(*,*) "nCellx    = ",nCell(1)
Write(*,*) "nCelly    = ",nCell(2)
Write(*,*) "nCellz    = ",nCell(3)

End Program Bounds

% pgf90 -V14.2 -acc -Mcuda -Minfo test_3_5_14.cuf ; a.out
bounds:
22, Memory zero idiom, array assignment replaced by call to pgf90_mzero4
38, maxval reduction inlined
39, minval reduction inlined
67, CUDA kernel generated
67, !\$cuf kernel do <<< (*), (128) >>>
68, Max reduction generated for max1
69, Min reduction generated for min1
70, Max reduction generated for max2
71, Min reduction generated for min2
72, Max reduction generated for max3
73, Min reduction generated for min3
76, Accelerator kernel generated
84, Loop is parallelizable

CPU Implementation
xBoundMax =     1.200000
xBoundMin =    -1.200000
yBoundMax =     1.600000
yBoundMin =    -1.600000
zBoundMax =     1.400000
zBoundMin =    -1.400000
nCellx    =            34
nCelly    =            45
nCellz    =            39

GPU Implementation
xBoundMax =     1.200000
xBoundMin =    -1.200000
yBoundMax =     1.600000
yBoundMin =    -1.600000
zBoundMax =     1.400000
zBoundMin =    -1.400000
nCellx    =            34
nCelly    =            45
nCellz    =            39
``````

Thanks Mat, I have never used acc, so I didn’t think of that.

Maybe I could use acc for this bit of code too? CellIndex is a 2 column array that I want to pack into CellList (a 2D array giving the nodes in each cell). NodesInCell is a vector that stores the number of nodes in each cell.

I can't think of a straightforward way to parallelize this, since NodesInCell would be accessed by multiple threads at once.

``````Attributes(Global) Subroutine BuildCellList(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

! Pack node ids into per-cell lists. CellIndex(i,1) is a node id and
! CellIndex(i,2) its cell; CellList(j,1) holds the cell id j and columns 2..27
! hold the nodes in cell j; NodesInCell(j) counts them.
!
! NOTE(review): this kernel never reads the thread index, so EVERY launched
! thread executes the entire serial Do loop and races on NodesInCell/CellList.
! It is only correct when launched with a single thread (i.e. it is serial).

Implicit None

Integer:: i, j, d
Integer, Value:: nTotal, maxIndex
Integer, Device, Intent(IN):: CellIndex(nTotal,2)
Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

CellList      = 0
NodesInCell   = 0

Do i = 1, nTotal
j = CellIndex(i,2)
NodesInCell(j) = NodesInCell(j) + 1
! +1 skips column 1, which is reserved for the cell id below
CellList(j,NodesInCell(j)+1) = CellIndex(i,1)
CellList(j,1) = j
End Do

End Subroutine BuildCellList
``````

As you can see, I have taken a quick stab at it (first attempt being a single thread kernel… not great…). I know that there are really good ways to parallelize this, but maybe there is an easy way for a beginner like me.

Thanks,

Kirk

Are all values in the CellIndex unique? If so, then you could just put an OpenACC “!\$ACC kernels loop independent” directive above the DO I loop (from host code, not within device code).

Though, given you’re counting the number of nodes in a cell, I’m guessing J is not unique. In that case, you probably could use atomic operations. They’re a bit slow so you’ll want to test performance versus other methods, but it may be ok.

OpenACC atomics are still being implemented, but you can use CUDA Fortran atomics in OpenACC in 14.1. Though, since you’re already using CUDA Fortran, it may be easier to just remove the do loop and then create “nTotal” threads, assigning a unique “i” to each thread, then add an atomic around the update of “NodesInCell”.

The updates to CellList probably don’t need an atomic operation given the first update looks to assign values to unique CellList elements and the second would update the entry to same value no matter the i.

One potential problem is the use of “NodesInCell” in the CellList index. At this point we have to assume another thread has updated it and its value has changed. Instead, you’ll need to use a local variable here. Though, I’ve forgotten offhand how to atomically increment NodesInCell and return the value to a scalar. I’ll look it up when I get into the office tomorrow.

• Mat

Hi Mat,

Although this is probably not optimal, this works pretty well. I get ~8x speedup. I will have to try to compare it to a CUDA kernel with atomics.

``````Subroutine BuildCellListAcc(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)

! OpenACC attempt at building the per-cell node lists.
! NOTE(review): this version has a data race -- j (the cell index) is NOT
! unique across iterations, so the unguarded read-modify-write of
! NodesInCell_d(j) is executed concurrently by multiple threads and the
! counts/list positions come out wrong (confirmed broken later in the thread;
! an atomic increment is required).

Implicit None

Integer:: i, j, d
Integer:: nTotal, maxIndex
Integer, Device, Intent(IN):: CellIndex_d(nTotal,2)
Integer, Device, Intent(OUT):: CellList_d(maxIndex,27), NodesInCell_d(maxIndex)

!\$acc parallel
CellList_d      = 0
NodesInCell_d   = 0

!\$acc loop
Do i = 1, nTotal
j = CellIndex_d(i,2)
NodesInCell_d(j) = NodesInCell_d(j) + 1
CellList_d(j,NodesInCell_d(j)+1) = CellIndex_d(i,1)
CellList_d(j,1) = j
End Do
!\$acc end parallel

End Subroutine BuildCellListAcc
``````

You are right, j is not unique, each cell holds between 9 to 27 nodes.

Kirk

Mat,

Hmmmm, on second thought, the acc does not work after all. I was hoping that acc magically did all the hard stuff for me… Guess not though, hahaha.

I guess the easiest way will be to create a CUDA kernel and use an atomic to guard NodesInCell.

Let me know if you find the atomic operation to do this. Maybe I need to use atomicinc or atomicadd!?

I imagine that the best way to do this is to write a CUDA kernel and used shared memory for NodesInCell and operate on blocks of data?!

Kirk

Hi Kirk,

Give this a try:

``````Subroutine BuildCellListAcc(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)
! Build per-cell node lists using an atomic increment to resolve the race on
! NodesInCell_d(j). Requires CUDA Fortran atomics inside OpenACC, available
! from PGI 14.1 (per this thread).
Use cudadevice
Implicit None

Integer:: i, j, d
Integer:: nTotal, maxIndex, idx
Integer, Device, Intent(IN):: CellIndex_d(nTotal,2)
Integer, Device, Intent(OUT):: CellList_d(maxIndex,27), NodesInCell_d(maxIndex)
! Zeroing happens on the host side of this subroutine, before the parallel loop.
CellList_d      = 0
NodesInCell_d   = 0

!\$acc parallel loop
Do i = 1, nTotal
j = CellIndex_d(i,2)
! atomicinc returns the OLD value; the second argument (27) is the wrap bound
idx = atomicinc(NodesInCell_d(j), 27)
idx = idx+2  ! increment since atomicinc returns the old value
! +2 also skips column 1, which is reserved for the cell id
CellList_d(j,idx) = CellIndex_d(i,1)
CellList_d(j,1) = j
End Do
!\$acc end parallel

End Subroutine BuildCellListAcc
``````

Here, OpenACC is really just doing the bookkeeping, so you could also turn this into a CUDA Fortran kernel. Atomicinc returns the “old” value from NodesInCell, which is why I’m incrementing it by the extra 1.

Mat,

What version of PGF do I need for atomicinc? I am using 13.10. It doesn't seem to be incrementing NodesInCell as hoped.

``````Attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

! CUDA Fortran kernel: one thread per node, atomic counter per cell.
! BUG(review): the whole-array zeroing below is executed by EVERY thread, in
! arbitrary order relative to the atomic updates, so one thread's zeroing can
! wipe out another thread's results. CellList and NodesInCell must be zeroed
! on the host before the kernel launch instead (see the corrected example
! later in the thread).

Implicit None

Integer:: i, j, idx
Integer, Value:: nTotal, maxIndex
Integer, Device, Intent(IN):: CellIndex(nTotal,2)
Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

CellList      = 0
NodesInCell   = 0

! Global 1D thread index (1-based)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x

If (i >= 1 .and. i <= nTotal) Then
j = CellIndex(i,2)
! atomicinc returns the OLD value and wraps at 27
idx = atomicinc(NodesInCell(j),27)
! +2 converts old count -> new slot and skips column 1 (cell id)
idx = idx + 2
CellList(j,idx) = CellIndex(i,1)
CellList(j,1) = j
End If

End Subroutine BuildCellListAtomic
``````

My kernel launch is something like this

``````Integer, Parameter:: BlockX = 128, BlockY = 1
! 1D launch: 128 threads per block, enough blocks to cover all nTotal nodes.
! The i <= nTotal guard in the kernel handles the partial last block.
tBlock = Dim3(BlockX,BlockY,1)
grid = Dim3(ceiling(Real(nTotal)/tBlock%x),1,1)

Call BuildCellListAtomic<<<grid,tBlock>>>(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)
``````

Kirk

What version of PGF do I need for atomicinc? I am using 13.10. It doesn't seem to be incrementing NodesInCell as hoped.

We just added them to OpenACC in 14.1, but they’ve been in CUDA Fortran for quite a while. So my example won’t work for you, but you should be able to write your own CUDA Fortran kernel and use them there.

• Mat

I tried this, but NodesInCell is not being incremented. Each entry of NodesInCell is zero.

``````Attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

! Pack node ids into per-cell lists, one thread per node.
! CellIndex(i,1) is a node id, CellIndex(i,2) its cell j; CellList(j,1) holds
! the cell id and columns 2..27 the nodes in that cell; NodesInCell(j) counts
! them via an atomic increment.
!
! PRECONDITION: CellList and NodesInCell must be zeroed on the HOST before
! this kernel is launched. Clearing them inside the kernel is a data race:
! every thread would execute the zeroing in arbitrary order relative to the
! atomic updates, wiping out other threads' results (this was the cause of
! the all-zero NodesInCell output).

Implicit None

Integer:: i, j
Integer, Device:: idx
Integer, Value:: nTotal, maxIndex
Integer, Device, Intent(IN):: CellIndex(nTotal,2)
Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

! Global 1D thread index (1-based)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x

If (i >= 1 .and. i <= nTotal) Then
j = CellIndex(i,2)
! atomicinc returns the OLD value and wraps back to 0 after reaching 27
idx = atomicinc(NodesInCell(j),27)
! +2 converts the old count to the new slot and skips column 1 (cell id)
idx = idx + 2
CellList(j,idx) = CellIndex(i,1)
CellList(j,1) = j
End If

End Subroutine BuildCellListAtomic
``````

Have I done something wrong?

Kirk

You shouldn’t clear “NodesInCell” in the kernel itself. This will get run by all threads, in any order, and will most likely give you zeros in the result.

Here’s a small example:

``````% cat test_3_7_14.cuf
module cells

! Container module so the kernel gets an explicit interface at the call site.

contains

! Build per-cell node lists, one thread per node. The caller must zero
! CellList and NodesInCell on the host before launching (zeroing inside the
! kernel would race with the atomic updates).
attributes(Global) Subroutine BuildCellListAtomic(nTotal,CellIndex,CellList,maxIndex,NodesInCell)

Implicit None

Integer:: i, j
Integer, Device:: idx
Integer, Value:: nTotal, maxIndex
Integer, Device, Intent(IN):: CellIndex(nTotal,2)
Integer, Device, Intent(OUT):: CellList(maxIndex,27), NodesInCell(maxIndex)

! Global 1D thread index (1-based)
i = (blockIdx%x-1)*blockDim%x + threadIdx%x

If (i >= 1 .and. i <= nTotal) Then
j = CellIndex(i,2)
! atomicinc returns the OLD value and wraps at 27
idx = atomicinc(NodesInCell(j),27)
! +2: old count -> new slot, skipping column 1 (reserved for the cell id)
idx = idx + 2
CellList(j,idx) = CellIndex(i,1)
CellList(j,1) = j
End If

End Subroutine BuildCellListAtomic

end module cells

Program Bounds

! Driver: fills CellIndex with synthetic data, zeroes the output arrays on the
! HOST (the fix for the all-zero result), copies everything to the device,
! launches the atomic cell-list kernel, and prints the per-cell counts.

Use cudafor
use cells
Implicit None

Integer, parameter :: nTotal=1024
Integer, parameter :: maxIndex=64
Integer :: CellIndex(nTotal,2)
Integer :: CellList(maxIndex,27), NodesInCell(maxIndex)
Integer, Device :: CellIndex_d(nTotal,2)
Integer, Device :: CellList_d(maxIndex,27), NodesInCell_d(maxIndex)
Integer, Parameter:: BlockX = 128, BlockY = 1
Integer :: i
type(dim3) :: tBlock, grid

! Zero the outputs on the host BEFORE the launch -- not inside the kernel.
NodesInCell=0
CellIndex=0
CellList=0
do i=1,nTotal
! Assigns nodes round-robin to cells 1..maxIndex-1, so the last cell (64)
! stays empty -- hence the trailing 0 in the printed counts.
CellIndex(i,2) = mod(i,maxIndex-1)+1
CellIndex(i,1) = i
enddo
NodesInCell_d=NodesInCell
CellIndex_d=CellIndex
CellList_d=CellList
! 1D launch sized to cover all nTotal nodes; kernel guard handles remainder.
tBlock = Dim3(BlockX,BlockY,1)
grid = Dim3(ceiling(Real(nTotal)/tBlock%x),1,1)
Call BuildCellListAtomic<<<grid,tBlock>>>(nTotal,CellIndex_d,CellList_d,maxIndex,NodesInCell_d)
NodesInCell=NodesInCell_d
CellList=CellList_d
print *, NodesInCell
End Program Bounds

% pgf90 test_3_7_14.cuf -V13.10; a.out
16           17           17           17           17           17
17           17           17           17           17           17
17           17           17           17           17           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16           16           16           16
16           16           16            0
``````

Fantastic!

That is perfect, makes a big difference.

Thank you, your help is very appreciated.

Kirk