The main program is:
!> Histogram timing benchmark.
!>
!> Fills an array of Na uniform random numbers in [0,1), bins them into Nbin
!> equal-width bins three ways (sequential loop, OpenMP reduction, and the
!> CUDA kernel stat_kernel supplied by GPU_mod) and prints the wall-clock
!> time of each stage.
program example27b
  use GPU_mod
  use cudafor
  implicit none
  include "omp_lib.h"

  integer, parameter :: Nbin = 10, Na = 134217728

  integer :: i, ij, Ndev, nTr, istat
  ! Integer counters: a real(4) counter silently stops incrementing at 2**24.
  integer :: dist(Nbin), dist2(Nbin), dist3(Nbin)
  real(4) :: dx, xBin(Nbin)
  ! 512 MB of samples: allocate dynamically instead of as a fixed-size array.
  real(4), allocatable :: x(:)
  ! omp_get_wtime returns double precision; storing it in real(4) discards
  ! the digits that short intervals live in.
  real(8) :: start_time, end_time
  real(4), device, allocatable :: xD(:), xBinD(:)
  integer(4), device, allocatable :: dist3D(:)
  type(dim3) :: dimGrid, dimBlock

  allocate(x(Na), stat=istat)
  if (istat /= 0) stop "host allocation of x failed"
  ! xBinD is allocated as in the original but is not referenced by the
  ! kernel call below.
  allocate(dist3D(Nbin), xD(Na), xBinD(Nbin), stat=istat)
  if (istat /= 0) stop "device allocation failed"

  Ndev = 4
  call omp_set_num_threads(Ndev)

  ! Launch configuration; nTr is forwarded to the kernel as its last argument.
  nTr = 16
  dimGrid  = dim3(16, 1, 1)
  dimBlock = dim3(16, 1, 1)

  ! Uniform samples in [0,1).
  call random_number(x)

  ! Bin width and bin centres (xBin is not used further in this program).
  dx = 1.E0/real(Nbin)
  do i = 1, Nbin
    xBin(i) = (real(i) - 0.5E0)*dx
  end do

  ! ---- Sequential histogram -------------------------------------------
  dist = 0
  start_time = omp_get_wtime()
  do i = 1, Na
    ! x(i)/dx can round up to Nbin for x(i) just below 1; clamp so the
    ! index never leaves 1..Nbin.
    ij = min(int(x(i)/dx) + 1, Nbin)
    dist(ij) = dist(ij) + 1
  end do
  end_time = omp_get_wtime()
  print *,"Sequential time= ",(end_time-start_time)

  ! ---- OpenMP histogram -----------------------------------------------
  ! The reduction adds the per-thread partial sums to the incoming value of
  ! dist2, so it must be zeroed first.
  dist2 = 0
  start_time = omp_get_wtime()
  !$omp parallel do default(none) shared(x, dx) private(i, ij) reduction(+:dist2)
  do i = 1, Na
    ij = min(int(x(i)/dx) + 1, Nbin)
    dist2(ij) = dist2(ij) + 1
  end do
  !$omp end parallel do
  end_time = omp_get_wtime()
  print *,"OpenMP time= ",(end_time-start_time)

  ! Reading both results keeps the timed loops from being optimised away
  ! and catches a broken reduction.
  if (any(dist2 /= dist)) print *,"WARNING: OpenMP histogram differs from sequential"
  print *,' '

  ! ---- Host -> device transfer ----------------------------------------
  ! xD needs no memset: the copy below overwrites all Na elements.
  start_time = omp_get_wtime()
  istat = cudaMemset(dist3D, 0, Nbin)
  call check_cuda(istat, "cudaMemset(dist3D)")
  istat = cudaMemcpy(xD, x, Na, cudaMemcpyHostToDevice)
  call check_cuda(istat, "cudaMemcpy(x -> xD)")
  end_time = omp_get_wtime()
  print *,"host to device copy time= ",(end_time-start_time)

  ! ---- GPU histogram --------------------------------------------------
  start_time = omp_get_wtime()
  call stat_kernel<<<dimGrid,dimBlock>>>(xD,dist3D,Na,Nbin,dx,nTr)
  ! Kernel launches return immediately; wait for completion so the timer
  ! measures execution rather than launch overhead.
  ! NOTE(review): toolchains predating cudaDeviceSynchronize expose the same
  ! operation as cudaThreadSynchronize - confirm against the compiler in use.
  istat = cudaDeviceSynchronize()
  end_time = omp_get_wtime()
  call check_cuda(istat, "stat_kernel")
  print *,"GPU time= ",(end_time-start_time)

  istat = cudaMemcpy(dist3, dist3D, Nbin, cudaMemcpyDeviceToHost)
  call check_cuda(istat, "cudaMemcpy(dist3D -> dist3)")
  print *,' '

  deallocate(dist3D, xD, xBinD)
  deallocate(x)

contains

  !> Stop with a message when a CUDA runtime call reports failure.
  !> code: status returned by the CUDA call; what: label used in the message.
  subroutine check_cuda(code, what)
    integer, intent(in) :: code
    character(len=*), intent(in) :: what
    if (code /= cudaSuccess) then
      print *,"CUDA error in ", what, ": ", trim(cudaGetErrorString(code))
      stop 1
    end if
  end subroutine check_cuda

end program example27b
I’m using a GeForce GT 635M and PGI Fortran 11.7 on Windows 7 x64.