Hi i have trouble with OMPing my DEM code. The problem is that my OMP talyor expansions are not scaling, which is embarrassing because as Taylor only theoretically “embarrassingly parallel” part of the code and things like Collision Detection and force models aren’t a problem with good scaling.
I am using a window 7 computer using PGI visual studio 13.5 with 2 Intel(R) Xeon(R) CPU E5-2687W 0 @ 3.10GHz, with the environmental OMP_NUM_THREADS 16
first if this program is run in visual studio in either release or the debugger it crashes on 11 (I think its reached 12 but beside the piont). If running it with with out visual studio we can get results
Now the problem is compound with different compiler options
With the standard release
-Bstatic -Mbackslash -mp -I"c:\program files\pgi\win64\13.5\include" -I"C:\Program Files\PGI\Microsoft Open Tools 11\include" -I"C:\Program Files (x86)\Windows Kits\8.0\Include\shared" -I"C:\Program Files (x86)\Windows Kits\8.0\Include\um" -fastsse -Minform=warn
WITHOUT OMP 0.007539
OMP using 1 Threads, took 0.049737 15.158576 % compared to not omp
OMP using 2 Threads, took 0.011979 62.938803 % compared to not omp
OMP using 3 Threads, took 0.011377 66.266989 % compared to not omp
OMP using 4 Threads, took 0.010057 74.965431 % compared to not omp
OMP using 5 Threads, took 0.009997 75.414831 % compared to not omp
OMP using 6 Threads, took 0.009970 75.617694 % compared to not omp
OMP using 7 Threads, took 0.019982 37.730534 % compared to not omp
OMP using 8 Threads, took 0.010585 71.229706 % compared to not omp
OMP using 9 Threads, took 0.010700 70.460453 % compared to not omp
OMP using 10 Threads, took 0.009990 75.472323 % compared to not omp
OMP using 11 Threads, took 0.019917 37.854732 % compared to not omp
OMP using 12 Threads, took 0.019918 37.852215 % compared to not omp
OMP using 13 Threads, took 0.019913 37.861656 % compared to not omp
OMP using 14 Threads, took 0.017349 43.457516 % compared to not omp
OMP using 15 Threads, took 0.019815 38.048927 % compared to not omp
OMP using 16 Threads, took 0.010039 75.101422 % compared to not omp
Then with the optimization turned off
-Bstatic -Mbackslash -mp -I"c:\program files\pgi\win64\13.5\include" -I"C:\Program Files\PGI\Microsoft Open Tools 11\include" -I"C:\Program Files (x86)\Windows Kits\8.0\Include\shared" -I"C:\Program Files (x86)\Windows Kits\8.0\Include\um" -Minform=warn
gives
WITHOUT OMP 0.032620
OMP using 1 Threads, took 0.027060 120.548683 % compared to not omp
OMP using 2 Threads, took 0.014288 228.299963 % compared to not omp
OMP using 3 Threads, took 0.011752 277.580232 % compared to not omp
OMP using 4 Threads, took 0.010447 312.243669 % compared to not omp
OMP using 5 Threads, took 0.009992 326.449732 % compared to not omp
OMP using 6 Threads, took 0.010884 299.695790 % compared to not omp
OMP using 7 Threads, took 0.009995 326.374027 % compared to not omp
OMP using 8 Threads, took 0.010520 310.062002 % compared to not omp
OMP using 9 Threads, took 0.010348 315.231025 % compared to not omp
OMP using 10 Threads, took 0.009983 326.763740 % compared to not omp
OMP using 11 Threads, took 0.020017 162.962963 % compared to not omp
OMP using 12 Threads, took 0.019957 163.455061 % compared to not omp
OMP using 13 Threads, took 0.019844 164.382373 % compared to not omp
OMP using 14 Threads, took 0.019579 166.606349 % compared to not omp
OMP using 15 Threads, took 0.029464 110.712038 % compared to not omp
OMP using 16 Threads, took 0.023021 141.698670 % compared to not omp
Preferably I would like to have SIMD and OMP because you know in theory 16 cores vectoring 4 loops at a time would be so fast that I would never have to worry about the Taylor expansion again. Can some one explain where i am going wrong? The code is bellow it might look weird but that’s because its part of a much larger code.
Thanks for any help in advance!
Marc
!
! ConsoleApp.f90
!
! Fortran Console Application
! Generated by PGI Visual Fortran(R)
! 5/16/2013 5:39:09 PM
!
program prog
!$ use omp_lib !omp values
implicit none
! Variables
INTEGER :: i, np, threads
REAL*8 :: Acc_tmp, Accel_diff_tmp
REAL*8, DIMENSION(3,2000000), TARGET:: xyz, fxyz,Accxyz, vXYZ
REAL*8, DIMENSION(2000000) :: Paricle_live, m, mInvert
REAL*8, POINTER:: y(:), fy(:),Accy(:), vY(:)
REAL*8, SAVE :: dt, dt_half, dt_Squared, dt_Squared_half, dt_Squared_Sixth
REAL*8 :: FreeFlightTotal, FreeFlightS, FreeFlightE, FreeFlightTotal_compare
!$OMP THREADPRIVATE(dt, dt_half, dt_Squared,dt_Squared_half, dt_Squared_Sixth)
! Body
dt = 0.001D0
dt_half = dt * 0.5D0
dt_Squared = dt * dt
dt_Squared_half = dt*dt*0.5D0
dt_Squared_Sixth = (dt*dt) / 6.0D0
fY => fXYZ(2,1:2000000)
Y => XYZ(2,1:2000000)
vY => vXYZ(2,1:2000000)
accY => accXYZ(2,1:2000000)
fY = 10.d0
Y = 200000.D0
Y = 100.D0
accY = 10.D0
m = 1.D0
mInvert = 1.D0/ m
np = 2000000
Paricle_live = 1.D0
Paricle_live(300:500) = 0.D0
threads = 0
CALL CPU_TIME (FreeFlightS)
!$ FreeFlightS = OMP_get_wtime()
DO i=1, np
!
Acc_tmp = fy(i)*Paricle_live(i) !F = MA !F/M = A
Acc_tmp = Acc_tmp* mInvert(i) !F = MA !F/M = A
Accel_diff_tmp = Acc_tmp - Accy(i)
Accy(i) = Acc_tmp
y(i) = y(i) + (vy(i) * dt) + (Acc_tmp* dt_Squared_half) + (Accel_diff_tmp * dt_Squared_Sixth)
vy(i) = vy(i) + (Acc_tmp * dt) + (Accel_diff_tmp * dt_half)
END DO
CALL CPU_TIME (FreeFlightE)
!$ FreeFlightE = OMP_get_wtime()
FreeFlightTotal = FreeFlightE - FreeFlightS
!
WRITE(6,'(a,f12.6)') "WITHOUT OMP", FreeFlightTotal
FreeFlightTotal_compare = FreeFlightTotal
!$ DO threads = 1, 16
fY = 10.d0
Y = 200000.D0
Y = 100.D0
accY = 10.D0
!$OMP PARALLEL PRIVATE(i, Acc_tmp,Accel_diff_tmp), num_threads (threads)
CALL CPU_TIME (FreeFlightS)
!$ FreeFlightS = OMP_get_wtime()
!$OMP DO
DO i=1, np
!
Acc_tmp = fy(i)*Paricle_live(i) !F = MA !F/M = A
Acc_tmp = Acc_tmp* mInvert(i) !F = MA !F/M = A
Accel_diff_tmp = Acc_tmp - Accy(i)
Accy(i) = Acc_tmp
y(i) = y(i) + (vy(i) * dt) + (Acc_tmp* dt_Squared_half) + (Accel_diff_tmp * dt_Squared_Sixth)
vy(i) = vy(i) + (Acc_tmp * dt) + (Accel_diff_tmp * dt_half)
END DO
!$OMP END DO
CALL CPU_TIME (FreeFlightE)
!$ FreeFlightE = OMP_get_wtime()
!$OMP END PARALLEL
FreeFlightTotal = FreeFlightE - FreeFlightS
!
WRITE(6,'(a,i3,a,f12.6,5x,f12.6,a)') "OMP using ", threads, " Threads, took ", FreeFlightTotal, ((FreeFlightTotal_compare/FreeFlightTotal)*100.D0), " % compared to not omp"
!$ END DO
pause
end program prog