Here is a program, check2.f, that determines if your system
can run OpenMP across multiple threads.
pgf90 check2.f dclock_64.s -o check2 -mp
Here is check2.f
! check2: run the same fixed amount of busy work with 1, 2, 4, ...
! OpenMP threads and print the elapsed time of each run, so the
! speed-up from adding threads can be read off directly.
!
! Layout note: comments and OpenMP sentinels start in column 1 and
! statements in column 7, so the unit is valid fixed-form source
! (check2.f); the one continuation is written to be free-form safe too.
      program test
      implicit none
! max_thrd = number of doublings; thread counts are 1 .. 2**max_thrd.
      integer, parameter :: max_thrd = 4
      integer i, ii, j, k, nwork
      integer omp_get_num_procs, omp_get_max_threads
      integer omp_get_num_threads
! Sized from max_thrd so that raising max_thrd cannot overrun it.
      integer thread(max_thrd+1)
! dclock is the external timer routine (dclock_64.s).
      real*8 dclock, time1, time2

! Table of thread counts to try: 1, 2, 4, ..., 2**max_thrd.
      thread(1) = 1
      do i = 1, max_thrd
         thread(i+1) = 2*thread(i)
      end do

      print *,"number of cores =", omp_get_num_procs()
      print *,"max threads =", omp_get_max_threads()
! Queried outside any parallel region (OpenMP reports 1 there).
      print *,"current num threads =", omp_get_num_threads()
! call system("uname -a")

      j = 200 ! may want to change this
! Every run does the same nwork calls of delay(j); only the number
! of threads sharing those calls changes between runs.
      nwork = thread(max_thrd+1)
      do ii = 1, max_thrd+1
         call omp_set_num_threads(thread(ii))
         time1 = dclock()
!$omp parallel do default(none) shared(j, nwork) private(k)
         do k = 1, nwork
            call delay(j)
         end do
!$omp end parallel do
         time2 = dclock() - time1
! Trailing "&" sits past column 72: ignored by fixed form, and a
! continuation mark in free form; "&" in column 6 continues fixed form.
         print *, thread(ii), " core test - delay value =",              &
     &      j*nwork, " time =", time2, " seconds"
      end do
      end program test
! delay: busy-work helper that calls abc() n times.
!   n  number of abc() calls to make; nothing is called when n < 1.
      subroutine delay(n)
      implicit none
      integer n
      integer remaining
! Count down a local copy so the caller's argument is never modified.
      remaining = n
      do while (remaining .ge. 1)
         call abc()
         remaining = remaining - 1
      end do
      end subroutine delay
! abc: inner busy loop -- makes one million calls to def().
! The loop touches no data; its cost is the subroutine calls.
      subroutine abc()
      implicit none
      integer, parameter :: ncalls = 1000000
      integer pass
      do pass = 1, ncalls
         call def()
      end do
      end subroutine abc
! def: deliberately empty routine; returns to its caller immediately.
      subroutine def()
      implicit none
      end subroutine def
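Here is the timing routine, dclock_64.s. Before building, uncomment the one .clock line that matches your CPU's clock speed (and comment out the others); otherwise the reported times will not be in true seconds.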
# dclock: timer used by check2.f as a REAL*8 function with no
# arguments. It reads the CPU time-stamp counter and multiplies the
# count by the constant .clock to produce the returned value.
.file "dclock-hammer.s"
# NOTE(review): this .align appears before .data, so it applies to the
# section active at this point rather than to the data defined below;
# confirm that is what was intended.
.align 8
.data
# .clock is the multiplier applied to the raw counter value. Exactly
# one definition must be left uncommented. Each candidate is labelled
# with the clock rate it corresponds to (the value is approximately
# 1/frequency, i.e. seconds per tick), so select the line matching the
# machine being timed.
# .clock: .double 0.000000001 # 1.0 GHz
# .clock: .double 0.000000000750187 # 1.33GHz
# .clock: .double 0.000000000714 # 1.4 GHz
# .clock: .double 0.000000000666 # 1.5 GHz
# .clock: .double 0.000000000625 # 1.6 GHz
# .clock: .double 0.00000000059 # 1.7 GHz
# .clock: .double 0.0000000005556 # 1.8 GHz
# .clock: .double 0.0000000005 # 2.0 GHz
# .clock: .double 0.000000000455 # 2.2 GHz
# .clock: .double 0.000000000417 # 2.4 GHz
.clock: .double 0.000000000376 # 2.66 GHz
# .clock: .double 0.000000000357 # 2.8 GHz
# .clock: .double 0.0000000003333 # 3.0 GHz
# .clock: .double 0.0000000003125 # 3.2 GHz
# .clock: .double 0.0000000002777 # 3.6 GHz
# Scratch storage for the counter: two adjacent 32-bit words that are
# read back below as a single 64-bit integer starting at .low.
# NOTE(review): this scratch area is static, so simultaneous calls
# could overwrite each other's values; check2.f only calls dclock
# outside its parallel region. Confirm before using it elsewhere.
.low: .long 0x00000000
.high: .long 0x00000000
.text
# One entry point exported under several spellings, presumably to match
# the external-name conventions of different Fortran compilers.
.globl _DCLOCK, dclock, _dclock, _dclock_, dclock_
_DCLOCK:
dclock:
_dclock:
_dclock_:
dclock_:
# Opcode bytes 0F 31 encode RDTSC: the time-stamp counter is returned
# with its high half in EDX and its low half in EAX.
.byte 0x0f, 0x31
# Save the two halves into the adjacent scratch words.
movl %eax, .low(%RIP)
movl %edx, .high(%RIP)
# Load the 64-bit integer at .low onto the x87 stack and scale it by
# .clock.
fildll .low(%RIP)
fmull .clock(%RIP)
# Store the product as a double just below the stack pointer and pop
# the x87 stack, then reload it into %xmm0 (presumably the register in
# which the calling convention returns a double; confirm for the
# target ABI).
fstpl -24(%rsp)
movsd -24(%rsp), %xmm0
ret
The program runs the same (non-memory-bound) computation with
1, 2, 4, 8, and 16 threads and times each run.
Each time the thread count doubles, the elapsed time should be
cut roughly in half compared with the previous run.
If you extend the test to 32 threads (max_thrd=5), the extra threads land on hyper-threads, which are not separate cores, so they should not speed anything up. OpenMP should assign threads to real cores first, and only then to hyper-threads (a hyper-thread is just a second set of registers on the same CPU core).
Running
pgf90 -V
on your system should indicate what type of CPU the compiler thinks you have.
If this works and your code still fails, send your failing program
to trs@pgroup.com so we can take a look.
dave