Hi All,
We are using pgfortran compiler version 12.9(64 bit) on windows system.
We are spawning 4 openmp threads and each openmp thread is supposed to run on its respective GPU. We are calling this omp parallel region over NSTEPS steps.
With larger NSTEPS value (NSTEPS =100000) the probability of program failing at run-time increases.
With smaller NSTEPS value (NSTEPS =100) the probability of program failing at run-time decreases and most of the times it successfully finishes execution.
The compiled exe keeps on giving different run-time errors as follows
0: DEV_MKDESC: copyin Memcpy FAILED:11(invalid argument)
0: DEV_MKDESC: allocate FAILED:30(unknown error)
0: DEV_MKDESC: allocate FAILED:30(unknown error)
0: ALLOCATE: 4000 bytes requested; status = 30(unknown error)
0: DEV_MKDESC: copyin Memcpy FAILED:11(invalid argument)
PFA the code,
We used following script for compile and execute.
set OMP_NUM_THREADS=4
pgfortran -Mcuda -mp test.CUF
test.exe
Thanks and regards,
Praveen
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11 KERNEL
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
module m_Kernels
use cudafor
real, device, allocatable :: VAR_1_D(:)
contains
ATTRIBUTES(GLOBAL) SUBROUTINE ProcessArray_Kernel_1(VAR_1_D)
real , device :: VAR_1_D(:)
END SUBROUTINE ProcessArray_Kernel_1
end module m_Kernels
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11 main
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
program main
use omp_lib
use cudafor
use m_Kernels
integer :: threadsPerBlock, numOfThreads,numOfBlocks, ierr, array_size, num_of_omp_threads, NSTEPS, omp_thread_id
real , allocatable :: VAR_1(:)
!$OMP THREADPRIVATE(VAR_1)
! openmp shared VAR_1iables
num_of_omp_threads = 4
NSTEPS = 100000
array_size=1000
! setting number of threads
CALL omp_set_num_threads(num_of_omp_threads)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!running omp parallel region NSTEPS times
DO NSTEP = 1, NSTEPS
!omp parallel region that will be called NSTEPS times
!$OMP Parallel Do &
!$OMP SHARED( num_of_omp_threads, NSTEPS,array_size) &
!$OMP PRIVATE( VAR_1_D, VAR_2_D, threadsPerBlock, numOfThreads,numOfBlocks)
DO omp_thread_id = 1, num_of_omp_threads
!setting device
ierr = CUDASETDEVICE((omp_thread_id-1))
if ( ierr /= cudaSuccess ) then
write (* ,*) cudaGetErrorString ( ierr )
else
write (* ,*) 'device was set to:', (omp_thread_id-1), 'nstep is:',nstep
end if
!allocating arrays
allocate(VAR_1_D(array_size))
allocate(VAR_1(array_size))
!CPU to GPU copy
VAR_1=1.0
VAR_1_D = VAR_1
!Kernel Call
threadsPerBlock = 512
numOfThreads = array_size
numOfBlocks = CEILING(real(numOfThreads) / threadsPerBlock)
call ProcessArray_Kernel_1<<<numOfBlocks, threadsPerBlock>>>(VAR_1_D)
ierr=cudaThreadSynchronize()
if ( ierr /= cudaSuccess ) write (* ,*) cudaGetErrorString ( ierr )
!deallocating arrays
deallocate(VAR_1_D)
deallocate(VAR_1)
END DO
!$OMP END Parallel Do
END DO
end program
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!11