Hi
I am getting an error message when I try to use the CUDA Visual Profiler on a multi-GPU CUDA Fortran code that uses OpenMP. The executable itself runs fine, but when I try to profile the same application I get the following error:
=== Start profiling for session 'Session1' ===
Start program 'C:/Program Files/PGI/win64/2011/cuda/CUDA Fortran SDK/Balki/Problematic/testopenmp.exe' run #1 ...
0: DEV_MKDESC: copyin Memcpy FAILED:11(invalid argument)
Program run #1 failed, exit code: 127
Error in program execution.
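For context, the code follows the usual one-OpenMP-thread-per-GPU pattern: each thread calls cudaSetDevice with its own thread id and then does its own allocate, copy-in, kernel launch, and copy-out. A stripped-down, self-contained version of that pattern (without the kernel launch; the names sketch_mod, work_on_device, h and d here are only for illustration and are not in my real code) would look roughly like this:

module sketch_mod
  use cudafor
  implicit none
contains
  subroutine work_on_device(tid)
    ! Bind the calling OpenMP thread to GPU 'tid', then allocate, copy in,
    ! and copy out using arrays that are local (and therefore private) to
    ! this thread.
    integer, intent(in) :: tid
    integer :: ierr
    real, allocatable :: h(:)
    real, device, allocatable :: d(:)
    ierr = cudaSetDevice(tid)
    allocate(h(512))
    allocate(d(512))
    h = real(tid)
    d = h                  ! array assignment = host-to-device copy
    ! ... kernel launch for this device would go here ...
    h = d                  ! device-to-host copy back
    deallocate(d)
    deallocate(h)
  end subroutine work_on_device
end module sketch_mod

program multi_gpu_sketch
  use sketch_mod
  use cudafor
  use omp_lib
  implicit none
  integer :: tid, ierr, ndev
  ierr = cudaGetDeviceCount(ndev)
  !$omp parallel private(tid) num_threads(ndev)
  tid = omp_get_thread_num()
  call work_on_device(tid)
  !$omp end parallel
end program multi_gpu_sketch

The actual code I am profiling is below; it does the same thing, but with a single shared device array and one explicit branch per thread id.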
!######################################################## MODULE
module m1
  use cudafor
  use omp_lib
  implicit none
contains
!#################################### GPU
  subroutine addKernel(h_a, b)
    real, allocatable :: h_a(:)
    real, device, allocatable :: d_a(:)
    integer, value :: b
    integer :: THREADS_PER_BLOCK, NUMBER_OF_BLOCKS, NUMBER_OF_THREADS, i, ierr, index, tid, devnum
    real, allocatable :: h_a1(:)
    real, allocatable :: h_a2(:)
    real, allocatable :: h_a3(:)
    real, allocatable :: h_a4(:)

    THREADS_PER_BLOCK = 512
    NUMBER_OF_THREADS = 512
    NUMBER_OF_BLOCKS = NUMBER_OF_THREADS/THREADS_PER_BLOCK

    allocate(h_a1(1:512))
    allocate(h_a2(1:512))
    allocate(h_a3(1:512))
    allocate(h_a4(1:512))

    do i = 1, 512
      h_a1(i) = 1.0
      h_a2(i) = 1.0
      h_a3(i) = 1.0
      h_a4(i) = 1.0
    end do
    ! One OpenMP thread per iteration: each thread selects and drives its own GPU
    !$omp parallel do private(index, tid, devnum)
    do index = 1, 4
      tid = omp_get_thread_num()
      if (tid == 0) then
        write(*,*) "Thread id", tid
        ierr = cudaThreadExit()
        ierr = cudaSetDevice(tid)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMalloc(d_a, 512)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMemcpy(d_a, h_a1, 512, cudaMemcpyHostToDevice)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        call addKernelCall<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
        ierr = cudaMemcpy(h_a1, d_a, 512, cudaMemcpyDeviceToHost)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaDeviceSynchronize()
        ierr = cudaGetDevice(devnum)
        write(*,*) "Displaying results on GPU ", devnum
        do i = 1, 6
          write(*,*) "h_a1 : ", i, " = ", h_a1(i)
        end do
      else if (tid == 1) then
        write(*,*) "Thread id", tid
        ierr = cudaThreadExit()
        ierr = cudaSetDevice(tid)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMalloc(d_a, 512)
        ierr = cudaMemcpy(d_a, h_a2, 512, cudaMemcpyHostToDevice)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        call addKernelCall<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
        ierr = cudaMemcpy(h_a2, d_a, 512, cudaMemcpyDeviceToHost)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaDeviceSynchronize()
        ierr = cudaGetDevice(devnum)
        write(*,*) "Displaying results on GPU ", devnum
        do i = 1, 6
          write(*,*) "h_a2 : ", i, " = ", h_a2(i)
        end do
      else if (tid == 2) then
        write(*,*) "Thread id", tid
        ierr = cudaThreadExit()
        ierr = cudaSetDevice(tid)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMalloc(d_a, 512)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMemcpy(d_a, h_a3, 512, cudaMemcpyHostToDevice)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        call addKernelCall<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
        ierr = cudaMemcpy(h_a3, d_a, 512, cudaMemcpyDeviceToHost)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaDeviceSynchronize()
        ierr = cudaGetDevice(devnum)
        write(*,*) "Displaying results on GPU ", devnum
        do i = 1, 6
          write(*,*) "h_a3 : ", i, " = ", h_a3(i)
        end do
      else if (tid == 3) then
        write(*,*) "Thread id", tid
        ierr = cudaThreadExit()
        ierr = cudaSetDevice(tid)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMalloc(d_a, 512)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaMemcpy(d_a, h_a4, 512, cudaMemcpyHostToDevice)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        call addKernelCall<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
        ierr = cudaMemcpy(h_a4, d_a, 512, cudaMemcpyDeviceToHost)
        if (ierr /= cudaSuccess) then
          write(*,*) "Error occurred in thread ", tid
        end if
        ierr = cudaDeviceSynchronize()
        ierr = cudaGetDevice(devnum)
        write(*,*) "Displaying results on GPU ", devnum
        do i = 1, 6
          write(*,*) "h_a4 : ", i, " = ", h_a4(i)
        end do
      end if
      write(*,*) "Thread id", tid
      ierr = cudaDeviceReset()
    end do

    call addKernelCall<<<NUMBER_OF_BLOCKS, THREADS_PER_BLOCK>>>(d_a, b, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
  end subroutine addKernel
  attributes(global) subroutine addKernelCall(d_a, b, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
    real, device, intent(inout) :: d_a(:)
    integer, value :: b
    integer :: idx
    integer, value :: THREADS_PER_BLOCK, NUMBER_OF_THREADS
    idx = threadIdx%x
    if (idx <= NUMBER_OF_THREADS) then
      d_a(idx) = d_a(idx) + b
    end if
  end subroutine addKernelCall
end module m1
!######################################################## MODULE END
program cfd
  use m1
  use omp_lib
  use cudafor
  implicit none
!########### DECLARE
  real, allocatable :: h_a(:)
  integer :: i, ierr, noOfProcs, nDevices, tid, index

  allocate(h_a(1:512))
  do i = 1, 512
    h_a(i) = 1.0
  end do

  write(*,*) "calling kernel..."
  call addKernel(h_a, 2)

  ierr = cudaGetDeviceCount(nDevices)
  if (ierr /= cudaSuccess) then
    write(*,*) 'cudaGetDeviceCount failed -- CUDA driver and runtime may be mismatched'
    stop
  end if
  if (nDevices == 0) then
    write(*,*) 'No CUDA devices found'
  end if
  write(*,*) "No of GPUs present ", nDevices

  noOfProcs = omp_get_num_procs()
  write(*,*) "Max Procs: ", noOfProcs, "maxthreads : ", omp_get_max_threads()
  write(*,*) " Using Multi GPUs "
  write(*,*) "calling kernel..."

  deallocate(h_a)
end program cfd
I am using PGI CUDA Fortran 11.9 with the CUDA 4.0 Toolkit.