Problem using CUDA Visual Profiler for CUDA Fortran

Hi

I am getting an error message when I tried to use CUDA Visual Profiler for a multi-GPU CUDA Fortran Code using OpenMP. Though, the executable is running fine, but when I try to profile the same application, I get the following error:

=== Start profiling for session ‘Session1’ ===
Start program 'C:/Program Files/PGI/win64/2011/cuda/CUDA Fortran SDK/Balki/Problematic/testopenmp.exe ’ run #1
0: DEV_MKDESC: copyin Memcpy FAILED:11(invalid argument)

Program run #1 failed, exit code: 127
Error in program execution.


!########################################################   MODULE
	 module m1
	 use cudafor
	 use omp_lib
	  implicit none
	  contains
	  
	    
	  !#################################### GPU
	  
         subroutine addKernel(h_a, b)
	    real, allocatable:: h_a(:)
	    real, device, allocatable :: d_a(:)
	    integer, value :: b
	    integer  :: THREADS_PER_BLOCK, NUMBER_OF_BLOCKS, NUMBER_OF_THREADS, i, ierr, index, tid, devnum
	    real, allocatable:: h_a1(:)
	    real, allocatable:: h_a2(:)
	    real, allocatable:: h_a3(:)
	    real, allocatable:: h_a4(:)

	     THREADS_PER_BLOCK=512
	     NUMBER_OF_THREADS=512
	     NUMBER_OF_BLOCKS=NUMBER_OF_THREADS/THREADS_PER_BLOCK
	     allocate(h_a1(1:512))
	     allocate(h_a2(1:512))
	     allocate(h_a3(1:512))
	     allocate(h_a4(1:512))


		do i=1, 512
			h_a1(i)=1.0
			h_a2(i)=1.0
			h_a3(i)=1.0
			h_a4(i)=1.0

		end do
	    
!$omp parallel do private(index, tid,devnum) 
	    	     do index = 1, 4  
	    		tid = omp_get_thread_num()
	    		if(tid==0) then
	    			write(*,*) "Thread id", tid
	    			ierr=cudaThreadExit()
	    			ierr=cudaSetDevice(tid)
	    			if(ierr /=cudaSuccess) then
					write(*,*) "Error occurred in thread ", tid
				  end if
	    			 ierr=cudaMalloc(d_a,512)
	    			 if(ierr /=cudaSuccess) then
	    			 	write(*,*) "Error occurred in thread ", tid
	    			 end if
	    			 ierr=cudaMemcpy(d_a,h_a1,512, cudaMemcpyHostToDevice)
	    			  if(ierr /=cudaSuccess) then
				  	 write(*,*) "Error occurred in thread ", tid
				   end if
	    			 call addKernelCall<<< NUMBER_OF_BLOCKS,THREADS_PER_BLOCK >>>(d_a,tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
	    			 ierr=cudaMemcpy(h_a1,d_a,512, cudaMemcpyDeviceToHost)
	    			 if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if
				 ierr=cudaDeviceSynchronize()  
				 ierr=cudaGetDevice(devnum)  
		    		write(*,*) "Displaying results on GPU ",devnum 
				 do i=1,6
					write(*,*) "h_a1 : ", i , " = ", h_a1(i)  
				end do
	    			
	    		else if(tid==1) then
	    			write(*,*) "Thread id", tid
	    			ierr=cudaThreadExit()
	    			ierr=cudaSetDevice(tid)
	    			if(ierr /=cudaSuccess) then
				  	write(*,*) "Error occurred in thread ", tid
				  end if
				 ierr=cudaMalloc(d_a,512)
				ierr=cudaMemcpy(d_a,h_a2,512, cudaMemcpyHostToDevice)
				 if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if
	    			 call addKernelCall<<< NUMBER_OF_BLOCKS,THREADS_PER_BLOCK >>>(d_a,tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
	    			 ierr=cudaMemcpy(h_a2,d_a,512, cudaMemcpyDeviceToHost)
	    			  if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if
 				ierr=cudaDeviceSynchronize()
				 	 ierr=cudaGetDevice(devnum)  
		    		write(*,*) "Displaying results on GPU ",devnum 
				do i=1,6
					write(*,*) "h_a2 : ", i , " = ", h_a2(i)  
				end do
	    		else if(tid ==2) then
	    			write(*,*) "Thread id", tid
	    			ierr=cudaThreadExit()
	    			ierr=cudaSetDevice(tid)
				 if(ierr /=cudaSuccess) then
				 				  	write(*,*) "Error occurred in thread ", tid
				  end if
				 ierr=cudaMalloc(d_a,512)
				  if(ierr /=cudaSuccess) then
				  	 write(*,*) "Error occurred in thread ", tid
				   end if
				ierr=cudaMemcpy(d_a,h_a3,512, cudaMemcpyHostToDevice)
	    			  if(ierr /=cudaSuccess) then
				  	 write(*,*) "Error occurred in thread ", tid
				   end if
	    			 call addKernelCall<<< NUMBER_OF_BLOCKS,THREADS_PER_BLOCK >>>(d_a,tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
	    			 ierr=cudaMemcpy(h_a3,d_a,512, cudaMemcpyDeviceToHost)
	    			  if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if
 				ierr=cudaDeviceSynchronize()
    			 	 ierr=cudaGetDevice(devnum)  
		    		write(*,*) "Displaying results on GPU ",devnum 
			 	do i=1,6
			 		write(*,*) "h_a3 : ", i , " = ", h_a3(i)  
			 	end do
	    		else if(tid==3) then
	    			write(*,*) "Thread id", tid
	    			ierr=cudaThreadExit()
	    			ierr=cudaSetDevice(tid)
	    			 if(ierr /=cudaSuccess) then
				  	write(*,*) "Error occurred in thread ", tid
				  end if
	    			 ierr=cudaMalloc(d_a,512)
	    			  if(ierr /=cudaSuccess) then
 				 	 write(*,*) "Error occurred in thread ", tid
 				  end if

	    			ierr=cudaMemcpy(d_a,h_a4,512, cudaMemcpyHostToDevice)
	    			 if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if

	    			 call addKernelCall<<< NUMBER_OF_BLOCKS,THREADS_PER_BLOCK >>>(d_a,tid, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
	    			 ierr=cudaMemcpy(h_a4,d_a,512, cudaMemcpyDeviceToHost)
	    			 if(ierr /=cudaSuccess) then
				 	 write(*,*) "Error occurred in thread ", tid
				  end if
 	    			ierr=cudaDeviceSynchronize() 
	    			ierr=cudaGetDevice(devnum)  
		    		write(*,*) "Displaying results on GPU ",devnum 
    			 	do i=1,6
			 		write(*,*) "h_a4 : ", i , " = ", h_a4(i)  
			 	end do			
	    		end if
	    		write(*,*) "Thread id", tid		
	       		  ierr=cudaDeviceReset()
	    		
	    	    end do
	    
	    call addKernelCall<<< NUMBER_OF_BLOCKS,THREADS_PER_BLOCK >>>(d_a,b, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
          end subroutine
 	  
	  attributes(global) subroutine addKernelCall(d_a, b, THREADS_PER_BLOCK, NUMBER_OF_THREADS)
	    real, device, intent(inout) :: d_a(:)
	    integer, value :: b
	    integer :: idx
	    integer, value :: THREADS_PER_BLOCK, NUMBER_OF_THREADS
	    idx= threadidx%x
	    
	    if(idx<=NUMBER_OF_THREADS) then
	      d_a(idx)= d_a(idx) + b
	    endif
	  	  
	  end subroutine
	end module
	
!########################################################   MODULE END	
	program cfd
	use m1
	use omp_lib
	use cudafor
	implicit none
	
	!########### DECLARE
	real, allocatable:: h_a(:)
	integer :: i, ierr, noOfProcs, nDevices, tid, index
	
	allocate(h_a(1:512))
	
	do i=1, 512
		h_a(i)=1.0
	end do
	
	write(*,*) "calling kernel..."	
	
	call addKernel(h_a, 2)
	ierr = cudaGetDeviceCount(nDevices)
	  if (ierr /= cudaSuccess) then
	     write(*,*) 'cudaGetDeviceCount failed -- CUDA driver and runtime may be mismatched' 
	     stop
	  end if

	  if (nDevices == 0) then
	     write(*,*) 'No CUDA devices found'
	  end if

	write(*,*) "No of GPUs present  ", nDevices
	noOfProcs=omp_get_num_procs()
	write(*,*) "Max Procs: ", noOfProcs, "maxthreads : ", omp_get_max_threads()
	write(*,*) " Using Multi GPUs "
	write(*,*) "calling kernel..."	
		
	deallocate(h_a)
	end

I am using PGI CUDA Fortran version 11.9 with Cuda 4.0 Toolkit.

Hi Balkrishna,

Looks like you forgot to privatize “d_a” hence every thread is sharing the same device array pointer.

Hope this helps,
Mat

% diff -u ml.f90 ml_fixed.f90
--- ml.f90	2012-08-06 10:43:35.423387719 -0700
+++ ml_fixed.f90	2012-08-06 10:45:18.341348674 -0700
@@ -35,7 +35,7 @@
 
       end do
       
-!$omp parallel do private(index, tid,devnum)
+!$omp parallel do private(index, tid,devnum, d_a)
                do index = 1, 4 
              tid = omp_get_thread_num()
              if(tid==0) then

% pgf90 ml.f90 -Mcuda -mp
% a.out
 calling kernel...
 Thread id            0
 Thread id            1
0: copyout Memcpy (host=0x6c4790, dev=0x200600000, size=2048) FAILED: 11(invalid argument)
% pgf90 ml_fixed.f90 -Mcuda -mp
% a.out
 calling kernel...
 Thread id            0
 Thread id            1
 Displaying results on GPU             1
 h_a2 :             1  =     2.000000    
 h_a2 :             2  =     2.000000    
 h_a2 :             3  =     2.000000    
 h_a2 :             4  =     2.000000    
 h_a2 :             5  =     2.000000    
 h_a2 :             6  =     2.000000    
 Thread id            1
 Displaying results on GPU             0
 h_a1 :             1  =     1.000000    
 h_a1 :             2  =     1.000000    
 h_a1 :             3  =     1.000000    
 h_a1 :             4  =     1.000000    
 h_a1 :             5  =     1.000000    
 h_a1 :             6  =     1.000000    
 Thread id            0
 Thread id            1
 Thread id            0
 Displaying results on GPU             0
 h_a1 :             1  =     1.000000    
 h_a1 :             2  =     1.000000    
 h_a1 :             3  =     1.000000    
 h_a1 :             4  =     1.000000    
 h_a1 :             5  =     1.000000    
 h_a1 :             6  =     1.000000    
 Thread id            0
 Displaying results on GPU             1
 h_a2 :             1  =     3.000000    
 h_a2 :             2  =     3.000000    
 h_a2 :             3  =     3.000000    
 h_a2 :             4  =     3.000000    
 h_a2 :             5  =     3.000000    
 h_a2 :             6  =     3.000000    
 Thread id            1
 No of GPUs present              2
 Max Procs:             4 maxthreads :             2
  Using Multi GPUs 
 calling kernel...