I am trying to make use of the cublasSetVectorAsync command.
My code will compute a series of dot products and then accumulate the total from these dot products.
The CPU code will print the total first followed by the GPU code which makes use of CUBLAS.
If lines 39-41 are set to:
useNoCublas=.true.
useCublas=.false.
useCublasAsync=.false.
the code runs fine and the CPU and GPU produce similar results. This forces the code to make no CUBLAS calls for transferring data; it uses plain assignment ('=') instead to transfer data from the host to the device.
If lines 39-41 are set to:
useNoCublas=.false.
useCublas=.true.
useCublasAsync=.false.
the code runs fine as well. This forces the code to use cublasSetVector.
If lines 39-41 are set to:
useNoCublas=.false.
useCublas=.false.
useCublasAsync=.true.
the code segfaults at line 106 when it tries to use cublasSetVectorAsync.
I have error checks for creating the three CUDA streams that are used in the code: streamA, streamB, and streamC. So, I am puzzled as to why I am getting segmentation faults when I try to run this code.
Any ideas?
program.f
!=====Compare a dot-product accumulation done on the CPU with the
!=====same accumulation done on the GPU through CUBLAS.
!
!     Each pass fills vec_a, vec_b and vec_c with random numbers and
!     stores dot(vec_a,vec_c) and dot(vec_b,vec_c); the stored values
!     are then summed and the two sums are added. The CPU total is
!     printed first, then the GPU total.
!
!     The host-to-device transfer is selected by exactly one of
!     useNoCublas (assignment), useCublas (cublasSetVector) or
!     useCublasAsync (cublasSetVectorAsync on CUDA streams).
      program test
!
      use cudafor
      use cublas
!
      implicit none
!-----vector size used in each dot product, and number of passes
      integer, parameter :: sizeD = 1000
!-----size in bytes of one double precision element
      integer, parameter :: dblBytes = 8
!-----on the host
      double precision, allocatable, dimension (:) ::
     >  vec_a, vec_b, vec_c, dotproductA, dotproductB, one
!-----stores reduction total of DdotproductA and DdotproductB
      double precision :: TdotproductA, TdotproductB
!-----stores total
      double precision :: Tdotproduct
      logical :: useNoCublas, useCublas, useCublasAsync
!-----on the device
      double precision, allocatable, dimension (:), device ::
     >  Dvec_a, Dvec_b, Dvec_c, DdotproductA, DdotproductB, Done
!-----istat: status of the latest call, nerr: number of failed calls
      integer :: i, istat, nerr
!-----handle to cublas
      type(cublashandle) :: h
!-----CUDA streams, declared with the kind that the cudafor and
!-----cublas stream interfaces use for a stream handle
      integer(kind=cuda_stream_kind) :: streamA, streamB, streamC
!-----allocate on CPU
      allocate(one(sizeD))
      allocate(vec_a(sizeD),vec_b(sizeD),vec_c(sizeD))
      allocate(dotproductA(sizeD),dotproductB(sizeD))
!-----allocate on GPU
      allocate(Done(sizeD))
      allocate(Dvec_a(sizeD),Dvec_b(sizeD),Dvec_c(sizeD))
      allocate(DdotproductA(sizeD),DdotproductB(sizeD))
!-----initialize variables
      istat=0
      nerr=0
      Tdotproduct=0.0D+00
      TdotproductA=0.0D+00
      TdotproductB=0.0D+00
!-----set logicals (only one can be true)
      useNoCublas=.false.
      useCublas=.false.
      useCublasAsync=.true.
!-----startup cublas
!     NOTE(review): cublasCreate stores a handle in h right after
!     cublasgethandle did; confirm whether both calls are needed.
      istat=cublasInit()
      call check(istat,'cublasInit')
      h=cublasgethandle()
      istat=cublasCreate(h)
      call check(istat,'cublasCreate')
!-----create streams
      istat=cudaStreamCreate(streamA)
      call check(istat,'cudaStreamCreate: streamA')
      istat=cudaStreamCreate(streamB)
      call check(istat,'cudaStreamCreate: streamB')
      istat=cudaStreamCreate(streamC)
      call check(istat,'cudaStreamCreate: streamC')
!=====CPU
      do i=1, sizeD
!       NOTE(review): the CPU/GPU comparison relies on random_seed()
!       with no argument giving both loops the same sequence; that
!       is processor dependent - confirm for the compiler in use.
        call random_seed()
        call random_number(vec_a)
        call random_number(vec_b)
        call random_number(vec_c)
        dotproductA(i)=dot_product(vec_a,vec_c)
        dotproductB(i)=dot_product(vec_b,vec_c)
      end do
      TdotproductA=sum(dotproductA)
      TdotproductB=sum(dotproductB)
      Tdotproduct=TdotproductA+TdotproductB
      write(*,*) 'total on cpu := ', Tdotproduct
!=====GPU
!-----initialize variables
      one=1.0D+00
      Tdotproduct=0.0D+00
      TdotproductA=0.0D+00
      TdotproductB=0.0D+00
!-----inside the loop the dot products leave their scalar on the GPU;
!-----the pointer mode does not change per pass, so set it once
      istat=cublasSetPointerMode(h,CUBLAS_POINTER_MODE_DEVICE)
      call check(istat,'cublasSetPointerMode: DEVICE')
!
      do i=1, sizeD
        call random_seed()
        call random_number(vec_a)
        call random_number(vec_b)
        call random_number(vec_c)
!=======synchronous using no cublas calls
        if (useNoCublas) then
          Dvec_a=vec_a
          Dvec_b=vec_b
          Dvec_c=vec_c
          Done=one
        endif
!=======synchronous using cublas calls
        if (useCublas) then
!---------transfer vec_a and vec_b
          istat=cublasSetVector(sizeD,dblBytes,vec_a,1,Dvec_a,1)
          call check(istat,'cublasSetVector: Dvec_a')
          istat=cublasSetVector(sizeD,dblBytes,vec_b,1,Dvec_b,1)
          call check(istat,'cublasSetVector: Dvec_b')
!---------transfer vec_c
          istat=cublasSetVector(sizeD,dblBytes,vec_c,1,Dvec_c,1)
          call check(istat,'cublasSetVector: Dvec_c')
!---------transfer one
          istat=cublasSetVector(sizeD,dblBytes,one,1,Done,1)
          call check(istat,'cublasSetVector: Done')
        endif
!=======asynchronous using cublas calls
        if (useCublasAsync) then
!---------transfer vec_a and vec_b, each on the stream that will
!---------later run the dot product reading it
          istat=cublasSetVectorAsync(sizeD,dblBytes,vec_a,1,
     >                               Dvec_a,1,streamA)
          call check(istat,'cublasSetVectorAsync: Dvec_a')
          istat=cublasSetVectorAsync(sizeD,dblBytes,vec_b,1,
     >                               Dvec_b,1,streamB)
          call check(istat,'cublasSetVectorAsync: Dvec_b')
!---------transfer vec_c
          istat=cublasSetVectorAsync(sizeD,dblBytes,vec_c,1,
     >                               Dvec_c,1,streamC)
          call check(istat,'cublasSetVectorAsync: Dvec_c')
!---------transfer one
          istat=cublasSetVectorAsync(sizeD,dblBytes,one,1,
     >                               Done,1,streamC)
          call check(istat,'cublasSetVectorAsync: Done')
!---------Dvec_c is filled on streamC but read by the dot products
!---------on streamA and streamB: wait for the streamC copies
          istat=cudaStreamSynchronize(streamC)
          call check(istat,'cudaStreamSynchronize: streamC')
        endif
!=======do dot product on GPU and store scalar on GPU
        istat=cublasSetStream(h,streamA)
        call check(istat,'cublasSetStream: StreamA: 1')
        istat=cublasDdot_v2(h,sizeD,Dvec_a,1,Dvec_c,1,
     >                      DdotproductA(i))
        call check(istat,'cublasDdot_v2: DdotproductA')
        istat=cublasSetStream(h,streamB)
        call check(istat,'cublasSetStream: StreamB: 1')
        istat=cublasDdot_v2(h,sizeD,Dvec_b,1,Dvec_c,1,
     >                      DdotproductB(i))
        call check(istat,'cublasDdot_v2: DdotproductB')
!=======the next pass overwrites the host and device vectors, so
!=======both dot products must have finished reading them
        istat=cudaStreamSynchronize(streamA)
        call check(istat,'cudaStreamSynchronize: streamA')
        istat=cudaStreamSynchronize(streamB)
        call check(istat,'cudaStreamSynchronize: streamB')
      end do
!=====do dot product on GPU and store scalar on CPU
      istat=cublasSetPointerMode(h,CUBLAS_POINTER_MODE_HOST)
      call check(istat,'cublasSetPointerMode: HOST')
!-----do sum reductions with cublas by doing a dot product with a
!-----1 vector using 0 stride
      istat=cublasSetStream(h,streamA)
      call check(istat,'cublasSetStream: StreamA: 2')
      istat=cublasDdot_v2(h,sizeD,Done,0,DdotproductA,1,
     >                    TdotproductA)
      call check(istat,'cublasDdot_v2: TdotproductA')
!-----DdotproductB was produced on streamB, so reduce it there
      istat=cublasSetStream(h,streamB)
      call check(istat,'cublasSetStream: StreamB: 2')
      istat=cublasDdot_v2(h,sizeD,Done,0,DdotproductB,1,
     >                    TdotproductB)
      call check(istat,'cublasDdot_v2: TdotproductB')
      Tdotproduct=TdotproductA+TdotproductB
      write(*,*) 'total on gpu := ', Tdotproduct
      if (nerr.ne.0) write(*,*) 'failed CUDA/CUBLAS calls := ', nerr
!-----release streams, handle and memory
      istat=cudaStreamDestroy(streamA)
      istat=cudaStreamDestroy(streamB)
      istat=cudaStreamDestroy(streamC)
      istat=cublasDestroy(h)
      deallocate(Dvec_a,Dvec_b,Dvec_c,DdotproductA,DdotproductB,Done)
      deallocate(vec_a,vec_b,vec_c,dotproductA,dotproductB,one)
!-----shutdown cublas
      istat=cublasShutdown()
!
      contains
!
!-----Report one failed CUDA/CUBLAS call.
!     stat  : status returned by the call (0 means success)
!     label : text identifying the call in the message
!     A non-zero stat is printed with its label and counted in nerr;
!     execution continues so that later calls are still attempted.
      subroutine check(stat, label)
      integer, intent(in) :: stat
      character(len=*), intent(in) :: label
      if (stat.ne.0) then
        nerr=nerr+1
        write(*,*) label, stat
      endif
      end subroutine check
!
      end program test
pgfortran -Bstatic -Mcuda -ta=nvidia,nowait -Wl,/libpath:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\lib\x64" cublas.lib program.f