cublasSetVectorAsync

I am trying to make use of the cublasSetVectorAsync command.

My code will compute a series of dot products and then accumulate the total from these dot products.

The CPU code will print the total first followed by the GPU code which makes use of CUBLAS.

If lines 39-41 is set to:

useNoCublas=.true.
useCublas=.false.
useCublasAsync=.false.

the code runs fine and the CPU and GPU produces similar results. This will force the code to use no cublas calls for transferring data and uses ‘=’ instead to transfer data from the host to the device.

If lines 39-41 is set to:

useNoCublas=.false.
useCublas=.true.
useCublasAsync=.false.

the code runs fine as well. This will force the code use cublasSetVector.

If lines 39-41 is set to:

useNoCublas=.false.
useCublas=.true.
useCublasAsync=.false.

the code segfaults at line 106 when it tries to use cublasSetVectorAsync.

I have error checks for creating the three CUDA streams that are used in the code: streamA, streamB, and streamC. So, I am puzzled as to why I am getting segmentation faults when I try to run this code.

Any ideas?

program.f

      program test
 !
      use cudafor
      use cublas
!      
      implicit none
!-----vector sizeD used in dot product
      integer, parameter :: sizeD = 1000
!-----on the host
      double precision, allocatable, dimension (:) :: 
     > vec_a, vec_b, vec_c, dotproductA, dotproductB, one
!-----stores reduction total of DdotproductA and DdotproductB
      double precision :: TdotproductA, TdotproductB
!-----stores total
      double precision :: Tdotproduct
      logical :: useNoCublas, useCublas, useCublasAsync
!-----on the device
      double precision, allocatable, dimension (:), device :: 
     > Dvec_a, Dvec_b, Dvec_c, DdotproductA, DdotproductB, Done
      integer :: i, istat
!-----handle to cublas
      type(cublashandle) :: h
!-----CUDA streams
      integer :: streamA, streamB, streamC
!-----allocate on CPU
      allocate(one(sizeD))
      allocate(vec_a(sizeD),vec_b(sizeD),vec_c(sizeD))
      allocate(dotproductA(sizeD),dotproductB(sizeD))
!-----allocate on GPU
      allocate(Done(sizeD))
      allocate(Dvec_a(sizeD),Dvec_b(sizeD),Dvec_c(sizeD))
      allocate(DdotproductA(sizeD),DdotproductB(sizeD))
!-----initialize variables
      istat=0
      Tdotproduct=0.0D+00
      TdotproductA=0.0D+00
      TdotproductB=0.0D+00
!-----set logicals (only one can be true)
      useNoCublas=.false.
      useCublas=.false.
      useCublasAsync=.true.
!-----startup cublas
      istat=istat+cublasInit()
      if(istat.ne.0) write(*,*) 'cublasInit'
      h=cublasgethandle()
      istat=istat+cublasCreate(h)
      if(istat.ne.0) write(*,*) 'cublasCreate'
!-----create streams
      istat=istat+cudaStreamCreate(streamA)
      istat=istat+cudaStreamCreate(streamB)
      istat=istat+cudaStreamCreate(streamC)
      if(istat.ne.0) write(*,*) 'cudaStreamsCreate'
!=====CPU
      do i=1, sizeD
        call random_seed()
        call random_number(vec_a)
        call random_number(vec_b)
        call random_number(vec_c)
        dotproductA(i)=dot_product(vec_a,vec_c)
        dotproductB(i)=dot_product(vec_b,vec_c)
      end do
      TdotproductA=sum(dotproductA(1:sizeD),1)
      TdotproductB=sum(dotproductB(1:sizeD),1)
      Tdotproduct=TdotproductA+TdotproductB
      write(*,*) 'total on cpu := ', Tdotproduct
!=====GPU
!-----initialize variables
      one=1.0D+00
      Tdotproduct=0.0D+00
      TdotproductA=0.0D+00
      TdotproductB=0.0D+00
!
      do i=1, sizeD
        call random_seed()
        call random_number(vec_a)
        call random_number(vec_b)
        call random_number(vec_c)
!=======synchronous using no cublas calls
        if (useNoCublas) then
          Dvec_a=vec_a
          Dvec_b=vec_b
          Dvec_c=vec_c
          Done=one
        endif
!=======synchronous using cublas calls        
        if (useCublas) then
!-------transfer vec_a and vec_b
          istat=istat+
     >     cublasSetVector(sizeD,8,vec_a,1,Dvec_a,1)
          if(istat.ne.0) write(*,*) 'cublasSetVectorAsync: Dvec_a'
          istat=istat+
     >     cublasSetVector(sizeD,8,vec_b,1,Dvec_b,1)
          if(istat.ne.0) write(*,*) 'cublasSetVectorAsync: Dvec_b'
!-------transfer vec_c
          istat=istat+
     >     cublasSetVector(sizeD,8,vec_c,1,Dvec_c,1)
          if(istat.ne.0) write(*,*) 'cublasSetVector: Dvec_c'
!-------transfer one
          istat=istat+
     >     cublasSetVector(sizeD,8,one,1,Done,1)
          if(istat.ne.0) write(*,*) 'cublasSetVector: Done'
        endif
!=======asynchronous using cublas calls
!-------transfer vec_a and vec_b
        if (useCublasAsync) then
          istat=istat+
     >     cublasSetVectorAsync(sizeD,8,vec_a,1,Dvec_a,1,streamA)
          if(istat.ne.0) write(*,*) 'cublasSetVectorAsync: Dvec_a',istat
          istat=istat+
     >     cublasSetVectorAsync(sizeD,8,vec_b,1,Dvec_b,1,streamB)
          if(istat.ne.0) write(*,*) 'cublasSetVectorAsync: Dvec_b'
!---------transfer vec_c
          istat=istat+
     >     cublasSetVectorAsync(sizeD,8,vec_c,1,Dvec_c,1,streamC)
          if(istat.ne.0) write(*,*) 'cublasSetVector: Dvec_c'            
!---------transfer one
          istat=istat+
     >     cublasSetVectorAsync(sizeD,8,one,1,Done,1,streamC)
          if(istat.ne.0) write(*,*) 'cublasSetVector: Done'
        endif
!=======do dot product on GPU and store scalar on GPU
        istat=istat+
     >    cublasSetPointerMode(h,CUBLAS_POINTER_MODE_DEVICE) 
        if(istat.ne.0) write(*,*) 'cublasSetPointerMode: DEVICE'
        istat=istat+
     >    cublasSetStream(h,streamA)
        if(istat.ne.0) write(*,*) 'cublasSetStream: StreamA: 1'
        istat=istat+
     >    cublasDdot_v2(h,sizeD,Dvec_a,1,Dvec_c,1,DdotproductA(i))
        if(istat.ne.0) write(*,*) 'cublasDdot_v2: DdotproductA'            
        istat=istat+
     >    cublasSetStream(h,streamB)
        if(istat.ne.0) write(*,*) 'cublasSetStream: StreamB: 1'
        istat=istat+
     >    cublasDdot_v2(h,sizeD,Dvec_b,1,Dvec_c,1,DdotproductB(i))
        if(istat.ne.0) write(*,*) 'cublasDdot_v2: DdotproductB'            
      end do    
!=====do dot product on GPU and store scalar on CPU
      istat=istat+
     >  cublasSetPointerMode(h,CUBLAS_POINTER_MODE_HOST) 
      if(istat.ne.0) write(*,*) 'cublasSetPointerMode: HOST'
!-----do sum reductions with cublas by doing a dot product with a 1 vector using 0 stride
      istat=istat+
     >  cublasSetStream(h,streamA)
      if(istat.ne.0) write(*,*) 'cublasSetStream: StreamA: 2'
      istat=istat+
     >  cublasDdot_v2(h,sizeD,Done,0,DdotproductA,1,TdotproductA)
      if(istat.ne.0) write(*,*) 'cublasDdot_v2: TdotproductA'            
      istat=istat+
     >  cublasSetStream(h,streamA)
      if(istat.ne.0) write(*,*) 'cublasSetStream: StreamB: 2'
      istat=istat+
     >  cublasDdot_v2(h,sizeD,Done,0,DdotproductB,1,TdotproductB)
      if(istat.ne.0) write(*,*) 'cublasDdot_v2: TdotproductB'            
      Tdotproduct=TdotproductA+TdotproductB      
      write(*,*) 'total on gpu := ', Tdotproduct
!-----shutdown cublas
      istat=istat+cublasShutdown()    
      end

pgfortran -Bstatic -Mcuda -ta=nvidia,nowait -Wl,/libpath:“C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\lib\x64” cublas.lib program.f

Hi Sarom,

In looking at the cublas interface module that we provide, we’re expecting “stream” to be the same size as a C pointer. Since you’re in 64-bits, this means changing the declaration of your Stream variables to “integer(8)”. Though to be portable with 32-bits, I’d recommend using the iso_c_binding module and “integer(c_intptr_t)”.

I’ll ask if there is something we can do in the CUBLAS interface to handle this.

  • Mat
      program test
 !
      use iso_c_binding
      use cudafor
      use cublas
... cut
!-----CUDA streams
      integer(c_intptr_t) :: streamA, streamB, streamC
...

Actually, the CUDA Fortran way is:

integer(kind=cuda_stream_kind) :: streamA, streamB, streamC

That should work. Let me know if you have other problems