cublas<t>dot_v2

The code below fails at line 37 where it tries to perform a dot product using cublas and tries to store the result into an element of an array on the device.

Is this not allowed?

      program test   ! reproducer: cublasDdot_v2 result into a device array element
!GPU\
      use cudafor
      use cublas
!GPU/
      implicit none
      integer, parameter :: size = 10
      double precision, dimension (size) :: vec_a, vec_b, dotproduct
      double precision, allocatable, dimension (:), device :: Dvec_a,
     > Dvec_b, Ddotproduct
      integer, i, j, istat, imode
      type(cublashandle) :: h
c     istat sums every status code; once nonzero all later checks print
      istat=0
c     Initialise cuBLAS, get a handle, allocate three device vectors
      istat=istat+cublasInit()
      if(istat.ne.0) write(*,*) 'cublasInit'
      h=cublasgethandle()
      istat=istat+cublasCreate(h)
      if(istat.ne.0) write(*,*) 'cublasCreate'
      istat=istat+cublasalloc(size,8,Dvec_a)   ! 8: presumably bytes per element
      istat=istat+cublasalloc(size,8,Dvec_b)      
      istat=istat+cublasalloc(size,8,Ddotproduct)      
      if(istat.ne.0) write(*,*) 'cublasalloc'
      
      do i=1, size      
        do j=1, size
          vec_a(j)=dble(j)
          vec_b(j)=dble(size-j)
        enddo       
#ifdef _ACCEL
        istat=istat+cublassetvector(size,8,vec_a,1,Dvec_a,1)   ! host vec_a -> Dvec_a
        if(istat.ne.0) write(*,*) 'set vector Dvec_a'
        istat=istat+cublassetvector(size,8,vec_b,1,Dvec_b,1)   ! host vec_b -> Dvec_b
        if(istat.ne.0) write(*,*) 'set vector Dvec_b'
        istat=istat+
     >  cublasDdot_v2(h,size,Dvec_a,1,Dvec_b,1,Ddotproduct(i))  !<< code crashes here
        if(istat.ne.0) write(*,*) 'dot product'
c       The result argument above is an element of the device array
c       Ddotproduct.  No pointer mode is set anywhere in this program;
c       the replies in this thread suggest either passing a host
c       variable or selecting CUBLAS_POINTER_MODE_DEVICE first.
c       NOTE(review): dotproduct(i) has not been assigned before the
c       next line, and the cublasgetvector call after the loop
c       overwrites the whole host array anyway.
        dotproduct(i)=dotproduct(i)+dble(i)
#else
c       Host-only path: intrinsic dot product plus the iteration index.
        dotproduct(i)=dot_product(vec_a,vec_b)+dble(i)
#endif
      enddo
      
#ifdef _ACCEL
c       Copy the contents of Ddotproduct into the host array dotproduct.
        istat=istat+cublasgetvector(size,8,Ddotproduct,1,dotproduct,1)
        if(istat.ne.0) write(*,*) 'get vector'     
#endif
      
      write(*,*) 'dotproduct := ', dotproduct
      
      istat=istat+cublasShutdown()
      
      end

Also, the cublas 4.0 manual states that cublasAlloc and cublasFree have been deprecated. Would the “in thing” be to use allocate and deallocate or cudaMalloc and cudaFree?

If either could be used, is there a benefit to using one over the other (e.g. for 1D arrays vs. 2D or 3D arrays)? I do gather that their usage should not be mixed.

Hi Sarom,

The seg fault is being caused by passing in “Ddotproduct(i)” as the result. To fix it, use “dotproduct” instead.

Also, the cublas 4.0 manual states that cublasAlloc and cublasFree have been deprecated. Would the “in thing” be to use allocate and deallocate or cudaMalloc and cudaFree?

I’m not much of an expert on using cuBlas so don’t know for sure, but using “allocate” works for me and simplifies things.

For example:

% cat blas.F 
      program test
c     Computes the dot product of two fixed test vectors once per loop
c     iteration with cublasDdot_v2, adds the iteration index to each
c     result and prints the host array.  When the CUDA preprocessor
c     macro is not defined the intrinsic dot_product is used instead.
!GPU\
      use cudafor
      use cublas
!GPU/
      implicit none
      integer, parameter :: size = 10
      double precision, dimension (size) :: vec_a, vec_b, dotproduct
      double precision, allocatable, dimension (:), device :: Dvec_a,
     > Dvec_b
      integer :: i, j, istat
      type(cublashandle) :: h
c
c     istat accumulates every status code, so once any call fails all
c     later checks report as well.
      istat=0
c
      istat=istat+cublasInit()
      if(istat.ne.0) write(*,*) 'cublasInit'
      h=cublasgethandle()
      istat=istat+cublasCreate(h)
      if(istat.ne.0) write(*,*) 'cublasCreate'
c     Device work vectors come from a plain allocate; the result array
c     stays on the host.
      allocate(Dvec_a(size), Dvec_b(size))

      do i=1, size
        do j=1, size
          vec_a(j)=dble(j)
          vec_b(j)=dble(size-j)
        enddo
#ifdef _CUDA
c       Host-to-device copies by array assignment.
        Dvec_a=vec_a
        Dvec_b=vec_b
c       The result argument is a host array element.  No pointer mode
c       is set in this program, so the cuBLAS default (host pointer
c       mode) applies to the scalar result.
        istat=istat+
     >  cublasDdot_v2(h,size,Dvec_a,1,Dvec_b,1,dotproduct(i))
        if(istat.ne.0) write(*,*) 'dot product'
        dotproduct(i)=dotproduct(i)+dble(i)
#else
        dotproduct(i)=dot_product(vec_a,vec_b)+dble(i)
#endif
      enddo

c     No device-to-host copy-back is needed: every result was written
c     straight into the host array inside the loop.
      write(*,*) 'dotproduct := ', dotproduct

      deallocate(Dvec_a, Dvec_b)
      istat=istat+cublasShutdown()

      end
% pgf90 blas.F  -Mcuda -lcublas
% a.out
 dotproduct :=     166.0000000000000         167.0000000000000      
    168.0000000000000         169.0000000000000         170.0000000000000      
    171.0000000000000         172.0000000000000         173.0000000000000      
    174.0000000000000         175.0000000000000
  • Mat

Thanks Mat.

The array dotproduct is allocated on the host.

The cublas 4.0 library manual states that the result of cublasdot can be on either the device or the host.

I would like to store the result in an array allocated on the device, Ddotproduct.

Does the Fortran interface provided by the cublas module for the _v2 cublas calls not permit storing the results on the device?

Add these lines:

istat=istat+cublasSetPointerMode(h,CUBLAS_POINTER_MODE_DEVICE)
if(istat.ne.0) write(*,*) 'cublassetpointermode'

Thanks!

Does changing the pointer mode affect cublasSet*, cublasGet* routines and their async counterpart?

I believe it only applies to scalars, such as the results of the min, max, and dotp reductions, and the alpha and beta inputs to routines like dgemm.

In the CUBLAS 4.0 document, look for “host or device” under memory in the parameter tables for each function