Odd problem when using CULA

   !> Invert a random single-precision M x N (square) matrix twice and time both
   !> runs with CUDA events: once on the GPU through the CULA device interface
   !> (cula_device_sgetrf + cula_device_sgetri) and once on the host through
   !> LAPACK (sgetrf + sgetri).
   !>
   !> Fixes relative to the posted version:
   !>   * the cula_device_sgetri call was missing its closing parenthesis;
   !>   * the GPU stop event was never recorded, so the printed GPU time was
   !>     an undefined value;
   !>   * the status returned by cula_device_sgetrf was overwritten unchecked;
   !>   * the leading dimension handed to cula_device_sgetrf was N instead of M;
   !>   * the never-initialised host pivot array was copied to the device;
   !>   * the LAPACK info code was ignored.
   program cula_test

        use CULA_LAPACK_DEVICE_PGFORTRAN
        use cula_status
        use cula_type
        use cudafor

        implicit none

        integer, parameter :: m = 16
        integer, parameter :: n = 16

        integer :: status        ! CULA return code
        integer :: istat         ! CUDA runtime return code
        integer :: ii            ! LAPACK info code
        real    :: time          ! elapsed time in milliseconds

        ! Device copies of the matrix and of the pivot indices.
        real,    allocatable, dimension(:,:), device :: a_device
        integer, allocatable, dimension(:),   device :: tau_device

        real,    allocatable, dimension(:,:) :: a      ! input matrix (overwritten by the host inversion)
        real,    allocatable, dimension(:)   :: work   ! sgetri workspace
        real,    allocatable, dimension(:,:) :: b      ! GPU result copied back to the host
        real,    allocatable, dimension(:,:) :: c      ! a * b
        ! Despite its name this holds the pivot indices of the LU factorisation.
        integer, allocatable, dimension(:), target :: tau

        type(cudaevent) :: startevent, stopevent

        istat = cudaeventcreate(startevent)
        istat = cudaeventcreate(stopevent)

        allocate(a(m,n))
        allocate(work(m))
        allocate(b(m,n))
        allocate(c(m,n))
        allocate(tau(m))

        call random_number(a)

        ! ---------------- GPU path ----------------
        istat = cudaeventrecord(startevent,0)
        write(*,*) 'ALLOCATE DEVICE ARRAYS'
        allocate(a_device(m,n))
        allocate(tau_device(m))
        write(*,*) 'WRITING ARRAYS TO DEVICE'
        a_device = a
        ! The pivot array is produced by the factorisation, so the (unset)
        ! host array is deliberately not uploaded.

        write(*,*) 'INITIALIZING CULA'
        status = cula_initialize()
        print *,'bb'
        call cula_check_status(status)
        print *,status
        print *,'aa'
        write(*,*) 'CALLING CULA_DEVICE_SGETRF AND CULA_DEVICE_SGETRI'

        ! LU factorisation; the leading dimension of a_device is m.
        status = cula_device_sgetrf(m, n, a_device, m, tau_device)
        call cula_check_status(status)

        ! Inverse from the LU factors.
        status = cula_device_sgetri(m, a_device, m, tau_device)
        call cula_check_status(status)

        write(*,*) 'RETRIEVING DEVICE MEMORY TO HOST'
        b = a_device
        tau = tau_device

        ! Close the GPU timing interval before reporting it.
        istat = cudaeventrecord(stopevent,0)
        istat = cudaeventsynchronize(stopevent)
        istat = cudaeventelapsedtime(time,startevent,stopevent)

        call cula_shutdown()

        write(*,*)'time for gpu execution(ms):',time

        ! Product of the original matrix and the GPU result; must be formed
        ! before the host inversion below overwrites a.
        c = matmul(a,b)

        ! ---------------- CPU path ----------------
        istat = cudaeventrecord(startevent,0)
        call sgetrf(m,n,a,m,tau,ii)
        if (ii /= 0) write(*,*) 'sgetrf returned info =', ii
        call sgetri(m, a, m, tau, work, m, ii)
        if (ii /= 0) write(*,*) 'sgetri returned info =', ii
        istat = cudaeventrecord(stopevent,0)
        istat = cudaeventsynchronize(stopevent)
        istat = cudaeventelapsedtime(time,startevent,stopevent)
        write(*,*)'time for cpu execution(ms):',time

        ! Release CUDA events and explicitly allocated storage.
        istat = cudaeventdestroy(startevent)
        istat = cudaeventdestroy(stopevent)
        deallocate(a_device, tau_device)
        deallocate(a, work, b, c, tau)
   end program cula_test

I have encountered the following two problems:
1. When I compile this program with PGI 2014, it fails to initialize CULA at run time. However, it succeeds when I compile the program with PGI 2015.
2. The Intel MKL routines fail to launch when my array size is larger, while nothing goes wrong with a small array size. For example, with the array size set to 2 or 20 the program passes, but with 200 it fails.
I hope someone can answer my question.