How to do the double precision computing of sqrt() in GPU. I put the loop in GPU with CUDA, the results occurs slightly different. I do not know whether the error happened by accuracy problem(sqrt) or my mistake of computing error?

the original code:

```
dimension qj0(kdim,idim-1,5,4),fnu(0:jdim,0:kdim,0:idim)
do i=1,idim-1
do k=1,kdim-1
tt=gamma*qj0(k,i,5,1)/qj0(k,i,1,1)
fnu(0,k,i)=c2bp*tt*sqrt(tt)/(c2b+tt)
tt=gamma*qj0(k,i,5,3)/qj0(k,i,1,3)
fnu(jdim,k,i)=c2bp*tt*sqrt(tt)/(c2b+tt)
enddo
enddo
```

modified code:

```
dimension qj0(kdim,idim-1,5,4),fnu(0:jdim,0:kdim,0:idim)
type(dim3) :: grid,tBlock
real,device :: fnu_d(0:jdim,0:kdim,0:idim)
real,device::qj0_d(kdim,idim-1,5,4)
tBlock=dim3(16,32,1)
grid=dim3(ceiling(real(kdim-1)/tBlock%x),
+ceiling(real(idim-1)/tBlock%y),1)
istat=cudaMemcpy(qj0_d,qj0,size(qj0),cudaMemcpyHostToDevice)
istat=cudaMemcpy(fnu_d,fnu,size(fnu),cudaMemcpyHostToDevice)
call do434<<<grid,tBlock>>>(fnu_d,qj0_d,gamma,c2bp,c2b,jdim)
istat=cudaMemcpy(fnu,fnu_d,size(fnu_d),cudaMemcpyDeviceToHost)
```

device code:

```
attributes(global) subroutine do434(a,b,d,e,f,g)
implicit none
real::a(:,:,:)
real::b(:,:,:,:)
real,value::d,e,f
integer,value::g
integer::i,j,n(2)
real :: c
i=blockDim%x*(blockIdx%x-1)+threadIdx%x
j=blockDim%y*(blockIdx%y-1)+threadIdx%y
n(1)=size(a,2)-2
n(2)=size(a,3)-2
if(i<=n(1) .and. j<=n(2)) then
c=d*b(i,j,5,1)/b(i,j,1,1)
a(1,i,j)=e*c*sqrt(c)/(f+c)
c=d*b(i,j,5,3)/b(i,j,1,3)
a(g+1,i,j)=e*c*sqrt(c)/(f+c)
end if
end subroutine do434
```

I compiled the code with the following option.

```
mpif90 -DP3D_SINGLE -DPG -DDBLE_PRECSN -g -r8 -acc -ta=tesla -Mfixed -Minfo=accel -Mcuda -c twoeqn.F
```

the compared result