Hi…
The following code showed me the correct results while launching the kernel with thread number 256 per block on the GTX260:
CALL gpu_bilinear_interpolation<<<Ndem/256+1, 256>>>(Ndem)
But it failed with the max. threads per block 512. All the d_v(k) were zero.
CALL gpu_bilinear_interpolation<<<Ndem/512+1, 512>>>(Ndem)
Did I do anything wrong?
I use pvf 10.8 on the Win 7.
Thank you in advance.
module dev_bilinear
use cudafor
implicit none
integer::done
real*8,allocatable,device :: d_bx(:,:), d_by(:,:), d_x(:), d_y(:)
real*8,allocatable,device :: d_bv(:,:), d_v(:)
contains
!============================================================================
! The subroutine is to do the interpolation of station o using bilinear method
!==============================================================================
Attributes(global) SUBROUTINE gpu_bilinear_interpolation(Ndem)
use cudadevice
implicit none
integer,value :: Ndem
real*8,device :: p1, p2, p3, p4, q1, q2, q3, q4
real*8,device :: wl1, wl3, d14, wx1, wx4, v1, v4
logical,device :: ck, getv
integer,device :: i, k
k=threadidx%x+ (blockidx%x-1)*blockdim%x
if( k <= Ndem )then
getv=.false.
do i=1, 4
if (d_x(k) == d_bx(i,k) .AND. d_y(k) == d_by(i,k)) then
d_v(k) = d_bv(i,k)
getv=.true.
end if
end do
IF( .not. getv )then
call gpu_get_intersection_of_two_lines(d_bx(1,k), d_by(1,k), d_bx(2,k), d_by(2,k), d_x(k), d_y(k), d_x(k), d_y(k)-1.d0, p1, q1, ck)
call gpu_get_intersection_of_two_lines(d_bx(3,k), d_by(3,k), d_bx(4,k), d_by(4,k), d_x(k), d_y(k), d_x(k), d_y(k)+1.d0, p4, q4, ck)
wl1 = dsqrt( (p1-d_bx(1,k))**2.d0 +(q1-d_by(1,k))**2.d0 ) / dsqrt( (d_bx(2,k)-d_bx(1,k))**2.d0 + (d_by(2,k)-d_by(1,k))**2.d0 )
wl3 = dsqrt( (p4-d_bx(3,k))**2.d0 +(q4-d_by(3,k))**2.d0 ) / dsqrt( (d_bx(4,k)-d_bx(3,k))**2.d0 + (d_by(4,k)-d_by(3,k))**2.d0 )
d14 = dsqrt( (p1-p4)*(p1-p4) +(q1-q4)*(q1-q4))
wx1 = dsqrt( (p1-d_x(k))**2.d0 + (q1-d_y(k))**2.d0)/d14
wx4 = dsqrt( (p4-d_x(k))**2.d0 + (q4-d_y(k))**2.d0 )/d14
v1 = wl1*(d_bv(2,k)-d_bv(1,k))+d_bv(1,k)
v4 = wl3*(d_bv(4,k)-d_bv(3,k))+d_bv(3,k)
d_v(k) = v1*wx4 + v4*wx1
ENDIF
else
call syncthreads()
end if
END SUBROUTINE gpu_bilinear_interpolation
!======================================================================================
! The subroutine is to get the coordinates of the intersection point of two lines
!========================================================================================
Attributes(device) subroutine gpu_get_intersection_of_two_lines(x1, y1, x2, y2, x3, y3, x4, y4, x, y, ck)
implicit none
real*8,device :: x1, y1, x2, y2, x3, y3, x4, y4
real*8,device :: x, y
logical,device :: ck
real*8,device :: m1, m3
m1=1.0d+10
m3=1.0d+10
if (x1 /= x2) m1 = (y1-y2)/(x1-x2)
if (x3 /= x4) m3 = (y3-y4)/(x3-x4)
if (m1 == m3) then ! Two lines are parallel
ck = .false.
else
ck = .true.
if (x1 == x2) then ! Line1 is vertical
x = x1
y = m3*(x-x4)+y4
else if (x3 == x4) then ! Line2 is vertical
x = x3
y = m1*(x-x2)+y2
else
x = (m1*x2-m3*x4+y4-y2)/(m1-m3)
y = m1*(x-x2)+y2
end if
end if
end subroutine gpu_get_intersection_of_two_lines
end module dev_bilinear