This is a relatively complete code. It prints correctly during the calculation process, but after the calculation is complete and the output data is printed, all the printed results are zeros.
allocate(array_sp1_dv(GPU_COUNT,chunk_size,xn),array_dir1_dv(GPU_COUNT,chunk_size,xn),array_sp2_dv(GPU_COUNT,chunk_size,xn),array_dir2_dv(GPU_COUNT,chunk_size,xn),CC1_dv(GPU_COUNT,4,chunk_size,xn),CC2_dv(GPU_COUNT,4,chunk_size,xn)) !output
call divide_data(data1, data2, data3, lon,lat, GPU_COUNT, data1_dv, data2_dv, data3_dv, lon_dv, lat_dv)
do device_num = 0, GPU_COUNT - 1
! enter data for each GPU
!$ACC set device_num(device_num)
!$acc enter data copyin(data1_dv(device_num+1,:,:), data2_dv(device_num+1,:,:), data3_dv(device_num+1,:,:), lon_dv(device_num+1,:,:), lat_dv(device_num+1,:,:), qf) &
!$acc& create(array_sp1_dv(device_num+1,:,:), array_dir1_dv(device_num+1,:,:),array_sp2_dv(device_num+1,:,:), array_dir2_dv(device_num+1,:,:), CC1_dv(device_num+1,:,:,:), CC2_dv(device_num+1,:,:,:)) async(device_num+1)
end do
!$acc wait
do device_num = 0, GPU_COUNT - 1
call calculate(data1_dv(device_num+1,:,:), data2_dv(device_num+1,:,:), data3_dv(device_num+1,:,:), x_dv(device_num+1,:,:), y_dv(device_num+1,:,:), qf,&
device_num, GPU_COUNT,xn, sbox_width, bbox_width, delta_t12, overlap_flag,array_sp1_dv(device_num+1,:,:), array_dir1_dv(device_num+1,:,:), &
array_sp2_dv(device_num+1,:,:), array_dir2_dv(device_num+1,:,:), CC1_dv(device_num+1,:,:,:), CC2_dv(device_num+1,:,:,:))
end do
!$acc wait
do device_num = 0, GPU_COUNT - 1
!$ACC set device_num(device_num)
!$acc exit data delete(data1_dv(device_num+1,:,:), data2_dv(device_num+1,:,:), data3_dv(device_num+1,:,:), x_dv(device_num+1,:,:), y_dv(device_num+1,:,:), qf)&
!$acc& copyout(array_sp1_dv(device_num+1,:,:), array_dir1_dv(device_num+1,:,:),array_sp2_dv(device_num+1,:,:), array_dir2_dv(device_num+1,:,:), CC1_dv(device_num+1,:,:,:), CC2_dv(device_num+1,:,:,:)) async(device_num+1)
end do
!$acc wait
!here print array_sp1_dv is zero
print*,'xx',size(array_sp1_dv,1),size(array_sp1_dv,2),size(array_sp1_dv,3),array_sp1_dv(:,1000,1200)
do nn = 1,size(array_sp1_dv,2)
do mm = 1,size(array_sp1_dv,3)
if(array_sp1_dv(2,nn,mm) > 0 .and. array_sp1_dv(2,nn,mm) <= 100) then
print*,'sp test xx',nn,mm,array_sp1_dv(2,nn,mm),size(array_sp1_dv,1),size(array_sp1_dv,2),size(array_sp1_dv,3)
end if
end do
end do
do device_num = 0, GPU_COUNT - 1
start_row = device_num * chunk_size + 1
end_row = (device_num + 1) * chunk_size ! Ensure we don't exceed nLine
nrows = end_row - start_row + 1
array_sp1(start_row:end_row, :) = array_sp1_dv(device_num+1,:,:)
array_dir1(start_row:end_row, :) = array_dir1_dv(device_num+1,:,:)
array_sp2(start_row:end_row, :) = array_sp2_dv(device_num+1,:,:)
array_dir2(start_row:end_row, :) = array_dir2_dv(device_num+1,:,:)
CC1(:,start_row:end_row, :) = CC1_dv(device_num+1,:,:,:)
CC2(:,start_row:end_row, :) = CC2_dv(device_num+1,:,:,:)
end do
subroutine calculate(data1, data2, data3, lon, lat,qf, device_num,GPU_COUNT,line, sbox_size, bbox_size, delta_t,overlap_flag, sp11, dir11, sp22, dir22, CC11, CC22)
implicit none
real, dimension(:,:), intent(in) :: data1, data2, data3
real, dimension(:,:), intent(in) :: lon, lat
integer(kind=1),dimension(:,:), intent(in) ::qf
integer, value,intent(in) :: device_num,GPU_COUNT,line, sbox_size, bbox_size
integer(kind=1), intent(in) :: overlap_flag
real, intent(in) :: delta_t
! -----------------
REAL, dimension(:,:), INTENT(OUT) :: sp11(:,:), dir11(:,:), sp22(:,:), dir22(:,:)
REAL, dimension(:,:,:), INTENT(OUT) :: CC11(:,:,:), CC22(:,:,:)
integer :: i, j, ii, jj, iii,istart1, jstart1, iend1, jend1, istart2, jstart2, tx1, ty1, tx2, ty2,txx1, tyy1, txx2, tyy2, tx, ty, nx, ny, boundary
integer :: x, y, xx, yy, tag1, tag2, pp, oo, status, chunk_size, istart, iend
real :: correlation1, correlation2, max_cc1, max_cc2
real :: percent,lon1, lat1, lon2, lat2, lon3, lat3
integer :: start_time, end_time, clock_rate, displace
real :: elapsed_time
real :: nan_value
REAL :: E_t, E_s, E_ts, sita_t, sita_s
nan_value = ieee_value(1.0, ieee_quiet_nan)
tag1 = 1
tag2 = 2
nx = bbox_size - sbox_size + 1
ny = bbox_size - sbox_size + 1
boundary = size(data,1)
chunk_size = line/GPU_COUNT
istart = device_num*chunk_size+1
iend = (device_num + 1) * chunk_size
displace = (bbox_size - sbox_size)/2
ALLOCATE(sub1(sbox_size, sbox_size), sub2(sbox_size, sbox_size), sub3(sbox_size, sbox_size), STAT=status)
!$ACC set device_num(device_num)
!$acc parallel loop gang vector collapse(2) private(sub1, sub2, sub3, max_cc1, max_cc2, istart2, jstart2, tx, ty, tx1, ty1, tx2, ty2,txx1,tyy1,txx2,tyy2) firstprivate(sbox_size, bbox_size) async(device_num+1)
do i = istart, iend
do j = 1, line
if(qf(i,j) /= 0) then
cycle
end if
if(device_num == 0) then
istart2 = (i-1) * sbox_size/(1+overlap_flag) + 1 - device_num*chunk_size
else
istart2 = (i-1) * sbox_size/(1+overlap_flag) + 1 - device_num*chunk_size + displace
end if
jstart2 = (j-1) * sbox_size/(1+overlap_flag) + 1
DO ii = 1, sbox_size
DO jj = 1, sbox_size
sub2(ii, jj) =data2(istart2 + ii - 1, jstart2 + jj - 1)
END DO
END DO
iii = i - device_num*chunk_size
max_cc1 = -1.0
max_cc2 = -1.0
do x = 1, (bbox_size - sbox_size + 1)
do y = 1, (bbox_size - sbox_size + 1)
istart1 = max(1, istart2 - (bbox_size-sbox_size)/2 + x - 1) !set left right boundary but not neccessary
jstart1 = max(1, jstart2 - (bbox_size-sbox_size)/2 + y - 1)
iend1 = min(istart1+sbox_size-1,boundary)
jend1 = min(jstart1+sbox_size-1,boundary)
DO xx = 1, iend1 - istart1 + 1
DO yy = 1, jend1 - jstart1 + 1
sub1(xx, yy) = data1(istart1 + xx - 1, jstart1 + yy - 1)
END DO
END DO
DO xx = 1, iend1 - istart1 + 1
DO yy = 1, jend1 - jstart1 + 1
sub3(xx, yy) = data3(istart1 + xx - 1, jstart1 + yy - 1)
END DO
END DO
E_t = SUM(sub2) / (sbox_size * sbox_size)
E_s = SUM(sub1) / (sbox_size * sbox_size)
E_ts = SUM((sub2 - E_t) * (sub1 - E_s)) / (sbox_size * sbox_size)
sita_t = SQRT(SUM((sub2 - E_t)**2) / (sbox_size * sbox_size))
sita_s = SQRT(SUM((sub1 - E_s)**2) / (sbox_size * sbox_size))
IF (sita_t == 0.0 .OR. sita_s == 0.0) THEN
correlation1 = 0.0
ELSE
correlation1 = E_ts / (sita_t * sita_s)
END IF
if(correlation1 .gt. 1) then
correlation1 = 0
end if
E_t = SUM(sub2) / (sbox_size * sbox_size)
E_s = SUM(sub3) / (sbox_size * sbox_size)
E_ts = SUM((sub2 - E_t) * (sub3 - E_s)) / (sbox_size * sbox_size)
sita_t = SQRT(SUM((sub2 - E_t)**2) / (sbox_size * sbox_size))
sita_s = SQRT(SUM((sub3 - E_s)**2) / (sbox_size * sbox_size))
IF (sita_t == 0.0 .OR. sita_s == 0.0) THEN
correlation2 = 0.0
ELSE
correlation2 = E_ts / (sita_t * sita_s)
END IF
if(correlation2 .gt. 1) then
correlation2 = 0
end if
IF (max_cc1 < correlation1) THEN
max_cc1 = correlation1
txx1 = x
tyy1 = y
END IF
IF (max_cc2 < correlation2) THEN
max_cc2 = correlation2
txx2 = x
tyy2 = y
end if
end do
end do
print*,'sp and dir', i,j,txx1,tyy1,sp11(iii,j), sp22(iii,j), dir11(iii,j), dir22(iii,j) !ok
end do
end do
!$acc end parallel
DEALLOCATE(sub1, sub2, sub3)
end subroutine calculate