!> Central definition of the real kind used throughout the program.
!! Change the single `fp_kind` line below to rebuild everything in
!! double precision.
module precision_m
  implicit none

  !> Kind of a default-real literal.
  integer, parameter :: singlePrecision = kind(0.0)
  !> Kind of a double-precision literal.
  integer, parameter :: doublePrecision = kind(0.d0)

  !> Working precision selected for all floating-point data.
  integer, parameter :: fp_kind = singlePrecision
  ! integer, parameter :: fp_kind = doublePrecision
end module precision_m
!> Module holding the CUDA Fortran test kernel.
module kmod
  use cudafor
  use precision_m
  implicit none
contains

  !> Test kernel: each thread handles one element (y, x).
  !!
  !! For N = 0..8 the thread stores the value x + y + N into fN_dev1(y, x)
  !! and then copies that element into fN_dev(y, x).
  !!
  !! Thread-to-element mapping:
  !!   x comes from the x-dimension of the launch grid and indexes the
  !!     SECOND array dimension (extent xDim);
  !!   y comes from the y-dimension of the launch grid and indexes the
  !!     FIRST array dimension (extent yDim).
  !! The launch grid must therefore cover xDim threads in x and yDim
  !! threads in y; surplus threads are masked out by the bounds check.
  !!
  !! @param f0_dev..f8_dev    result arrays, shape (yDim, xDim)
  !! @param f0_dev1..f8_dev1  staging arrays, shape (yDim, xDim)
  !! @param xDim, yDim        array extents, passed by value
  attributes(global) subroutine cuda_test(f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, &
      f5_dev, f6_dev, f7_dev, f8_dev, &
      f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, &
      f5_dev1, f6_dev1, f7_dev1, f8_dev1, &
      xDim, yDim)
    implicit none
    integer, value :: xDim, yDim
    ! intent(inout): a single thread writes only its own element, so the
    ! rest of each array must keep whatever the other threads stored.
    real(fp_kind), intent(inout) :: f0_dev(yDim, xDim), f1_dev(yDim, xDim), &
                                    f2_dev(yDim, xDim), f3_dev(yDim, xDim), &
                                    f4_dev(yDim, xDim), f5_dev(yDim, xDim), &
                                    f6_dev(yDim, xDim), f7_dev(yDim, xDim), &
                                    f8_dev(yDim, xDim)
    real(fp_kind), intent(inout) :: f0_dev1(yDim, xDim), f1_dev1(yDim, xDim), &
                                    f2_dev1(yDim, xDim), f3_dev1(yDim, xDim), &
                                    f4_dev1(yDim, xDim), f5_dev1(yDim, xDim), &
                                    f6_dev1(yDim, xDim), f7_dev1(yDim, xDim), &
                                    f8_dev1(yDim, xDim)
    integer :: x, y, base

    ! Global 1-based indices of this thread (blockIdx/threadIdx start at 1,
    ! so x and y are always >= 1 and only the upper bounds need checking).
    x = (blockIdx%x - 1)*blockDim%x + threadIdx%x
    y = (blockIdx%y - 1)*blockDim%y + threadIdx%y

    if (x <= xDim .and. y <= yDim) then
      base = x + y

      ! Convert with the working kind so the values follow fp_kind when the
      ! precision is switched in precision_m.
      f0_dev1(y, x) = real(base,     fp_kind)
      f1_dev1(y, x) = real(base + 1, fp_kind)
      f2_dev1(y, x) = real(base + 2, fp_kind)
      f3_dev1(y, x) = real(base + 3, fp_kind)
      f4_dev1(y, x) = real(base + 4, fp_kind)
      f5_dev1(y, x) = real(base + 5, fp_kind)
      f6_dev1(y, x) = real(base + 6, fp_kind)
      f7_dev1(y, x) = real(base + 7, fp_kind)
      f8_dev1(y, x) = real(base + 8, fp_kind)

      ! Copy the staged element into the result arrays.
      f0_dev(y, x) = f0_dev1(y, x)
      f1_dev(y, x) = f1_dev1(y, x)
      f2_dev(y, x) = f2_dev1(y, x)
      f3_dev(y, x) = f3_dev1(y, x)
      f4_dev(y, x) = f4_dev1(y, x)
      f5_dev(y, x) = f5_dev1(y, x)
      f6_dev(y, x) = f6_dev1(y, x)
      f7_dev(y, x) = f7_dev1(y, x)
      f8_dev(y, x) = f8_dev1(y, x)
    end if
  end subroutine cuda_test

end module kmod
!> Host driver for the cuda_test kernel.
!!
!! Allocates eighteen device arrays of shape (yDim, xDim), launches the
!! kernel, copies the nine result arrays back into f(:, :, 0:8) and checks
!! that every element equals x + y + i, printing "error occurs" for each
!! mismatch.  Allocation, launch and execution failures are reported and
!! terminate the program instead of being silently ignored.
program cudatest
  use precision_m
  use kmod
  implicit none

  real(fp_kind), device, dimension(:,:), allocatable :: &
      f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, f5_dev, f6_dev, f7_dev, f8_dev, &
      f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, f5_dev1, f6_dev1, f7_dev1, f8_dev1
  real(fp_kind), dimension(:,:,:), allocatable :: f
  integer :: i, istat, x, y, xDim=100, yDim=100
  type(dim3) :: dimGrid, dimBlock

  ! Device arrays: nine result arrays and nine staging arrays.
  allocate(f0_dev(yDim, xDim), f1_dev(yDim, xDim), f2_dev(yDim, xDim), &
           f3_dev(yDim, xDim), f4_dev(yDim, xDim), f5_dev(yDim, xDim), &
           f6_dev(yDim, xDim), f7_dev(yDim, xDim), f8_dev(yDim, xDim), &
           stat=istat)
  if (istat /= 0) then
    write(*,*) "device allocation failed, stat =", istat
    stop 1
  end if
  allocate(f0_dev1(yDim, xDim), f1_dev1(yDim, xDim), f2_dev1(yDim, xDim), &
           f3_dev1(yDim, xDim), f4_dev1(yDim, xDim), f5_dev1(yDim, xDim), &
           f6_dev1(yDim, xDim), f7_dev1(yDim, xDim), f8_dev1(yDim, xDim), &
           stat=istat)
  if (istat /= 0) then
    write(*,*) "device allocation failed, stat =", istat
    stop 1
  end if

  ! Host array receiving the nine result planes.
  allocate(f(yDim, xDim, 0:8), stat=istat)
  if (istat /= 0) then
    write(*,*) "host allocation failed, stat =", istat
    stop 1
  end if

  ! The kernel maps grid-x to the xDim extent and grid-y to the yDim extent.
  dimBlock = dim3(20, 20, 1)
  dimGrid = dim3(ceiling(real(xDim)/dimBlock%x), ceiling(real(yDim)/dimBlock%y), 1)

  call cuda_test<<<dimGrid,dimBlock>>>(f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, &
      f5_dev, f6_dev, f7_dev, f8_dev, &
      f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, &
      f5_dev1, f6_dev1, f7_dev1, f8_dev1, &
      xDim, yDim)

  ! A kernel launch returns no status of its own: query it explicitly so a
  ! bad launch configuration is reported rather than showing up later as
  ! wrong data.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    write(*,*) "kernel launch failed: ", trim(cudaGetErrorString(istat))
    stop 1
  end if

  ! Wait for the kernel and pick up errors raised during execution.
  ! NOTE: on CUDA runtimes older than 4.0 the equivalent call is
  ! cudaThreadSynchronize().
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    write(*,*) "kernel execution failed: ", trim(cudaGetErrorString(istat))
    stop 1
  end if

  ! Device-to-host copies of the result arrays.
  f(:, :, 0) = f0_dev
  f(:, :, 1) = f1_dev
  f(:, :, 2) = f2_dev
  f(:, :, 3) = f3_dev
  f(:, :, 4) = f4_dev
  f(:, :, 5) = f5_dev
  f(:, :, 6) = f6_dev
  f(:, :, 7) = f7_dev
  f(:, :, 8) = f8_dev

  ! Verify every element against the value the kernel should have written.
  do x = 1, xDim
    do y = 1, yDim
      do i = 0, 8
        if (int(f(y,x,i)) /= x+y+i) then
          write(*,*) "error occurs", f(y,x,i), x+y+i, x, y, i
        end if
      end do
    end do
  end do

  deallocate(f0_dev, f0_dev1, f1_dev, f1_dev1, f2_dev, f2_dev1, &
             f3_dev, f3_dev1, f4_dev, f4_dev1, f5_dev, f5_dev1, &
             f6_dev, f6_dev1, f7_dev, f7_dev1, f8_dev, f8_dev1)
  deallocate(f)
end program cudatest
I need help. The GPU card is:
Device 0: “GeForce GTX 580”
CUDA Driver Version: 4.0
CUDA Runtime Version: 3.20
CUDA Capability Major revision number: 2
CUDA Capability Minor revision number: 0
Total amount of global memory: 1609760768 bytes
Number of multiprocessors: 16
Number of cores: 512
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
The program does not always work correctly.
Is there a problem in my code? Thank you.
Also, if I decrease the number of threads per block:
dimBlock = dim3(20, 20, 1) ==> dimBlock = dim3(5, 5, 1)
the errors occur less frequently.
[/list][/code]