A strange problem

lishin · June 25, 2012, 8:57am

module precision_m
	integer , parameter :: singlePrecision = kind (0.0)
	integer , parameter :: doublePrecision = kind (0.d0)
	integer , parameter :: fp_kind = singlePrecision
	!integer , parameter :: fp_kind = doublePrecision
end module precision_m

module kmod
use cudafor
use precision_m
contains
attributes(global) subroutine cuda_test(f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, f5_dev, f6_dev, f7_dev, f8_dev, &
                                        f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, f5_dev1, f6_dev1, f7_dev1, f8_dev1, &
									    xDim, yDim)
implicit none
integer, value :: xDim, yDim
real(fp_kind) :: f0_dev(yDim, xDim), f1_dev(yDim, xDim), f2_dev(yDim, xDim), f3_dev(yDim, xDim), f4_dev(yDim, xDim), f5_dev(yDim, xDim), f6_dev(yDim, xDim), f7_dev(yDim, xDim), f8_dev(yDim, xDim)
real(fp_kind) :: f0_dev1(yDim, xDim), f1_dev1(yDim, xDim), f2_dev1(yDim, xDim), f3_dev1(yDim, xDim), f4_dev1(yDim, xDim), f5_dev1(yDim, xDim), f6_dev1(yDim, xDim), f7_dev1(yDim, xDim), f8_dev1(yDim, xDim)
integer :: x, y, i

x = (blockIdx%x-1)*blockDim%x + threadIdx%x
y = (blockIdx%y-1)*blockDim%y + threadIdx%y
if(x <= xDim .and. y <= yDim .and. x >= 1 .and. y >= 1) then
f0_dev1(y,x) = real(x+y+0)
f1_dev1(y,x) = real(x+y+1)
f2_dev1(y,x) = real(x+y+2)
f3_dev1(y,x) = real(x+y+3)
f4_dev1(y,x) = real(x+y+4)
f5_dev1(y,x) = real(x+y+5)
f6_dev1(y,x) = real(x+y+6)
f7_dev1(y,x) = real(x+y+7)
f8_dev1(y,x) = real(x+y+8)

f0_dev(y,x) = f0_dev1(y,x)
f1_dev(y,x) = f1_dev1(y,x)
f2_dev(y,x) = f2_dev1(y,x)
f3_dev(y,x) = f3_dev1(y,x)
f4_dev(y,x) = f4_dev1(y,x)
f5_dev(y,x) = f5_dev1(y,x)
f6_dev(y,x) = f6_dev1(y,x)
f7_dev(y,x) = f7_dev1(y,x)
f8_dev(y,x) = f8_dev1(y,x)
end if
end subroutine cuda_test
end module kmod




PROGRAM cudatest
use precision_m
use kmod
implicit none
real(fp_kind), device, dimension(:,:), allocatable:: f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, f5_dev, f6_dev, f7_dev, f8_dev, &
													   f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, f5_dev1, f6_dev1, f7_dev1, f8_dev1
real(fp_kind), dimension(:,:,:), allocatable:: f
integer :: i, istat, x, y, xDim=100, yDim=100
type(dim3) :: dimGrid, dimBlock

allocate(f0_dev(yDim, xDim))
allocate(f1_dev(yDim, xDim))
allocate(f2_dev(yDim, xDim))
allocate(f3_dev(yDim, xDim))
allocate(f4_dev(yDim, xDim))
allocate(f5_dev(yDim, xDim))
allocate(f6_dev(yDim, xDim))
allocate(f7_dev(yDim, xDim))
allocate(f8_dev(yDim, xDim))

allocate(f0_dev1(yDim, xDim))
allocate(f1_dev1(yDim, xDim))
allocate(f2_dev1(yDim, xDim))
allocate(f3_dev1(yDim, xDim))
allocate(f4_dev1(yDim, xDim))
allocate(f5_dev1(yDim, xDim))
allocate(f6_dev1(yDim, xDim))
allocate(f7_dev1(yDim, xDim))
allocate(f8_dev1(yDim, xDim))
allocate(f(yDim,xDim,0:8))
	


dimBlock = dim3(20, 20, 1)
dimGrid = dim3(ceiling(real(xDim)/dimBlock%x), ceiling(real(yDim)/dimBlock%y), 1)
call cuda_test<<<dimGrid,dimBlock>>>(f0_dev, f1_dev, f2_dev, f3_dev, f4_dev, f5_dev, f6_dev, f7_dev, f8_dev, &
                                        f0_dev1, f1_dev1, f2_dev1, f3_dev1, f4_dev1, f5_dev1, f6_dev1, f7_dev1, f8_dev1, &
									    xDim, yDim)						
f(1:yDim,1:xDim,0) = f0_dev
f(1:yDim,1:xDim,1) = f1_dev
f(1:yDim,1:xDim,2) = f2_dev
f(1:yDim,1:xDim,3) = f3_dev
f(1:yDim,1:xDim,4) = f4_dev
f(1:yDim,1:xDim,5) = f5_dev
f(1:yDim,1:xDim,6) = f6_dev
f(1:yDim,1:xDim,7) = f7_dev
f(1:yDim,1:xDim,8) = f8_dev

do x=1,xDim
	do y=1,yDim
		do i=0,8
			if(int(f(y,x,i)) /= x+y+i) then
			write(*,*) "error occurs", f(y,x,i), x+y+i, x, y, i
			end if
		enddo
	enddo
enddo

deallocate(f0_dev, f0_dev1, f1_dev, f1_dev1, f2_dev, f2_dev1, f3_dev, f3_dev1, f4_dev, f4_dev1, f5_dev, f5_dev1, f6_dev, f6_dev1, f7_dev, f7_dev1, f8_dev, f8_dev1)
deallocate(f)
end PROGRAM cudatest

I need help, the gpu card is

Device 0: “GeForce GTX 580”
CUDA Driver Version: 4.0
CUDA Runtime Version: 3.20
CUDA Capability Major revision number: 2
CUDA Capability Minor revision number: 0
Total amount of global memory: 1609760768 bytes
Number of multiprocessors: 16
Number of cores: 512
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes

and the program does not always work well,
is any problem in my codes ? thank you.
and, if I decrease the threads number in one block
dimBlock = dim3(20, 20, 1) ==> dimBlock = dim3(5, 5, 1)
the frequency of error is also decreasing
[/list][/code]

toepfer · June 25, 2012, 9:41pm

What is meant by the statement, “does not always work well”? Does the program fail during runtime? Does it run slowly sometimes? What version of the PGI compiler are you using to build with?

lishin · June 26, 2012, 8:25am

module kmod
use cudafor
contains
attributes(global) subroutine cuda_test(f0_dev, xDim, yDim)
implicit none
integer, value :: xDim, yDim
integer :: f0_dev(yDim, xDim)
integer :: x, y, i

x = (blockIdx%x-1)*blockDim%x + threadIdx%x
y = (blockIdx%y-1)*blockDim%y + threadIdx%y
if(x <= xDim .and. y <= yDim .and. x >= 1 .and. y >= 1) then
f0_dev(y,x) = x+y
end if
end subroutine cuda_test
end module kmod

PROGRAM cudatest
use kmod
implicit none
integer, device, dimension(:,:), allocatable :: f0_dev
integer, dimension(:,:), allocatable :: f
integer :: i, istat, x, y, xDim=10000, yDim=10000
type(dim3) :: dimGrid, dimBlock

allocate(f0_dev(yDim, xDim))
allocate(f(yDim,xDim))
dimBlock = dim3(32, 32, 1)
dimGrid = dim3(ceiling(real(xDim)/dimBlock%x), ceiling(real(yDim)/dimBlock%y), 1)
call cuda_test<<<dimGrid,dimBlock>>>(f0_dev, xDim, yDim)						
f = f0_dev
do x=1,xDim
	do y=1,yDim
		if(int(f(y,x)) /= x+y) then
			write(*,*) "error occurs", f(y,x), x+y+i, x, y
		end if
	enddo
enddo

deallocate(f0_dev)
deallocate(f)
end PROGRAM cudatest

I rewrite the program to reveal my question. This program just does one thing ==> ask gpu to do f_dev(y,x) = x+y, and return the matrix, use cpu
to compare the results if f(y,x) = x+y
if the matrix size is small, everything seems ok, but, if we increase the size of
matrix, for example 10000x10000, then some elements of the matrix is not correct, and it seems randomly happening.
my compiler is
pgfortran 11.8-0 64-bit target on x86-64 Linux -tp nehalem
thxs for help.

toepfer · June 26, 2012, 5:54pm

I have compiled and run your program a number of times, but it does not fail for me. I am running on a Tesla C2070. We do not have a GTX 580 card, so perhaps there is something specific to that card? A few things you might want to try:

*) Reduce the number of threads per block. Try 16x16
*) Check and see if you are getting ECC memory errors. This can be done using the command: nvidia-smi -q -d ECC. You probably want to run this command, save the output, and then run your program until it fails. Then run the nvidia-smi command again, and save it to a different file. Then compare the to output files and see if there are any differences.
*) Is your NVIDIA driver up-to-date? We are currently at: 295.59

When you do get an error, what kind of values do you get? Do you just get a few wrong answers or is the entire matrix incorrect?

lishin · June 27, 2012, 3:04am

*) 32x32 => 16x16 makes the errors more.

*)all items are always “N/A” even the program is fail.
Timestamp : Wed Jun 27 10:53:32 2012

Driver Version : 270.41.19

Attached GPUs : 1

GPU 0:2:0
Ecc Mode
Current : N/A
Pending : N/A
ECC Errors
Volatile
Single Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Total : N/A
Double Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Total : N/A
Aggregate
Single Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Total : N/A
Double Bit
Device Memory : N/A
Register File : N/A
L1 Cache : N/A
L2 Cache : N/A
Total : N/A

*)maybe it’s driver problem, my driver version is 270.41.19 . I will try
on update driver version now.

However, the problem is solved by exchanging the indexes of x and y
In the same code,
inside subroutine cuda_test

y = (blockIdx%x-1)*blockDim%x + threadIdx%x
x = (blockIdx%y-1)*blockDim%y + threadIdx%y

in cudatest

dimGrid = dim3(ceiling(real(yDim)/dimBlock%x), ceiling(real(xDim)/dimBlock%y), 1)

I do not why, but it works now, no errors occur.

toepfer · June 27, 2012, 11:16pm

What happens if you change your threadblock size to 16x16, and change xDim=yDim=10.
Does this fail? This should run with one single threadblock(i.e., grid dimensions of 1x1).