GPU does not work, why?

I put the data onto the GPU and ran it, but the result is wrong. Why?

Hi SWL_EGGBABY,

There could be a number of possibilities. Can you please provide more details including sample code (if possible), compiler version, and OS? Also, please post your response to this topic. Posting new topics to each response makes it difficult for other users to follow the thread.

Thanks,
Mat

Hi Mat,

The version is PGI Visual Fortran 2008 x64 10.2
The OS is windows xp 64-bit
The code is:

module mmul_mod
  use cudafor
  implicit none

  ! Edge length of the square thread block and of the shared-memory tiles.
  integer, parameter, private :: tile = 16

contains

  ! Tiled matrix product C = A * B, one thread per element of C.
  !
  !   A : N x M input matrix
  !   B : M x L input matrix
  !   C : N x L result matrix
  !   N, M, L : matrix extents, passed by value
  !
  ! Must be launched with tile x tile thread blocks. The grid may overshoot
  ! the matrix edges: out-of-range threads load zeros into the tiles and
  ! skip the final store, so the extents need not be multiples of `tile`.
  attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
    ! Extents are declared first because the array bounds below use them.
    integer, value :: N, M, L
    real :: A(N,M), B(M,L), C(N,L)
    integer :: i, j, kb, k, tx, ty
    real, shared :: Asub(tile,tile), Bsub(tile,tile)
    real :: Cij

    tx = threadidx%x
    ty = threadidx%y
    ! Global row (i) and column (j) of the element this thread computes.
    i = (blockidx%x-1) * tile + tx
    j = (blockidx%y-1) * tile + ty
    Cij = 0.0

    ! Walk along the shared dimension M one tile at a time.
    do kb = 1, M, tile
      ! Each thread stages one element of A and one of B. Elements that
      ! fall outside the matrices are replaced by zero so they add nothing
      ! to the dot product.
      if (i <= N .and. kb+ty-1 <= M) then
        Asub(tx,ty) = A(i,kb+ty-1)
      else
        Asub(tx,ty) = 0.0
      end if
      if (kb+tx-1 <= M .and. j <= L) then
        Bsub(tx,ty) = B(kb+tx-1,j)
      else
        Bsub(tx,ty) = 0.0
      end if
      ! Every thread reaches both barriers, including out-of-range ones:
      ! the first waits for the tiles to be fully loaded, the second keeps
      ! the next iteration from overwriting tiles that are still being read.
      call syncthreads()
      do k = 1, tile
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      end do
      call syncthreads()
    end do

    if (i <= N .and. j <= L) C(i,j) = Cij
  end subroutine mmul_kernel


  ! Host wrapper: copies A and B to the device, runs mmul_kernel and copies
  ! the product back into C(1:N,1:L), where N = size(A,1), M = size(A,2)
  ! and L = size(B,2). B is expected to have at least M rows.
  subroutine mmul( A, B, C )
    real, dimension(:,:), intent(in)    :: A, B
    real, dimension(:,:), intent(inout) :: C
    real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    type(dim3) :: dimGrid, dimBlock
    integer :: N, M, L

    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )

    ! Degenerate shapes: nothing to compute, or an empty sum (all zeros).
    ! Handled here so that no zero-sized grid is ever launched.
    if (N < 1 .or. L < 1) return
    if (M < 1) then
      C(1:N,1:L) = 0.0
      return
    end if

    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    Adev = A(1:N,1:M)
    Bdev(:,:) = B(1:M,1:L)

    ! The grid covers C: N rows along x and L columns along y. Rounding up
    ! ensures that partial tiles at the edges still get a block.
    dimGrid = dim3( (N + tile - 1)/tile, (L + tile - 1)/tile, 1 )
    dimBlock = dim3( tile, tile, 1 )
    call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, &
                                            N, M, L )
    C(1:N,1:L) = Cdev
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod

! Test driver: multiplies two zero-filled matrices through mmul and prints
! the N x L result one row per line.
program swl_eggbaby
  use mmul_mod
  implicit none
  integer, parameter :: N=16, M=16, L=4
  integer :: i
  real :: A(N,M)=0
  real :: B(M,L)=0
  real :: C(N,L)

  call mmul(A,B,C)
  write(*,*) ""
  ! Print row i on each pass, not the last row N every time.
  do i=1,N
    write(*,*) C(i,:)
  end do
end program swl_eggbaby

run it

-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
-7.2750384E+34 NaN -4.8118824E-35 -1.7518910E-15
. . .
thank you

Device Number: 0
Device Name: Quadro FX 5800
Total Global Memory: 0.000 Gbytes
sharedMemPerBlock: 16384 bytes
regsPerBlock: 16384
warpSize: 32
maxThreadsPerBlock: 512
maxThreadsDim: 512 x 512 x 64
maxGridSize: 65535 x 65535 x 1
ClockRate: 1.296 GHz
Total Const Memory: 65536 bytes
Compute Capability Revision: 1.3
TextureAlignment: 256 bytes
deviceOverlap: T
multiProcessorCount: 30
integrated: T
canMapHostMemory: T
Device Number: 1
Device Name: Tesla C1060
Total Global Memory: 0.000 Gbytes
sharedMemPerBlock: 16384 bytes
regsPerBlock: 16384
warpSize: 32
maxThreadsPerBlock: 512
maxThreadsDim: 512 x 512 x 64
maxGridSize: 65535 x 65535 x 1
ClockRate: 1.296 GHz
Total Const Memory: 65536 bytes
Compute Capability Revision: 1.3
TextureAlignment: 256 bytes
deviceOverlap: T
multiProcessorCount: 30
integrated: T
canMapHostMemory: T

Hi SWL_EGGBABY,

Your code looks fine so I suspect that there is some issue with your cards. Try adding error checking to give us some ideas:

module mmul_mod
  use cudafor
  implicit none

  ! Edge length of the square thread block and of the shared-memory tiles.
  integer, parameter, private :: tile = 16

contains

  ! Tiled matrix product C = A * B, one thread per element of C.
  !
  !   A : N x M input matrix
  !   B : M x L input matrix
  !   C : N x L result matrix
  !   N, M, L : matrix extents, passed by value
  !
  ! Must be launched with tile x tile thread blocks. The grid may overshoot
  ! the matrix edges: out-of-range threads load zeros into the tiles and
  ! skip the final store, so the extents need not be multiples of `tile`.
  attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
    ! Extents are declared first because the array bounds below use them.
    integer, value :: N, M, L
    real :: A(N,M), B(M,L), C(N,L)
    integer :: i, j, kb, k, tx, ty
    real, shared :: Asub(tile,tile), Bsub(tile,tile)
    real :: Cij

    tx = threadidx%x
    ty = threadidx%y
    ! Global row (i) and column (j) of the element this thread computes.
    i = (blockidx%x-1) * tile + tx
    j = (blockidx%y-1) * tile + ty
    Cij = 0.0

    ! Walk along the shared dimension M one tile at a time.
    do kb = 1, M, tile
      ! Each thread stages one element of A and one of B. Elements that
      ! fall outside the matrices are replaced by zero so they add nothing
      ! to the dot product.
      if (i <= N .and. kb+ty-1 <= M) then
        Asub(tx,ty) = A(i,kb+ty-1)
      else
        Asub(tx,ty) = 0.0
      end if
      if (kb+tx-1 <= M .and. j <= L) then
        Bsub(tx,ty) = B(kb+tx-1,j)
      else
        Bsub(tx,ty) = 0.0
      end if
      ! Every thread reaches both barriers, including out-of-range ones:
      ! the first waits for the tiles to be fully loaded, the second keeps
      ! the next iteration from overwriting tiles that are still being read.
      call syncthreads()
      do k = 1, tile
        Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      end do
      call syncthreads()
    end do

    if (i <= N .and. j <= L) C(i,j) = Cij
  end subroutine mmul_kernel


  ! Host wrapper: copies A and B to the device, runs mmul_kernel and copies
  ! the product back into C(1:N,1:L), where N = size(A,1), M = size(A,2)
  ! and L = size(B,2). B is expected to have at least M rows.
  ! After the launch and again after the copy back, the text for the code
  ! returned by cudaGetLastError is printed.
  subroutine mmul( A, B, C )
    real, dimension(:,:), intent(in)    :: A, B
    real, dimension(:,:), intent(inout) :: C
    real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
    type(dim3) :: dimGrid, dimBlock
    integer :: errCode
    integer :: N, M, L

    N = size( A, 1 )
    M = size( A, 2 )
    L = size( B, 2 )

    ! Degenerate shapes: nothing to compute, or an empty sum (all zeros).
    ! Handled here so that no zero-sized grid is ever launched.
    if (N < 1 .or. L < 1) return
    if (M < 1) then
      C(1:N,1:L) = 0.0
      return
    end if

    allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
    Adev = A(1:N,1:M)
    Bdev(:,:) = B(1:M,1:L)

    ! The grid covers C: N rows along x and L columns along y. Rounding up
    ! ensures that partial tiles at the edges still get a block.
    dimGrid = dim3( (N + tile - 1)/tile, (L + tile - 1)/tile, 1 )
    dimBlock = dim3( tile, tile, 1 )
    call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, &
                                            N, M, L )
    ! Report the status recorded for the kernel launch.
    errCode = cudaGetLastError()
    print *, cudaGetErrorString(errCode)
    C(1:N,1:L) = Cdev
    ! Report the status again after the device-to-host copy.
    errCode = cudaGetLastError()
    print *, cudaGetErrorString(errCode)
    deallocate( Adev, Bdev, Cdev )
  end subroutine mmul
end module mmul_mod

! Test driver: multiplies two zero-filled matrices through mmul and prints
! the N x L result one row per line.
program swl_eggbaby
  use mmul_mod
  implicit none
  integer, parameter :: N=16, M=16, L=4
  integer :: i
  real :: A(N,M)=0
  real :: B(M,L)=0
  real :: C(N,L)

  call mmul(A,B,C)
  write(*,*) "Results:"
  ! Print row i on each pass, not the last row N every time.
  do i=1,N
    write(*,*) C(i,:)
  end do
end program swl_eggbaby
  • Mat

Hi Mat,

The running results:

invalid device function

no error

Results:
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15
-1.1893121E+30 NaN -4.8118640E-35 -1.7518910E-15

What is the “invalid device function” error? I do not understand it. Please help me, thank you!

Hi SWL_EGGBABY,

Try using the Tesla card instead of the FX5800, i.e. add “istat = cudaSetDevice(1)” to your main program. Also, you’ll need to declare “istat” and add “use cudafor”.

Hope this helps,
Mat

Hi Mat,

I can’t use either of my two cards; they both report “invalid device function”.

I used the sample sgemm.cuf to test them:

\nDevice:Quadro FX 5800, 1296.0 MHz clock, 4095.7 MB memory.\n
invalid device function
65536 errors were encountered
256x256 * 256x256:\t 0.007 ms\t4644.484 GFlops/s
. . .

\nDevice:Tesla C1060, 1296.0 MHz clock, 4095.8 MB memory.\n
invalid device function
38560 errors were encountered
256x256 * 256x256:\t 0.008 ms\t4309.860 GFlops/s
. . .

Oh, my two bad cards!

Hi SWL_EGGBABY,

Oh, my two bad cards!

I doubt your cards are bad, rather it’s more likely a configuration issue, like a driver problem. Exactly what, unfortunately I don’t know.

My next step would be to call the “cudaGetDevice” and “cudaGetDeviceProperties” to see what is returned. Also, what NVIDIA drivers are you using? Are they CUDA 2.3 capable (version >= 190)?

  • Mat

Hi Mat,

Thank you.
It is a driver problem. I downloaded and installed the new driver, and everything is fine now.
Thank you again for helping me so much.