Thanks, Mat, for your prompt reply. I have two questions to ask you:
- How can I run accelinfo? Do I run a special command in cmd?
- How can I check that the memory bandwidth is at its optimal value? I am new to this technology (NVIDIA and CUDA).
I don’t have a Tesla; I wish our budget could afford one. Instead, I thought I should start with something simple like an NVIDIA GeForce 460 graphics card, installed in a Dell XPS 8300 (8 i7 cores), and I hope to run a test to see what my GPU is capable of.
Also, regarding your question about the time needed to copy: I have developed the code below. Please look at it; it might make it easier for you to understand what I am trying to do. I wrote it myself with my limited knowledge of CUDA Fortran, since there are no books available other than some on CUDA C.
I used a subroutine (CALL DATE_AND_TIME) to measure the run time, to see how many milliseconds (1/1000 of a second) the GPU kernel needs to do the multiplication. It took 1 msec; the same task done by the CPU took 80 msec, so that’s a big achievement for me. But when I copy from GPU to CPU memory, I get a big shock: it took 227 msec to copy! That is about 3 times longer than the CPU doing the multiplication. However, I did not use the pinned attribute; this will be my next task.
here is the code:
module variables
  ! Problem dimensions for the matrix product C(N,L) = A(N,M) * B(M,L).
  implicit none

  ! Declared as named constants so they are valid as array bounds in
  ! declarations (explicit-shape arrays in a main program require
  ! constant bounds) and cannot be modified by accident.
  integer, parameter :: N = 500   ! rows of A and C
  integer, parameter :: M = 400   ! columns of A, rows of B
  integer, parameter :: L = 500   ! columns of B and C

  ! Not referenced by the code in this file; retained so that any other
  ! user of this module keeps compiling. Be aware that importing it hides
  ! the intrinsic function SUM in the using scope.
  real :: sum = 0.0
end module variables
module mmul_mod
  ! Device code for the dense matrix product C = A * B.
  use cudafor
  use variables
  implicit none
contains

  ! Kernel: every thread computes a single element C(row,col) as the dot
  ! product of row `row` of A with column `col` of B.
  !
  ! Arguments:
  !   A(N,M), B(M,L)  input matrices
  !   C(N,L)          result matrix
  !   N, M, L         matrix extents, passed by value
  !
  ! The element handled by a thread is derived from the block index, the
  ! block extent and the thread index. When launched with one thread per
  ! block this reduces to row = blockidx%x, col = blockidx%y; larger
  ! thread blocks are also supported.
  attributes(global) subroutine mmul_kernel(A, B, C, N, M, L)
    integer, value    :: N, M, L
    real, intent(in)  :: A(N,M), B(M,L)
    real, intent(out) :: C(N,L)
    real    :: acc
    integer :: row, col, k

    row = (blockidx%x - 1) * blockdim%x + threadidx%x
    col = (blockidx%y - 1) * blockdim%y + threadidx%y

    ! Guard: the grid may overhang the matrix when N or L is not a
    ! multiple of the thread-block extent.
    if (row <= N .and. col <= L) then
      acc = 0.0
      do k = 1, M
        acc = acc + A(row,k) * B(k,col)
      end do
      C(row,col) = acc
    end if
  end subroutine mmul_kernel

end module mmul_mod
!=============================
program mat_mult
!=============================
  ! Host driver: multiplies two random matrices on the GPU with
  ! mmul_kernel and reports the wall-clock time of each phase separately
  ! (host-to-device copy, kernel execution, device-to-host copy).
  use, intrinsic :: iso_fortran_env, only: int64
  use cudafor
  use variables
  use mmul_mod
  implicit none

  ! Host arrays are allocatable so the (large) matrices live on the heap
  ! and their extents need not be compile-time constants.
  real, allocatable         :: A(:,:), B(:,:), C(:,:)
  real, device, allocatable :: Adev(:,:), Bdev(:,:), Cdev(:,:)
  type(dim3)     :: blocks, threads
  integer(int64) :: t_start, t_h2d, t_kernel, t_d2h, ticks_per_sec
  integer        :: istat

  allocate (A(N,M), B(M,L), C(N,L))
  allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))

  ! Give the inputs defined values before they are copied to the device.
  call random_number(A)
  call random_number(B)

  call system_clock(count_rate=ticks_per_sec)

  ! --- host -> device copy ---
  call system_clock(t_start)
  Adev = A
  Bdev = B
  call system_clock(t_h2d)

  ! --- kernel ---
  ! One block per element of C, one thread per block. A larger thread
  ! block would use the GPU far better, but is only correct if the kernel
  ! derives its indices from threadidx as well as blockidx.
  blocks  = dim3(N, L, 1)
  threads = dim3(1, 1, 1)
  call mmul_kernel<<<blocks, threads>>>(Adev, Bdev, Cdev, N, M, L)

  ! A kernel launch returns immediately. Without this synchronization the
  ! timer would measure only the launch, and the wait for the kernel would
  ! be charged to the device -> host copy that follows.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    print '(2a)', 'mmul_kernel failed: ', trim(cudaGetErrorString(istat))
    error stop 1
  end if
  call system_clock(t_kernel)

  ! --- device -> host copy ---
  C(1:N,1:L) = Cdev
  call system_clock(t_d2h)

  print '(a, f10.3, a)', 'host -> device copy: ', elapsed_ms(t_start,  t_h2d),    ' ms'
  print '(a, f10.3, a)', 'kernel execution:    ', elapsed_ms(t_h2d,    t_kernel), ' ms'
  print '(a, f10.3, a)', 'device -> host copy: ', elapsed_ms(t_kernel, t_d2h),    ' ms'

  ! Sanity check of the GPU result against the intrinsic host product.
  print '(a, es12.5)', 'max |C - matmul(A,B)| = ', maxval(abs(C - matmul(A, B)))

  deallocate (Adev, Bdev, Cdev)
  deallocate (A, B, C)

contains

  ! Convert a pair of system_clock readings into elapsed milliseconds.
  function elapsed_ms(t_begin, t_end) result(ms)
    integer(int64), intent(in) :: t_begin, t_end
    real :: ms
    ms = 1000.0 * real(t_end - t_begin) / real(ticks_per_sec)
  end function elapsed_ms

end program mat_mult