I have been trying to achieve faster data transfer between the CPU and GPU by using pinned memory. I can run the FORTRAN thunking cuBLAS pinned memory example from http://www.nvidia.com/object/cuda_programming_tools.html but when I tried to implement it in my own program it fails. The program fails when I set the initial values one of the variables but even if I skip this the program will crash when I try to retrieve one of the variables from GPU memory. I am using the gnu compilers version 4.3.2 on 64bit Ubuntu 8.10. I have also tried the latest 4.4.1 gnu compilers and got the same error. This is the portion of the code that fails. I have also included the full code as an attachment, the fortran.c file is a symbolic link so you may need to change its location to get it to work.
Thanks in advance,
David
[codebox]program main
use iso_c_binding
implicit none
include ‘cuda_stuff.inc’
!define the floating point kind to be single precision
integer, parameter :: fp_kind = kind(1.d0) !double = kind(1.d0) single = kind(1.0)
integer :: N
real(fp_kind), dimension(:,:), pointer :: II,CC,AA
real(fp_kind), dimension(:), pointer :: AV,BV
integer :: I_ptr,CC_ptr,AV_ptr,BV_ptr,AA_ptr
type(C_PTR) :: cptr_II,cptr_CC,cptr_AV,cptr_BV,cptr_AA
integer :: res
! Define matrices as pointer.
!real (fp_kind), dimension(:,:), pointer :: A, B, C
real(fp_kind) :: alpha, beta
real :: tic,toc
real :: time_send, time_comp,time_retrieve
integer :: i,j
integer :: sizeof_real = 8
integer :: icuBLAS_error
integer :: cublas_alloc
integer :: cublas_set_matrix
integer :: cublas_get_matrix
integer :: cublas_set_vector
integer :: cublas_get_vector
real(fp_kind) :: cublas_Dasum
real(fp_kind) :: cublas_sasum
external cublas_alloc
external cublas_set_matrix
external cublas_get_matrix
external CUBLAS_SET_VECTOR
external CUBLAS_GET_VECTOR
external cublas_Dasum
external cublas_Sasum
write(*,*) "enter Matrix size"
read(*,"(i4)") N
! allocate(AA(N,N))
res = cudaMallocHost ( cptr_AA, n*n*sizeof(fp_kind) )
call c_f_pointer ( cptr_AA, AA, (/ n, n /) )
if(res /=0) STOP
! allocate(II(N,N))
res = cudaMallocHost ( cptr_II, n*n*sizeof(fp_kind) )
call c_f_pointer ( cptr_II, II, (/ n, n /) )
if(res /=0) STOP
! allocate(CC(N,N))
res = cudaMallocHost ( cptr_CC, n*n*sizeof(fp_kind) )
call c_f_pointer ( cptr_CC, CC, (/ n, n /) )
if(res /=0) STOP
! allocate(AV(N))
res = cudaMallocHost ( cptr_AV, n*sizeof(fp_kind) )
call c_f_pointer ( cptr_AV, AV, (/ n/) )
if(res /=0) STOP
! allocate(BV(N))
res = cudaMallocHost ( cptr_BV, n*sizeof(fp_kind) )
call c_f_pointer ( cptr_BV, BV, (/ n /) )
alpha = 1.0
beta = 0.0
sizeof_real = fp_kind
call cublas_init
write(*,*) 'Check 1'
do i = 1,N
!write(*,*) i
do j = 1,N
II(i,j) = 0._fp_kind
!write(*,*) i,j,i*j
!AA(i,j) = 1._fp_kind !This causes a seg fault
end do
AV(i) = real(i)
II(i,i) = 1._fp_kind
end do
BV=1._fp_kind
CC = 0._fp_kind
write(*,*) 'Check 1.5'
AA=1._fp_kind !This causes a seg fault
[/codebox]
Pinned_test.zip (6.05 KB)