CUDA 3.2 RC Fortran Wrapper

Hi,

When using the Fortran wrapper provided by the SDK, one needs to declare the device pointers as integer*8 on a 64-bit Linux machine.

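The documentation for the CuBLAS 3.2 RC library states otherwise (the quoted passage is missing from this copy of the post; judging from the discussion below, it still describes the device pointer as a 32-bit integer).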

The following program demonstrates that this is not the case.

[codebox]

  ! TEST_ALLOC - probe how many bytes the CUBLAS Fortran wrapper
  ! stores into the device-pointer argument of CUBLAS_ALLOC.
  !
  ! The pointer argument is deliberately declared as two consecutive
  ! 4-byte integers that are preset to -1. Any word that no longer
  ! holds -1 after the call was overwritten by the wrapper, so the
  ! two printouts show whether 4 or 8 bytes were written.
  !
  ! Statements start in column 7 and no continuation lines are used,
  ! so the text is acceptable as fixed-form and as free-form source.
      PROGRAM TEST_ALLOC
      IMPLICIT NONE

      ! Second argument handed to CUBLAS_ALLOC (1024*1024).
      INTEGER    MIBYTE
      PARAMETER (MIBYTE=1024*1024)

      ! Status returned by the wrapper functions; 0 is treated as
      ! success by the checks below.
      INTEGER    STAT

      EXTERNAL   CUBLAS_INIT, CUBLAS_SHUTDOWN
      EXTERNAL   CUBLAS_ALLOC, CUBLAS_FREE
      INTEGER*4  CUBLAS_INIT, CUBLAS_SHUTDOWN
      INTEGER*4  CUBLAS_ALLOC, CUBLAS_FREE

      ! Two 4-byte words: room for an 8-byte pointer, while each half
      ! can still be printed on its own.
      INTEGER*4  DEVLOCA(2)

      ! Initialize CuBLAS
      STAT = CUBLAS_INIT()
      IF (STAT .NE. 0)  WRITE(*,*) 'cublas init failed'

      ! Preset both words so that an overwrite is visible afterwards
      DEVLOCA(1) = -1
      DEVLOCA(2) = -1
      WRITE(*,*) ' DevLocA(1),DevLocA(2) =',DEVLOCA(1),DEVLOCA(2)

      STAT = CUBLAS_ALLOC(1, MIBYTE, DEVLOCA)
      IF (STAT .EQ. 0) THEN
         WRITE(*,*) ' DevLocA(1),DevLocA(2) =',DEVLOCA(1),DEVLOCA(2)
         STAT = CUBLAS_FREE(DEVLOCA)
         ! Report a failed release instead of dropping the status
         IF (STAT .NE. 0)  WRITE(*,*) 'cublas free failed'
      ELSE
         WRITE(*,*) 'allocation failed'
      END IF

      ! Shutdown CuBLAS
      STAT = CUBLAS_SHUTDOWN()
      IF (STAT .NE. 0)  WRITE(*,*) 'cublas shutdown failed'

      END PROGRAM TEST_ALLOC

[/codebox]

Compiling the fortran wrapper files with

gcc -I /opt/cuda/include/ -o cublas_wrapper.o -c fortran.c

and

gfortran -fno-second-underscore -I/opt/cuda/include -L/opt/cuda/lib64 -lcublas -lcudart -L/usr/lib64 -lgfortran cublas_wrapper.o testalloc.f

to get the executable I get the following output.

heinemey@gpu-1:~/tmp/nvidia> a.out

DevLocA(1),DevLocA(2) = -1 -1

DevLocA(1),DevLocA(2) = 1048576 0

This shows that cublas_alloc expects a 64 bit integer for the device pointer.

It seems to me that the problem is that the definition of devptr_t in fortran.h changed from version 3.1 to 3.2.

CuBLAS 3.2: typedef size_t devptr_t;

CuBLAS 3.1: typedef uint devptr_t;

but the documentation was not updated to match.

Cheers,

Eric

Thanks for pointing out the documentation problem.
Here is a small Fortran example that can be compiled on both 32-bit and 64-bit operating systems (it relies on the preprocessor variable ARCH_64).

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
! The IDX2F macro turns the 1-based index pair i,j and the leading
! dimension ld into the zero-based element offset (j-1)*ld + (i-1).
!
! modify - scale two strided runs of a single-precision matrix that
! lives in device memory, using two cublas_sscal calls.
!
!   devPtrM      device address of the matrix; integer*8 when the
!                preprocessor variable ARCH_64 is nonzero, integer*4
!                otherwise
!   ldm          leading dimension used for the offset and as the
!                stride of the first call
!   n            used only for the element count of the first call
!   p, q         1-based indices of the element both runs start at
!   alpha, beta  scale factors of the first and the second call
!
! The start address is formed by plain integer arithmetic: element
! offset times sizeof_real (4 bytes) added to devPtrM.
! NOTE(review): argument order (count, factor, vector, stride) is
! assumed from the BLAS sscal convention - confirm against the
! wrapper in fortran.c.
      subroutine modify (devPtrM, ldm, n, p, q, alpha, beta)
      implicit none
      integer sizeof_real
      parameter (sizeof_real=4)
      integer ldm, n, p, q
#if ARCH_64
      integer*8 devPtrM
#else
      integer*4 devPtrM
#endif
      real*4 alpha, beta
!     n-p+1 elements with stride ldm, starting at element (p,q)
      call cublas_sscal (n-p+1, alpha,
     1                   devPtrM+IDX2F(p,q,ldm)*sizeof_real,
     2                   ldm)
!     ldm-p+1 elements with stride 1, starting at element (p,q)
      call cublas_sscal (ldm-p+1, beta,
     1                   devPtrM+IDX2F(p,q,ldm)*sizeof_real,
     2                   1)
      return
      end

  ! matrixmod - fill a 6 x 5 single-precision matrix on the host,
  ! copy it to device memory through the CUBLAS Fortran wrapper, let
  ! subroutine modify rescale part of it on the device, copy it back
  ! and print it.
  !
  ! The device pointer is integer*8 when the preprocessor variable
  ! ARCH_64 is nonzero and integer*4 otherwise, so the file has to be
  ! run through the C preprocessor before compilation.
  !
  ! Every wrapper call that returns a status is checked; on failure
  ! the device memory obtained so far is released, CUBLAS is shut
  ! down and the program stops.
      program matrixmod
      implicit none
      integer M, N, sizeof_real
#if ARCH_64
      integer*8 devPtrA
#else
      integer*4 devPtrA
#endif
      parameter (M=6, N=5, sizeof_real=4)
      real*4 a(M,N)
      integer i, j, stat
      external cublas_init, cublas_set_matrix, cublas_get_matrix
      external cublas_shutdown, cublas_alloc
      integer cublas_alloc, cublas_set_matrix, cublas_get_matrix
!     Fill the host matrix column by column
      do j = 1, N
        do i = 1, M
          a(i,j) = (i-1) * M + j
        enddo
      enddo
      call cublas_init
!     Request room for M*N elements of sizeof_real bytes each
      stat = cublas_alloc(M*N, sizeof_real, devPtrA)
      if (stat .NE. 0) then
        write(*,*) "device memory allocation failed"
        call cublas_shutdown
        stop
      endif
!     Host to device copy
      stat = cublas_set_matrix (M, N, sizeof_real, a, M, devPtrA, M)
      if (stat .NE. 0) then
        call cublas_free (devPtrA)
        write(*,*) "data download failed"
        call cublas_shutdown
        stop
      endif
      call modify (devPtrA, M, N, 2, 3, 16.0, 12.0)
!     Device to host copy
      stat = cublas_get_matrix (M, N, sizeof_real, devPtrA, M, a, M)
      if (stat .NE. 0) then
        call cublas_free (devPtrA)
        write(*,*) "data upload failed"
        call cublas_shutdown
        stop
      endif
      call cublas_free (devPtrA)
      call cublas_shutdown
!     Print one output line per column of a; the $ edit descriptor
!     (a compiler extension) suppresses the newline after each value
      do j = 1, N
        do i = 1, M
          write(*,"(F7.0$)") a(i,j)
        enddo
        write (*,*) ""
      enddo
      stop
      end

Thanks for pointing out the documentation problem.
Here is a small Fortran example that can be compiled on both 32-bit and 64-bit operating systems (it relies on the preprocessor variable ARCH_64).

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
! The IDX2F macro turns the 1-based index pair i,j and the leading
! dimension ld into the zero-based element offset (j-1)*ld + (i-1).
!
! modify - scale two strided runs of a single-precision matrix that
! lives in device memory, using two cublas_sscal calls.
!
!   devPtrM      device address of the matrix; integer*8 when the
!                preprocessor variable ARCH_64 is nonzero, integer*4
!                otherwise
!   ldm          leading dimension used for the offset and as the
!                stride of the first call
!   n            used only for the element count of the first call
!   p, q         1-based indices of the element both runs start at
!   alpha, beta  scale factors of the first and the second call
!
! The start address is formed by plain integer arithmetic: element
! offset times sizeof_real (4 bytes) added to devPtrM.
! NOTE(review): argument order (count, factor, vector, stride) is
! assumed from the BLAS sscal convention - confirm against the
! wrapper in fortran.c.
      subroutine modify (devPtrM, ldm, n, p, q, alpha, beta)
      implicit none
      integer sizeof_real
      parameter (sizeof_real=4)
      integer ldm, n, p, q
#if ARCH_64
      integer*8 devPtrM
#else
      integer*4 devPtrM
#endif
      real*4 alpha, beta
!     n-p+1 elements with stride ldm, starting at element (p,q)
      call cublas_sscal (n-p+1, alpha,
     1                   devPtrM+IDX2F(p,q,ldm)*sizeof_real,
     2                   ldm)
!     ldm-p+1 elements with stride 1, starting at element (p,q)
      call cublas_sscal (ldm-p+1, beta,
     1                   devPtrM+IDX2F(p,q,ldm)*sizeof_real,
     2                   1)
      return
      end

  ! matrixmod - fill a 6 x 5 single-precision matrix on the host,
  ! copy it to device memory through the CUBLAS Fortran wrapper, let
  ! subroutine modify rescale part of it on the device, copy it back
  ! and print it.
  !
  ! The device pointer is integer*8 when the preprocessor variable
  ! ARCH_64 is nonzero and integer*4 otherwise, so the file has to be
  ! run through the C preprocessor before compilation.
  !
  ! Every wrapper call that returns a status is checked; on failure
  ! the device memory obtained so far is released, CUBLAS is shut
  ! down and the program stops.
      program matrixmod
      implicit none
      integer M, N, sizeof_real
#if ARCH_64
      integer*8 devPtrA
#else
      integer*4 devPtrA
#endif
      parameter (M=6, N=5, sizeof_real=4)
      real*4 a(M,N)
      integer i, j, stat
      external cublas_init, cublas_set_matrix, cublas_get_matrix
      external cublas_shutdown, cublas_alloc
      integer cublas_alloc, cublas_set_matrix, cublas_get_matrix
!     Fill the host matrix column by column
      do j = 1, N
        do i = 1, M
          a(i,j) = (i-1) * M + j
        enddo
      enddo
      call cublas_init
!     Request room for M*N elements of sizeof_real bytes each
      stat = cublas_alloc(M*N, sizeof_real, devPtrA)
      if (stat .NE. 0) then
        write(*,*) "device memory allocation failed"
        call cublas_shutdown
        stop
      endif
!     Host to device copy
      stat = cublas_set_matrix (M, N, sizeof_real, a, M, devPtrA, M)
      if (stat .NE. 0) then
        call cublas_free (devPtrA)
        write(*,*) "data download failed"
        call cublas_shutdown
        stop
      endif
      call modify (devPtrA, M, N, 2, 3, 16.0, 12.0)
!     Device to host copy
      stat = cublas_get_matrix (M, N, sizeof_real, devPtrA, M, a, M)
      if (stat .NE. 0) then
        call cublas_free (devPtrA)
        write(*,*) "data upload failed"
        call cublas_shutdown
        stop
      endif
      call cublas_free (devPtrA)
      call cublas_shutdown
!     Print one output line per column of a; the $ edit descriptor
!     (a compiler extension) suppresses the newline after each value
      do j = 1, N
        do i = 1, M
          write(*,"(F7.0$)") a(i,j)
        enddo
        write (*,*) ""
      enddo
      stop
      end