texture variable as argument in CUDA Fortran?

Is it possible to use texture variable as argument in CUDA Fortran?
I would like to have the same kernel subroutine called with different texture variables.

-Andras

Hi Andras,

Is it possible to use texture variable as argument in CUDA Fortran?

Variables with the texture attribute must be a F90 pointer declared in a module.

See: https://www.pgroup.com/resources/docs/19.5/x86/cuda-fortran-prog-guide/index.htm#cfref-var-attr-texture-data
and
https://www.pgroup.com/resources/docs/19.5/x86/cuda-fortran-prog-guide/index.htm#cfpg-fort-mods-decl-textures

I would like to have the same kernel subroutine called with different texture variables.

You should be able to accomplish this by assigning the texture pointer to different device target arrays before the call to the kernel.

For example, I’ve added to the example shown in the CUDA Fortran manual to have the texture point to different device arrays before each call:

% cat test.cuf
module memtests
  real(8), texture, pointer :: t(:)  ! declare the texture
  contains
    attributes(device) integer function bitrev8(i)
    integer ix1, ix2, ix
    ix = i
    ix1 = ishft(iand(ix,z'0aa'),-1)
    ix2 = ishft(iand(ix,z'055'), 1)
    ix = ior(ix1,ix2)
    ix1 = ishft(iand(ix,z'0cc'),-2)
    ix2 = ishft(iand(ix,z'033'), 2)
    ix = ior(ix1,ix2)
    ix1 = ishft(ix,-4)
    ix2 = ishft(ix, 4)
    bitrev8 = iand(ior(ix1,ix2),z'0ff')
    end function bitrev8

    attributes(global) subroutine without( a, b )
    real(8), device :: a(*), b(*)
    i = blockDim%x*(blockIdx%x-1) + threadIdx%x
    j = bitrev8(threadIdx%x-1) + 1
    b(i) = a(j)
    return
    end subroutine

    attributes(global) subroutine withtex( a, b )
    real(8), device :: a(*), b(*)
    i = blockDim%x*(blockIdx%x-1) + threadIdx%x
    j = bitrev8(threadIdx%x-1) + 1
    b(i) = t(j)  ! This subroutine accesses a through the texture
    return
    end subroutine
end module memtests

program t
use cudafor
use memtests
real(8), device, target, allocatable :: da(:), dc(:)
real(8), device, allocatable :: db(:)
integer, parameter :: n = 1024*1024
integer, parameter :: nthreads = 256
integer, parameter :: ntimes = 1000
type(cudaEvent) :: start, stop
real(8) b(n)

allocate(da(nthreads))
allocate(dc(nthreads))
allocate(db(n))

istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)

db = 100.0d0
da = (/ (dble(i),i=1,nthreads) /)
dc = (/ (dble(i),i=1,nthreads) /)

call without<<<n/nthreads, nthreads>>> (da, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
  call without<<<n/nthreads, nthreads>>> (da, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time1, start, stop)
time1 = time1 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)

db = 100.0d0
t => da  ! assign the texture to da using f90 pointer assignment

istat = cudaEventRecord(start,0)
call withtex<<<n/nthreads, nthreads>>> (da, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
  call withtex<<<n/nthreads, nthreads>>> (da, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time2, start, stop)
time2 = time2 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)

t => dc  ! assign the texture to da using f90 pointer assignment
istat = cudaEventRecord(start,0)
call withtex<<<n/nthreads, nthreads>>> (dc, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
  call withtex<<<n/nthreads, nthreads>>> (dc, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time3, start, stop)
time3 = time3 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)

print *,"Times with    textures",time2,time3
print *,"Time without textures",time1
print *,"Speedup with textures",time1 / time2

deallocate(da)
deallocate(db)
deallocate(dc)
end
% pgfortran test.cuf
% a.out
  T
  T
  T
 Times with    textures   1.5546017E-05   1.5541344E-05
 Time without textures   1.6113119E-05
 Speedup with textures    1.036479

Note that my speed-up is relatively small in this example. This because I’m using a V100 where texture memory is no longer separate and instead uses L2 cache.


Hope this helps,
Mat

Great! Thanks Mat!

I was afraid to do this because I was expecting some slow down due to the constant reassignments of the texture pointers, but I see no such thing when compared to writing separate kernel codes for each call.

-Andras