Is it possible to use texture variable as argument in CUDA Fortran?
I would like to have the same kernel subroutine called with different texture variables.
-Andras
Is it possible to use texture variable as argument in CUDA Fortran?
I would like to have the same kernel subroutine called with different texture variables.
-Andras
Hi Andras,
Is it possible to use texture variable as argument in CUDA Fortran?
Variables with the texture attribute must be a F90 pointer declared in a module.
See: CUDA Fortran Programming Guide Version 19.5 for x86 and NVIDIA Processors
and
CUDA Fortran Programming Guide Version 19.5 for x86 and NVIDIA Processors
I would like to have the same kernel subroutine called with different texture variables.
You should be able to accomplish this by assigning the texture pointer to different device target arrays before the call to the kernel.
For example, I’ve added to the example shown in the CUDA Fortran manual to have the texture point to different device arrays before each call:
% cat test.cuf
module memtests
real(8), texture, pointer :: t(:) ! declare the texture
contains
attributes(device) integer function bitrev8(i)
integer ix1, ix2, ix
ix = i
ix1 = ishft(iand(ix,z'0aa'),-1)
ix2 = ishft(iand(ix,z'055'), 1)
ix = ior(ix1,ix2)
ix1 = ishft(iand(ix,z'0cc'),-2)
ix2 = ishft(iand(ix,z'033'), 2)
ix = ior(ix1,ix2)
ix1 = ishft(ix,-4)
ix2 = ishft(ix, 4)
bitrev8 = iand(ior(ix1,ix2),z'0ff')
end function bitrev8
attributes(global) subroutine without( a, b )
real(8), device :: a(*), b(*)
i = blockDim%x*(blockIdx%x-1) + threadIdx%x
j = bitrev8(threadIdx%x-1) + 1
b(i) = a(j)
return
end subroutine
attributes(global) subroutine withtex( a, b )
real(8), device :: a(*), b(*)
i = blockDim%x*(blockIdx%x-1) + threadIdx%x
j = bitrev8(threadIdx%x-1) + 1
b(i) = t(j) ! This subroutine accesses a through the texture
return
end subroutine
end module memtests
program t
use cudafor
use memtests
real(8), device, target, allocatable :: da(:), dc(:)
real(8), device, allocatable :: db(:)
integer, parameter :: n = 1024*1024
integer, parameter :: nthreads = 256
integer, parameter :: ntimes = 1000
type(cudaEvent) :: start, stop
real(8) b(n)
allocate(da(nthreads))
allocate(dc(nthreads))
allocate(db(n))
istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)
db = 100.0d0
da = (/ (dble(i),i=1,nthreads) /)
dc = (/ (dble(i),i=1,nthreads) /)
call without<<<n/nthreads, nthreads>>> (da, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
call without<<<n/nthreads, nthreads>>> (da, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time1, start, stop)
time1 = time1 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)
db = 100.0d0
t => da ! assign the texture to da using f90 pointer assignment
istat = cudaEventRecord(start,0)
call withtex<<<n/nthreads, nthreads>>> (da, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
call withtex<<<n/nthreads, nthreads>>> (da, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time2, start, stop)
time2 = time2 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)
t => dc ! assign the texture to da using f90 pointer assignment
istat = cudaEventRecord(start,0)
call withtex<<<n/nthreads, nthreads>>> (dc, db)
istat = cudaEventRecord(start,0)
do j = 1, ntimes
call withtex<<<n/nthreads, nthreads>>> (dc, db)
end do
istat = cudaEventRecord(stop,0)
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time3, start, stop)
time3 = time3 / (ntimes*1.0e3)
b = db
print *,sum(b)==(n*(nthreads+1)/2)
print *,"Times with textures",time2,time3
print *,"Time without textures",time1
print *,"Speedup with textures",time1 / time2
deallocate(da)
deallocate(db)
deallocate(dc)
end
% pgfortran test.cuf
% a.out
T
T
T
Times with textures 1.5546017E-05 1.5541344E-05
Time without textures 1.6113119E-05
Speedup with textures 1.036479
Note that my speed-up is relatively small in this example. This because I’m using a V100 where texture memory is no longer separate and instead uses L2 cache.
Hope this helps,
Mat
Great! Thanks Mat!
I was afraid to do this because I was expecting some slow down due to the constant reassignments of the texture pointers, but I see no such thing when compared to writing separate kernel codes for each call.
-Andras