Hello.
I’m reading the section on texture use in the CUDA Fortran Reference Guide (2016),
and I have run into an example program that I find tricky:
module memtests
  ! Kernels for comparing plain global-memory reads with reads through a
  ! texture pointer.  Both kernels copy elements of a small source array into
  ! a large destination array using the same bit-reversed index permutation,
  ! so the only difference between them is how the source is read.
  implicit none

  real(8), texture, pointer :: t(:) ! declare the texture

contains

  ! Return the low 8 bits of i in reversed order (bit 0 <-> bit 7,
  ! bit 1 <-> bit 6, ...).  Bits of i above bit 7 are ignored, so the result
  ! is always in 0..255.
  attributes(device) integer function bitrev8(i)
    integer, intent(in) :: i
    integer :: ix1, ix2, ix

    ix = i
    ! Swap adjacent bits: odd-position bits move down, even-position bits up.
    ix1 = ishft(iand(ix,z'0aa'),-1)
    ix2 = ishft(iand(ix,z'055'), 1)
    ix = ior(ix1,ix2)
    ! Swap adjacent 2-bit pairs.
    ix1 = ishft(iand(ix,z'0cc'),-2)
    ix2 = ishft(iand(ix,z'033'), 2)
    ix = ior(ix1,ix2)
    ! Swap the two 4-bit halves; the final mask discards what was shifted
    ! above bit 7.
    ix1 = ishft(ix,-4)
    ix2 = ishft(ix, 4)
    bitrev8 = iand(ior(ix1,ix2),z'0ff')
  end function bitrev8

  ! Each thread writes b(i) = a(j), where i is the thread's global 1-based
  ! index and j is the bit-reversed index of the thread within its block.
  !
  ! j lies in 1..256, so a must hold at least 256 elements; the mapping is a
  ! permutation of the block's threads only when blockDim%x is 256.
  ! Consecutive threads read elements of a that are far apart
  ! (NOTE(review): presumably chosen to give a scattered read pattern for the
  ! benchmark -- confirm against the reference guide).
  ! There is no bounds check: the launch must not create more threads than b
  ! has elements.
  attributes(global) subroutine without( a, b )
    real(8), device, intent(in)    :: a(*)
    ! inout rather than out: only element i is written by each thread,
    ! everything else in b is left as the caller set it.
    real(8), device, intent(inout) :: b(*)
    integer :: i, j

    i = blockDim%x*(blockIdx%x-1) + threadIdx%x
    j = bitrev8(threadIdx%x-1) + 1
    b(i) = a(j)
    return
  end subroutine without

  ! Same as "without", but the source is read through the module texture
  ! pointer t instead of the dummy argument a.
  !
  ! The host must have associated t with the source array (t => array)
  ! before launching this kernel.  The argument a is not referenced; it is
  ! kept so that both kernels are launched with the same argument list.
  attributes(global) subroutine withtex( a, b )
    real(8), device, intent(in)    :: a(*)
    real(8), device, intent(inout) :: b(*)
    integer :: i, j

    i = blockDim%x*(blockIdx%x-1) + threadIdx%x
    j = bitrev8(threadIdx%x-1) + 1
    b(i) = t(j) ! This subroutine accesses a through the texture
    return
  end subroutine withtex

end module memtests
program texture_test
  ! Times the "without" and "withtex" kernels from module memtests and prints
  ! the per-launch time of each together with the resulting speedup.
  !
  ! The program unit is deliberately not named "t": that name is already
  ! taken in this scope by the texture pointer imported from memtests, and a
  ! program name may not clash with a local identifier.
  use cudafor
  use memtests, only: t, without, withtex
  implicit none

  integer, parameter :: n = 1024*1024      ! elements in the destination array
  integer, parameter :: nthreads = 256     ! threads per block; also size of da
  integer, parameter :: ntimes = 1000      ! timed launches per kernel

  real(8), device, target, allocatable :: da(:)
  real(8), device, allocatable :: db(:)
  type(cudaEvent) :: start, stop
  real(8) :: b(n)
  real(8) :: expected
  real    :: time1, time2
  integer :: i, j, istat

  allocate(da(nthreads))
  allocate(db(n))
  call check(cudaEventCreate(start), "cudaEventCreate(start)")
  call check(cudaEventCreate(stop),  "cudaEventCreate(stop)")

  ! Every block copies a permutation of da(1:nthreads) = 1..nthreads, so the
  ! sum over all of b must be n*(nthreads+1)/2.  It is computed in real(8) so
  ! that raising n cannot overflow default-integer arithmetic; all values
  ! involved are whole numbers exactly representable in real(8), which is why
  ! the exact == comparison below is valid.
  expected = real(n, 8) * real(nthreads + 1, 8) / 2.0d0

  ! ---- plain global-memory reads ------------------------------------------
  db = 100.0d0
  da = [ (dble(i), i = 1, nthreads) ]
  ! Untimed warm-up launch; also the point where launch errors are caught.
  call without<<<n/nthreads, nthreads>>> (da, db)
  call check(cudaGetLastError(), "launch of without")
  call check(cudaEventRecord(start,0), "cudaEventRecord(start)")
  do j = 1, ntimes
    call without<<<n/nthreads, nthreads>>> (da, db)
  end do
  call check(cudaEventRecord(stop,0), "cudaEventRecord(stop)")
  call check(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
  call check(cudaEventElapsedTime(time1, start, stop), "cudaEventElapsedTime")
  ! Elapsed time is reported in milliseconds; convert to seconds per launch.
  time1 = time1 / (ntimes*1.0e3)
  b = db
  print *,sum(b)==expected

  ! ---- reads through the texture ------------------------------------------
  db = 100.0d0
  t => da ! assign the texture to da using f90 pointer assignment
  call withtex<<<n/nthreads, nthreads>>> (da, db)
  call check(cudaGetLastError(), "launch of withtex")
  call check(cudaEventRecord(start,0), "cudaEventRecord(start)")
  do j = 1, ntimes
    call withtex<<<n/nthreads, nthreads>>> (da, db)
  end do
  call check(cudaEventRecord(stop,0), "cudaEventRecord(stop)")
  call check(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
  call check(cudaEventElapsedTime(time2, start, stop), "cudaEventElapsedTime")
  time2 = time2 / (ntimes*1.0e3)
  b = db
  print *,sum(b)==expected

  print *,"Time with textures",time2
  print *,"Time without textures",time1
  print *,"Speedup with textures",time1 / time2

  ! Unbind the texture before its target goes away, then release everything.
  nullify(t)
  deallocate(da)
  deallocate(db)
  istat = cudaEventDestroy(start)
  istat = cudaEventDestroy(stop)

contains

  ! Stop the program with a readable message if a CUDA runtime call failed.
  subroutine check(code, what)
    integer, intent(in)          :: code   ! status returned by the CUDA call
    character(len=*), intent(in) :: what   ! label of the call, for the message

    if (code /= cudaSuccess) then
      print *, "CUDA error in ", what, ": ", trim(cudaGetErrorString(code))
      stop 1
    end if
  end subroutine check

end program texture_test
This is the example on pages 17–18.
What is bitrev8 doing?
Do I have to do that kind of mapping when I use textures?