Hi there,
I have a deadline tomorrow at 12 and I’ve accidently messed up my code so I need urgent help from anyone who can.
The code’s basically supposed to use constant memory for the devifact array, but when i run it it’s saying copyin Memcpy FAILED:17.
The copy from host to constant memory is devifact=ifact(1:292)
Please please please can anyone help.
Host code
subroutine matmul_host(lowt,a,b,d,ifact,nbasis)
! Declare host arrays
Implicit none
Integer:: threadblocks
Integer:: error, istat,size
Integer, value:: lowt, nbasis,blocksize
double precision, dimension(1:lowt):: a, b,d
integer::ifact(292)
! Declare device arrays
Double precision, device,allocatable, dimension(:)::adev,bdev,ddev
integer, constant::devifact(292)
! Declare grid and block
type(dim3):: dimGrid, dimBlock
! Declare variables for timer
real ctimeall, ctimekernel
integer c1, c2, c3, c4
! Start time for kernel and data transfers
call system_clock( count=c1 )
! Allocate memory on the device
allocate(adev(1:lowt),bdev(1:lowt),ddev(1:lowt))
istat = cudathreadsynchronize()
! Define the grid and block shape
blocksize=3
if (mod(nbasis, blocksize)==0) then
threadblocks=nbasis/blocksize
else
threadblocks=nbasis/blocksize+1
end if
dimGrid= dim3(threadblocks, 1, 1)
dimBlock= dim3 (blocksize, 1, 1)
! Copy a, b, and ifact to the device
Adev = A(1:lowt)
bdev = b(1:lowt)
devifact=ifact(1:292)
! Second timer to record kernel time without data transfers
call system_clock( count=c2 )
! Kernel invocation
call matmulcuf<<<dimGrid,dimBlock>>>(lowt,adev,bdev,ddev,devifact,nbasis,blocksize)
! Synchronise threads
istat = cudathreadsynchronize()
! Stop second timer
call system_clock( count=c3 )
! Copy results back to host array
d = ddev(1:lowt)
! Stop first timer
call system_clock( count=c4 )
! Freeing arrays on device(may need to declare as integer function)
deallocate(adev,bdev,ddev)
! Print out kernel times
ctimekernel = c3 - c2
ctimeall = c4 - c1
print *, 'Kernel time excluding data xfer:', ctimekernel/1000000
print *, 'Total time including data xfer: ', ctimeall/1000000
end
end
Device code
Module matmul_mod
use cudafor
contains
! Device code
attributes(global) subroutine matmulcuf(lowt, a, b, d, ifact, nbasis, blocksize)
implicit none
double precision :: a(lowt), b(lowt), d(lowt)
integer:: i, j, k, ii, jj, kk, l, m, tx, bx
double precision :: sum, temp
integer, value:: lowt, nbasis, blocksize
double precision, value:: one
! Declare devifact to reside in constant memory
Integer, constant:: ifact(292)
! Initialising variables
kk=0
ii=0
jj=0
! Setting local thread index and block index
tx= threadidx%x
bx= blockidx%x
! Decomposing into threads
i= blocksize*(bx-1)+(tx)
! Conditional statement to prevent threads working outside of array boundaries
if (i<(nbasis+1))then
! Ifact array used as an index
ii=ifact(i)
! Outer j loop
do j=1,i
jj=ifact(j)
sum=0.0d0
one=0.0d0;
! First vecotr multiplication, indexing set by ifact(j) and ifact(i),
! the results is accumulated in sum
do k=1,j
sum=sum+a(ii+k)*b(jj+k)
end do
! Second vecotr multiplication, indexing set by ifact(k) and ifact(i),
! the results is accumulated in sum
do k=j+1,i
kk=ifact(k)
sum=sum+a(ii+k)*b(kk+j)
end do
! Third vecotr multiplication, indexing set by ifact(k),
! the results is accumulated in sum
do k=i+1,nbasis
kk=ifact(k)
sum=sum+a(kk+i)*b(kk+j)
end do
! Storing intermediate variables
temp=sum;
one=-1.0d0;
sum=0.0d0
! Combined section of kernel
! Multiplying the opposite part of the a and b arrays,
! notice the b and a arrays have swapped positions.
! First vecotr multiplication, indexing set by ifact(j) and ifact(i),
! the results is accumulated in sum
do k=1,j
sum=sum+b(ii+k)*a(jj+k)
end do
! Second vecotr multiplication, indexing set by ifact(k) and ifact(i),
! the results is accumulated in sum
do k=j+1,i
kk=ifact(k)
sum=sum+b(ii+k)*a(kk+j)
end do
! Third vecotr multiplication, indexing set by ifact(k),
! the results is accumulated in sum
do k=i+1,nbasis
kk=ifact(k)
sum=sum+b(kk+i)*a(kk+j)
end do
! Calculating the counter
m=0
do l=i-1,0,-1
m=m+l
end do
! Calculating the result
d(m+j)=sum+one*temp
end do
end if
end
Thank you in advance,
Crip_crop