Dynamic allocation of memory on Device

Hi guys,

I am putting together a recursive strassen matrix multiply scheme using an example from


and the routines described in this paper


I’m currently just trying to get my kernels to run correctly when called from another kernel. I tested my main kernels and they work without any problems. I was also able to launch the Strassen kernel without problems when I allocate the device memory for A11, A12, … etc. from the host.

When I try to dynamically allocate memory in the device code, I get an unspecified launch failure from cudaGetLastError, followed by a memcpy-out error when C is transferred from device to host. This is the same approach as presented in the insider article, and after reading the reference guide’s topic on this, I’m not sure what is causing the error.

Whatever the case my goal is to recursively apply the Strassen kernel. I need a good way to handle memory allocation for the submatrices or a good way to break up A and B into submatrices.

I noticed that the assignment of arrays in the kernel is different than in normal fortran. I can’t just pass to a kernel A(1:x,1:x), is there a good way to do this instead of using kernels to split and combine A and B?

Card: Tesla K20
Compiled with: pgf90 -Mcuda=cuda5.0,cc35,rdc -tp:x64

  ! Module-level device arrays shared between the Strassen parent kernel
  ! and its child kernels.  Declared allocatable so they can be allocated
  ! from device code (by thread 1 of Strassen) instead of from the host.
  use cudafor
  ! Scratch matrices for Strassen intermediate sums/products (N/2 x N/2).
  real(KIND(0.D0)), allocatable, device, dimension(:,:) :: T1, T2
  ! Quadrants of input matrix A (N/2 x N/2 each).
  real(KIND(0.D0)), allocatable, device, dimension(:,:) :: A11, A12, A21, A22
  ! Quadrants of input matrix B (N/2 x N/2 each).
  real(KIND(0.D0)), allocatable, device, dimension(:,:) :: B11, B12, B21, B22
  ! Quadrants of the result matrix C (N/2 x N/2 each).
  real(KIND(0.D0)), allocatable, device, dimension(:,:) :: C11, C12, C21, C22
! One Strassen level of the N x N product C = A * B, run on the device.
! Launch with <<<1,1>>>: thread 1 allocates the submatrix workspace on the
! device heap and launches the worker kernels via dynamic parallelism
! (requires cc >= 3.5 and relocatable device code, -Mcuda=rdc).
!
! NOTE(review): device-side ALLOCATE draws from the CUDA malloc heap, whose
! default size is only 8 MB.  For N = 2048 the fourteen (N/2)x(N/2)
! double-precision matrices below need ~117 MB, so the host must first call
! cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...) or these allocations
! fail and the launch aborts with "unspecified launch failure".
attributes(global) subroutine Strassen(A, B, C, N)
	integer, value :: N                                ! matrix order; N/2 must be a multiple of 16
	real(KIND(0.D0)), device, dimension(N,N) :: A, B   ! input operands
	real(KIND(0.D0)), device, dimension(N,N) :: C      ! result C = A*B
	INTEGER :: i, x, istat
	type(dim3) :: blocks
	type(dim3) :: threads

	i = threadidx%x
	x = N/2                                            ! submatrix order
	blocks  = dim3(x/16, x/16, 1)                      ! one 16x16 tile per block
	threads = dim3(16, 16, 1)

	! Only one thread allocates the workspace and issues the child launches.
	! Child kernels go into the default stream, so they run one after another
	! and each launch sees the results of the previous ones.
	IF (i == 1) THEN
		! Workspace for quadrants and temporaries; check stat so a heap
		! exhaustion bails out instead of corrupting later launches.
		allocate(A11(x,x), A12(x,x), A21(x,x), A22(x,x), stat=istat)
		if (istat /= 0) return
		allocate(B11(x,x), B12(x,x), B21(x,x), B22(x,x), stat=istat)
		if (istat /= 0) return
		allocate(C11(x,x), C12(x,x), C21(x,x), C22(x,x), stat=istat)
		if (istat /= 0) return
		allocate(T1(x,x), T2(x,x), stat=istat)
		if (istat /= 0) return

		! Partition A and B into their four quadrants.
		call Split<<<blocks,threads>>>(A, B, A11, A12, A21, A22, B11, B12, B21, B22, N)

		! Winograd-style schedule: the C quadrants double as temporaries.
		call sub<<<blocks,threads>>>(A21, A11, C12, x)
		call add<<<blocks,threads>>>(B11, B12, C21, x)
		call mmul<<<blocks,threads>>>(C12, C21, C22, x)
		call sub<<<blocks,threads>>>(A12, A22, C12, x)
		call add<<<blocks,threads>>>(B21, B22, C21, x)
		call mmul<<<blocks,threads>>>(C12, C21, C11, x)
		call add<<<blocks,threads>>>(A11, A22, C12, x)
		call add<<<blocks,threads>>>(B11, B22, C21, x)
		call mulIncInc<<<blocks,threads>>>(C12, C21, C11, C22, x)
		call add<<<blocks,threads>>>(A21, A22, T2, x)
		call mulStoreDec<<<blocks,threads>>>(T2, B11, C21, C22, x)
		call sub<<<blocks,threads>>>(B21, B11, T1, x)
		call mulIncInc<<<blocks,threads>>>(A22, T1, C21, C11, x)
		call sub<<<blocks,threads>>>(B12, B22, T1, x)
		call mulStoreInc<<<blocks,threads>>>(A11, T1, C12, C22, x)
		call add<<<blocks,threads>>>(A11, A12, T2, x)
		call mulIncDec<<<blocks,threads>>>(T2, B22, C12, C11, x)

		! Reassemble C from its quadrants.
		call Combine<<<blocks,threads>>>(C, C11, C12, C21, C22, N)

		! Wait for every child kernel to finish before releasing the
		! arrays they read/write; freeing earlier corrupts the heap.
		istat = cudaDeviceSynchronize()

		DEALLOCATE(A11, A12, A21, A22)
		DEALLOCATE(B11, B12, B21, B22)
		DEALLOCATE(C11, C12, C21, C22)
		DEALLOCATE(T1, T2)   ! original leaked T1/T2
	END IF                   ! original was missing this END IF
end subroutine Strassen

! Driver: reads A, B, and the reference product C_mat from text files,
! runs the device-side Strassen multiply once, and validates the result.
program main

  USE CUDA_Kernels
  USE cudafor
  implicit none

  integer, parameter :: N = 2048      ! matrix order (input files must match)
  integer, parameter :: NREPS = 1     ! timing repetitions
  real(KIND(0.D0)), allocatable, dimension(:,:) :: C_mat
  real(KIND(0.D0)), allocatable, dimension(:,:), pinned :: A, B, C
  real(KIND(0.D0)), allocatable, device, dimension(:,:) :: dA, dB, dC
  real(KIND(0.D0)) :: rmaxerr, rsumerr, gflops
  integer :: nerrors
  real :: time
  INTEGER :: i, j, istat, ir, x, loci, locj
  integer(kind=8) :: heapsize
  type(cudaEvent) :: start, stop
  type(dim3) :: blocks
  type(dim3) :: threads

  x = N/2
  istat = cudaEventCreate(start)
  istat = cudaEventCreate(stop)

  blocks = dim3(x/16, x/16, 1)
  threads = dim3(16, 16, 1)

  ! Strassen allocates 14 (N/2)x(N/2) double matrices on the device heap,
  ! which defaults to only 8 MB.  Reserve enough up front (plus slack) or
  ! the in-kernel ALLOCATE fails -> "unspecified launch failure".
  heapsize = 14_8 * int(x,8) * int(x,8) * 8_8 + 16_8*1024*1024
  istat = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapsize)

  allocate(A(N,N), B(N,N), C(N,N), C_mat(N,N))
  allocate(dA(N,N), dB(N,N), dC(N,N))

  OPEN (UNIT = 10, FILE = 'A.txt', status='OLD')
  OPEN (UNIT = 20, FILE = 'B.txt', status='OLD')
  OPEN (UNIT = 30, FILE = 'C_mat.txt', status='OLD')

  ! Read one row per record.  Max size of N = 4096.
  DO i = 1, N
    READ(10,*) (A(i,j), j=1,N)
    READ(20,*) (B(i,j), j=1,N)
    READ(30,*) (C_mat(i,j), j=1,N)
  END DO              ! original was missing this END DO
  CLOSE(10)
  CLOSE(20)
  CLOSE(30)

  C = 0.d0

  ! Host -> device transfers (assignment overloads in cudafor).
  dA = A
  dB = B
  dC = C

  ! Timing experiment: a single <<<1,1>>> parent launch; all parallelism
  ! comes from the child kernels inside Strassen.
  time = 0.0
  istat = cudaEventRecord(start, 0)
  call Strassen<<<1,1>>>(dA, dB, dC, N)
  istat = cudaEventRecord(stop, 0)
  istat = cudaDeviceSynchronize()
  istat = cudaEventElapsedTime(time, start, stop)
  time = time / (NREPS*1.0e3)          ! ms -> s per repetition
  ir = cudaGetLastError()              ! ir was undeclared in the original
  print *, cudaGetErrorString( ir )
  C = dC                               ! device -> host result

  ! Validate against the reference product; accumulators must start at 0
  ! (they were uninitialized in the original).
  rmaxerr = 0.d0
  rsumerr = 0.d0
  nerrors = 0
  loci = 0
  locj = 0
  do j = 1, N
    do i = 1, N
      if (C(i,j) .ne. C_mat(i,j)) then
        if (abs(C(i,j)-C_mat(i,j)) .gt. rmaxerr) THEN
          rmaxerr = abs(C(i,j)-C_mat(i,j))
          loci = i
          locj = j
        end if
        if (abs(C(i,j)-C_mat(i,j)) .gt. 1d-10) nerrors = nerrors + 1
        rsumerr = rsumerr + abs(C(i,j)-C_mat(i,j))
      end if
    end do
  end do

  if (nerrors .eq. 0) then
    print *, "Test passed!"
  end if                               ! original was missing this END IF
  print *, nerrors, " errors were encountered"
  print *, "Max error was ", rmaxerr
  print *, "Ave error was ", rsumerr / (real(N,KIND(0.D0)) * N)

  gflops = 0 ! 33095680/time/1d9 !(7T+7m^2/4)/time/G
  write (*,901) N, N, N, N, time*1.0d3, gflops
  print *, "### Location of max error is ", loci, locj
  print *, "### C(2000,2000)     =", C(2000,2000)
  print *, "### C_mat(2000,2000) =", C_mat(2000,2000)
  ! Standard format: the original used nonstandard '\t' escapes.
901 format(i0,'x',i0,' * ',i0,'x',i0,':',2x,f8.3,' ms',2x,f11.3,' GFlops/s')

  ! DEALLOCATE takes object names only; the original's deallocate(A(N,N),...)
  ! was a syntax error.
  deallocate(A, B, C, C_mat)
  deallocate(dA, dB, dC)

end program

Hi Mr. Savage,

Would you mind sending the complete code to PGI Customer Support (trs@pgroup.com) and asking them to forward it to me?

It could be a problem with your code or since dynamic parallelism is a new feature there’s possibly still issues to be worked out. Either way, I’ll need to replicate the error before I can determine the cause.