Need help writing "non-thunking" code in Fortran This code also uses complex numbers

I would be very grateful for help with getting the following code working. The output matrix,C, contains only zeros, whereas I am expecting the product of two random numbers.

[codebox] !

! Simple Fortan90 program that multiplies 2 square matrices calling Cgemm

! C = alpha A*B + beta C

!

program matrix_multiply

implicit none

integer, parameter :: fp_kind = kind(0.0)

integer, parameter :: spc = kind((1.0_fp_kind,1.0_fp_kind))

! Define

complex(spc), allocatable :: A( : , : ), B( : , : ), C( : , : )

real, allocatable :: A_ran( : , : )

real, allocatable :: B_ran( : , : )

real :: time_start,time_end, norm

complex(spc) zZero

complex(spc) alpha

complex(spc) beta

integer:: i,j,m1,ierr

integer :: nr,nc,nr_maxA,nc_MaxA,nr_maxB,nc_MaxB

external :: cublas_cgemm

external :: cublas_init, cublas_shutdown, cublas_free

integer, external :: cublas_alloc, cublas_set_matrix, cublas_get_matrix, cublas_set_vector, cublas_get_vector

integer devPtrA, devPtrB, devPtrC, sizeof_complex, status

zZero = cmplx( 0.0, 0.0 )

alpha = cmplx( 1.0, 0.0 )

beta = zZero

sizeof_complex = spc

m1 = 2496

! Fill the complex matrices with random numbers

allocate( A(1:m1, 1:m1 ) )

  A = cmplx( 0.0, 0.0 )

  allocate( A_ran(1:m1, 1:m1 ) )

  allocate( B_ran(1:m1, 1:m1 ) )

  call random_seed( )

  call random_number( A_ran )

  call random_number( B_ran )

  A = cmplx( A_ran, B_ran )

  allocate( B(1:m1, 1:m1 ) )

  B = cmplx( 0.0, 0.0 )

  call random_number( A_ran )

  call random_number( B_ran )

  B = cmplx( A_ran, B_ran )

  deallocate( A_ran, B_ran )

allocate(C(m1,m1))

  C = cmplx( 0.0, 0.0 )

! Check one entry of each array

  print *,'A(1,1) = ',A(1,1)

  print *,'B(1,1) = ',B(1,1)

  print *,'C(1,1) = ',C(1,1)

call cublas_init

  status = cublas_alloc(m1*m1, sizeof_complex,devPtrA)

  if(status /= 0) then

    write(*,*) "A device memory allocation failed"

    call cublas_shutdown

    stop

  endif

  status = cublas_set_matrix(m1,m1,sizeof_complex,A,m1,devPtrA,m1)

  if(status /= 0) then

    call cublas_free(devPtrA)

    write(*,*) "A data download failed"

    call cublas_shutdown

    stop

  endif

  status = cublas_alloc(m1*m1, sizeof_complex,devPtrB)

  if(status /= 0) then

    write(*,*) "B device memory allocation failed"

    call cublas_shutdown

    stop

  endif

  status = cublas_set_matrix(m1,m1,sizeof_complex,B,m1,devPtrB,m1)

  if(status /= 0) then

    call cublas_free(devPtrB)

    write(*,*) "B data download failed"

    call cublas_shutdown

    stop

  endif

  status = cublas_alloc(m1*m1, sizeof_complex,devPtrC)

  if(status /= 0) then

    write(*,*) "C device memory allocation failed"

    call cublas_shutdown

    stop

  endif

  status = cublas_set_matrix(m1,m1,sizeof_complex,C,m1,devPtrC,m1)

  if(status /= 0) then

    call cublas_free(devPtrC)

    write(*,*) "C data download failed"

    call cublas_shutdown

    stop

  endif

! Compute the matrix product

  call cpu_time(time_start)

call cublas_Cgemm (‘n’,‘n’,m1,m1,m1,alpha,A,m1,B,m1,beta,C,m1)

! call cublasCgemm (‘n’,‘n’,m1,m1,m1,alpha,devPtrA,m1,devPtrB,m1,beta,devPtrC,m1)

 print *,'Using cublas_CGEMM'

call cpu_time(time_end)

status = cublas_get_matrix(m1,m1,sizeof_complex, devPtrC,m1,C,m1)

  if(status /= 0) then

    call cublas_free(devPtrC)

    write(*,*) "C data upload failed"

    call cublas_shutdown

    stop

  endif

  call cublas_free(devPtrA)

  call cublas_free(devPtrB)

  call cublas_free(devPtrC)

  call cublas_shutdown

! Check one entry of each array - again

  print *,'A(1,1) = ',A(1,1)

  print *,'B(1,1) = ',B(1,1)

  print *,'C(1,1) = ',C(1,1)

! Print timing information

  print "(i5,1x,a,1x,f8.4,2x,a,f12.4)", m1, " time =",time_end-time_start, " MFLOPS=",1.e-6*2._fp_kind*m1*m1*m1/(time_end-time_start)

norm = 0.0

  do j = 1,m1

    do i = 1,m1

      norm = norm + C(i,j)*conjg(C(i,j))

    end do

  end do 

  norm = sqrt(norm)

  print *,'m1,C-norm = ',m1,norm

  print *,' '

deallocate(A,B,C)

end program matrix_multiply

[/codebox]

The corresponding Makefile is:

[codebox]

FC = ifort

FFLAGS= -O3 -fpp

cgemm_nonthunking: cgemm_nonthunking.f90 fortran.o

    $(FC)  -o cgemm_nonthunking $(FFLAGS) cgemm_nonthunking.f90 fortran.o -L/usr/local/cuda/lib64  -lcublas 

fortran.o: fortran.c

    gcc -O3 -I/usr/local/cuda/include -c  fortran.c

clean:

    rm cgemm_nonthunking  fortran.o

[/codebox]

The output of the above program ls:

[codebox]

A(1,1) = (0.4843275,0.5477588)

B(1,1) = (0.6729684,1.9516187E-02)

C(1,1) = (0.0000000E+00,0.0000000E+00)

Using cublas_CGEMM

A(1,1) = (0.4843275,0.5477588)

B(1,1) = (0.6729684,1.9516187E-02)

C(1,1) = (0.0000000E+00,0.0000000E+00)

2496 time = 0.0000 MFLOPS= Infinity

m1,C-norm = 2496 0.0000000E+00

[/codebox]

Again, any and all help will be appreciated.