I would be very grateful for help with getting the following code working. The output matrix,C, contains only zeros, whereas I am expecting the product of two random numbers.
[codebox] !
! Simple Fortan90 program that multiplies 2 square matrices calling Cgemm
! C = alpha A*B + beta C
!
program matrix_multiply
implicit none
integer, parameter :: fp_kind = kind(0.0)
integer, parameter :: spc = kind((1.0_fp_kind,1.0_fp_kind))
! Define
complex(spc), allocatable :: A( : , : ), B( : , : ), C( : , : )
real, allocatable :: A_ran( : , : )
real, allocatable :: B_ran( : , : )
real :: time_start,time_end, norm
complex(spc) zZero
complex(spc) alpha
complex(spc) beta
integer:: i,j,m1,ierr
integer :: nr,nc,nr_maxA,nc_MaxA,nr_maxB,nc_MaxB
external :: cublas_cgemm
external :: cublas_init, cublas_shutdown, cublas_free
integer, external :: cublas_alloc, cublas_set_matrix, cublas_get_matrix, cublas_set_vector, cublas_get_vector
integer devPtrA, devPtrB, devPtrC, sizeof_complex, status
zZero = cmplx( 0.0, 0.0 )
alpha = cmplx( 1.0, 0.0 )
beta = zZero
sizeof_complex = spc
m1 = 2496
! Fill the complex matrices with random numbers
allocate( A(1:m1, 1:m1 ) )
A = cmplx( 0.0, 0.0 )
allocate( A_ran(1:m1, 1:m1 ) )
allocate( B_ran(1:m1, 1:m1 ) )
call random_seed( )
call random_number( A_ran )
call random_number( B_ran )
A = cmplx( A_ran, B_ran )
allocate( B(1:m1, 1:m1 ) )
B = cmplx( 0.0, 0.0 )
call random_number( A_ran )
call random_number( B_ran )
B = cmplx( A_ran, B_ran )
deallocate( A_ran, B_ran )
allocate(C(m1,m1))
C = cmplx( 0.0, 0.0 )
! Check one entry of each array
print *,'A(1,1) = ',A(1,1)
print *,'B(1,1) = ',B(1,1)
print *,'C(1,1) = ',C(1,1)
call cublas_init
status = cublas_alloc(m1*m1, sizeof_complex,devPtrA)
if(status /= 0) then
write(*,*) "A device memory allocation failed"
call cublas_shutdown
stop
endif
status = cublas_set_matrix(m1,m1,sizeof_complex,A,m1,devPtrA,m1)
if(status /= 0) then
call cublas_free(devPtrA)
write(*,*) "A data download failed"
call cublas_shutdown
stop
endif
status = cublas_alloc(m1*m1, sizeof_complex,devPtrB)
if(status /= 0) then
write(*,*) "B device memory allocation failed"
call cublas_shutdown
stop
endif
status = cublas_set_matrix(m1,m1,sizeof_complex,B,m1,devPtrB,m1)
if(status /= 0) then
call cublas_free(devPtrB)
write(*,*) "B data download failed"
call cublas_shutdown
stop
endif
status = cublas_alloc(m1*m1, sizeof_complex,devPtrC)
if(status /= 0) then
write(*,*) "C device memory allocation failed"
call cublas_shutdown
stop
endif
status = cublas_set_matrix(m1,m1,sizeof_complex,C,m1,devPtrC,m1)
if(status /= 0) then
call cublas_free(devPtrC)
write(*,*) "C data download failed"
call cublas_shutdown
stop
endif
! Compute the matrix product
call cpu_time(time_start)
call cublas_Cgemm (‘n’,‘n’,m1,m1,m1,alpha,A,m1,B,m1,beta,C,m1)
! call cublasCgemm (‘n’,‘n’,m1,m1,m1,alpha,devPtrA,m1,devPtrB,m1,beta,devPtrC,m1)
print *,'Using cublas_CGEMM'
call cpu_time(time_end)
status = cublas_get_matrix(m1,m1,sizeof_complex, devPtrC,m1,C,m1)
if(status /= 0) then
call cublas_free(devPtrC)
write(*,*) "C data upload failed"
call cublas_shutdown
stop
endif
call cublas_free(devPtrA)
call cublas_free(devPtrB)
call cublas_free(devPtrC)
call cublas_shutdown
! Check one entry of each array - again
print *,'A(1,1) = ',A(1,1)
print *,'B(1,1) = ',B(1,1)
print *,'C(1,1) = ',C(1,1)
! Print timing information
print "(i5,1x,a,1x,f8.4,2x,a,f12.4)", m1, " time =",time_end-time_start, " MFLOPS=",1.e-6*2._fp_kind*m1*m1*m1/(time_end-time_start)
norm = 0.0
do j = 1,m1
do i = 1,m1
norm = norm + C(i,j)*conjg(C(i,j))
end do
end do
norm = sqrt(norm)
print *,'m1,C-norm = ',m1,norm
print *,' '
deallocate(A,B,C)
end program matrix_multiply
[/codebox]
The corresponding Makefile is:
[codebox]
FC = ifort
FFLAGS= -O3 -fpp
cgemm_nonthunking: cgemm_nonthunking.f90 fortran.o
$(FC) -o cgemm_nonthunking $(FFLAGS) cgemm_nonthunking.f90 fortran.o -L/usr/local/cuda/lib64 -lcublas
fortran.o: fortran.c
gcc -O3 -I/usr/local/cuda/include -c fortran.c
clean:
rm cgemm_nonthunking fortran.o
[/codebox]
The output of the above program ls:
[codebox]
A(1,1) = (0.4843275,0.5477588)
B(1,1) = (0.6729684,1.9516187E-02)
C(1,1) = (0.0000000E+00,0.0000000E+00)
Using cublas_CGEMM
A(1,1) = (0.4843275,0.5477588)
B(1,1) = (0.6729684,1.9516187E-02)
C(1,1) = (0.0000000E+00,0.0000000E+00)
2496 time = 0.0000 MFLOPS= Infinity
m1,C-norm = 2496 0.0000000E+00
[/codebox]
Again, any and all help will be appreciated.