Error in compiling CUDA Fortran

Hello everybody,

I got the following error:
NVFORTRAN-S-0034-Syntax error at or near device (XXXXX.f90: 59)

And the line 59 is:
real, device, allocatable :: d_A(:,:), d_B(:,:), d_C(:,:)
I just want to multiply 2 random matrices A, B

What should I do now?

Thank you.

Hi @mbzadegan, welcome to the NVIDIA developer forums.

I think we need a bit more detail to be able to help you.

Can you share some context?

Thanks!

Thank you Markus,
This is my code that I want to multiply two matrices with CPU and GPU and then show the processing times:

My code in GitHub

Oh, my bad, I forgot to change .f90 to .cuf or using -cuda.

Now, I got the following error on my code:

user@xadegan:~$ nvfortran CPU_1T-CPU-MT-GPU.cuf
NVFORTRAN-S-0155-Global and device subprograms may not be contained in the main program or another subprogram - matrix_mult_kernel (CPU_1T-CPU-MT-GPU.cuf: 76)
NVFORTRAN/x86-64 Linux 25.1-0: compilation completed with severe errors
user@xadegan:~$

The line 76 is:
attributes(global) subroutine matrix_mult_kernel(A, B, C, N)
What should I do with this line to resolve this error?
Thank you!

Put the GPU code in a module. Also, cudaDeviceSynchronize() is a function not a subroutine. You also don’t really need device in the global subroutine



! Here's a Fortran program implementing matrix multiplication using three approaches:
!     CPU Single-threaded
!     OpenMP Multi-threaded
!     CUDA (GPU-based)
! This program initializes two random matrices, performs multiplication using the three methods, and then measures the execution time for each approach.

module gpu_mat
  contains
  subroutine gpu_matrix_multiply(A, B, C, N, start, finish)
    use cudafor
    implicit none
    integer, intent(in) :: N
    real, dimension(N,N), intent(in) :: A, B
    real, dimension(N,N), intent(out) :: C
    real, device, allocatable :: d_A(:,:), d_B(:,:), d_C(:,:)
    real, intent(out) :: start, finish
    integer:: istat

    allocate(d_A(N,N), d_B(N,N), d_C(N,N))
    d_A = A
    d_B = B
    d_C = 0.0

    call cpu_time(start)
    call matrix_mult_kernel<<<(N+15)/16, (N+15)/16>>>(d_A, d_B, d_C, N)
    istat= cudaDeviceSynchronize()
    call cpu_time(finish)

    C = d_C
    deallocate(d_A, d_B, d_C)
  end subroutine gpu_matrix_multiply

  attributes(global) subroutine matrix_mult_kernel(A, B, C, N)
    use cudafor
    implicit none
    integer, value :: N
    real,device :: A(N,N), B(N,N), C(N,N)
    integer :: i, j, k
    i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
    j = (blockIdx%y - 1) * blockDim%y + threadIdx%y
    if (i <= N .and. j <= N) then
       C(i,j) = 0.0
       do k = 1, N
          C(i,j) = C(i,j) + A(i,k) * B(k,j)
       end do
    end if
  end subroutine matrix_mult_kernel
end module gpu_mat

program matrix_multiplication
  use omp_lib
  use gpu_mat
  implicit none
  integer, parameter :: N = 512
  real, dimension(N,N) :: A, B, C
  integer :: i, j, k
  real :: start, finish

  call random_seed()
  call random_number(A)
  call random_number(B)
  C = 0.0

  ! Single-threaded CPU matrix multiplication
  call cpu_time(start)
  do i = 1, N
     do j = 1, N
        do k = 1, N
           C(i,j) = C(i,j) + A(i,k) * B(k,j)
        end do
     end do
  end do
  call cpu_time(finish)
  print *, 'Single-thread CPU Time:', finish - start

  ! OpenMP Multi-threaded matrix multiplication
  C = 0.0
  call cpu_time(start)
  !$omp parallel do private(i,j,k) shared(A,B,C) schedule(static)
  do i = 1, N
     do j = 1, N
        do k = 1, N
           C(i,j) = C(i,j) + A(i,k) * B(k,j)
        end do
     end do
  end do
  !$omp end parallel do
  call cpu_time(finish)
  print *, 'OpenMP Multi-threaded CPU Time:', finish - start

  ! CUDA Kernel Call
  call gpu_matrix_multiply(A, B, C, N, start, finish)
  print *, 'CUDA GPU Time:', finish - start

end program matrix_multiplication

This is the output:

./a.out 
 Single-thread CPU Time:   0.5041461    
 OpenMP Multi-threaded CPU Time:   0.4083960    
 CUDA GPU Time:   1.1618137E-03
1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.