Calling cuda with cublas_v2 from fortran

mbarbry · March 2, 2017, 9:59am

Dear Cuda developers,

I’m trying to write some complex functions in CUDA that use BLAS routines in the device code.
These CUDA routines are then called from a Fortran program.
Unfortunately, I can’t manage to compile my program; I have reproduced the problem in a small test.

The cuda routines,

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <cublas_v2.h>


/* Host-side helper: returns the sum of the two integers a and b. */
int addition(int a, int b)
{
  return a + b;
}

/*
 * Element-wise integer addition kernel: C[i] = A[i] + B[i] for every i < N.
 *
 * Each thread handles the element selected by its flat 1D global index
 * (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is >= N leave
 * the arrays untouched. A, B and C are dereferenced in device code, so they
 * must point to device-accessible memory holding at least N ints.
 *
 * Every thread also creates and then destroys a cuBLAS handle from device
 * code. The handle is not used by the addition itself. The creation status is
 * checked so that a handle that was never created is not passed to
 * cublasDestroy; a failed creation is reported with printf but does not
 * prevent the addition from being performed.
 */
__global__ void addition_vect(int *A, int *B, int *C, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  cublasHandle_t cnpHandle;
  cublasStatus_t status = cublasCreate(&cnpHandle);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    printf("addition_vect: cublasCreate failed in thread %d (status %d)\n",
           i, (int)status);
  }

  // Bounds check: the launch may contain more threads than elements.
  if (i<N)
  {
    C[i] = A[i] + B[i];
  }

  // Only release the handle if it was actually created.
  if (status == CUBLAS_STATUS_SUCCESS)
  {
    cublasDestroy(cnpHandle);
  }
}

/*
 * Debug kernel: prints elements of arr with device-side printf.
 *
 * The first index comes from the x dimension of the launch and the second
 * from the y dimension. A thread prints arr[col + row*M] only when row < N
 * and col < M; all other threads return without printing.
 */
__global__ void print_array_dev(float *arr, int N, int M)
{
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  const int col = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside the N x M index range have nothing to print.
  if (row >= N || col >= M)
  {
    return;
  }

  printf("arr[%d, %d] = %f\n", row, col, arr[col + row * M]);
}

/*
 * C-linkage host wrapper that adds two host integer arrays on the GPU.
 *
 * A, B : host input arrays of *Np ints.
 * C    : host output array of *Np ints; receives A[i] + B[i].
 * Np   : pointer to the element count. All arguments are pointers and the
 *        name carries a trailing underscore, presumably to match the calling
 *        convention of the Fortran caller (confirm for other compilers).
 *
 * The routine allocates device buffers, copies A and B to the device,
 * launches addition_vect, copies the result back into C and frees the device
 * buffers. Runtime API failures are reported through checkCudaErrors.
 * Nothing is done when *Np <= 0.
 */
extern "C" void addition_vect_cuda_(int *A, int *B, int *C, int *Np)
{
  const int N = *Np;

  // An empty (or negative) size would give an invalid launch configuration.
  if (N <= 0)
  {
    return;
  }

  const size_t bytes = sizeof(int) * (size_t)N;

  // Launch configuration: a single block of N threads is only valid while N
  // stays below the per-block thread limit, so cap the block size and cover
  // the remaining elements with additional blocks. The kernel bounds-checks
  // its index, so surplus threads in the last block are harmless.
  const int maxThreadsPerBlock = 256;
  const int threadsPerBlock = (N < maxThreadsPerBlock) ? N : maxThreadsPerBlock;
  const int blocks = (N - 1) / threadsPerBlock + 1;

  // device variables
  int *A_d, *B_d, *C_d;

  checkCudaErrors(cudaMalloc( (void **)&A_d, bytes));
  checkCudaErrors(cudaMalloc( (void **)&B_d, bytes));
  checkCudaErrors(cudaMalloc( (void **)&C_d, bytes));

  checkCudaErrors(cudaMemcpy( A_d, A, bytes, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy( B_d, B, bytes, cudaMemcpyHostToDevice));

  addition_vect<<<blocks, threadsPerBlock>>>(A_d, B_d, C_d, N);
  // A kernel launch returns no status itself; query it explicitly.
  checkCudaErrors(cudaGetLastError());

  // The blocking copy below also waits for the kernel to finish.
  checkCudaErrors(cudaMemcpy( C, C_d, bytes, cudaMemcpyDeviceToHost));

  checkCudaErrors(cudaFree(A_d));
  checkCudaErrors(cudaFree(B_d));
  checkCudaErrors(cudaFree(C_d));

}

The main fortran program

! Small driver: fills two integer arrays, prints their sum computed in
! Fortran, then calls the CUDA wrapper and prints the sum it returns.
program test

  use iso_c_binding, only: c_int
  implicit none

  ! Explicit interface to the C-linkage CUDA wrapper. Binding to the exact
  ! symbol name "addition_vect_cuda_" removes the dependence on the Fortran
  ! compiler's external-name decoration, and c_int guarantees the integer
  ! kind matches the C "int" used on the CUDA side. All arguments are passed
  ! by reference, matching the pointer parameters of the C routine.
  interface
    subroutine addition_vect_cuda(A, B, C, N) bind(C, name="addition_vect_cuda_")
      import :: c_int
      integer(c_int), intent(in)  :: A(*), B(*)
      integer(c_int), intent(out) :: C(*)
      integer(c_int), intent(in)  :: N
    end subroutine addition_vect_cuda
  end interface

  integer(c_int), allocatable :: A(:), B(:), C(:)
  integer(c_int) :: N
  integer :: i

  N=5

  allocate(A(N))
  allocate(B(N))
  allocate(C(N))

  ! A = 1..N, B = 2, 4, .., 2N
  do i =1, N
    A(i) = i
    B(i) = 2*i
  enddo
  C = 0

  ! Reference result computed on the host.
  print*, 'fort add: ', A+B

  call addition_vect_cuda(A, B, C, N)

  print*, 'cuda add: ', C

  deallocate(A)
  deallocate(B)
  deallocate(C)

end program

and the Makefile,

# Makefile for the CUDA + Fortran test program.
# CC is the CUDA compiler driver (used for .cu files), FC the Fortran compiler
# (used for .f90 files and, in this version, for the final link).
CC=nvcc
FC=gfortran
# Target GPU architectures passed to nvcc.
ARCH = -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60
# nvcc compile flags. NOTE(review): the -dc option is the one reported in this
# thread as triggering the link error when the final link is done by gfortran.
FLAGS = -g -G -lineinfo -m64 -dc $(ARCH)
# Fortran compile flags.
FCFLAGS=-g
# Name of the executable and the objects it is built from.
PROGRAM=test
OBJECTS=func.o test.o
# CUDA installation root and the include directory passed with -I when
# compiling .cu files (func.cu includes helper_cuda.h / helper_functions.h).
CUDA=/usr/local/cuda
CUDA_HELPER=$(CUDA)/samples/common/inc
# Libraries passed on the command lines below.
LIB=-lcuda -lcudart -lcublas -lcublas_device -lcudadevrt 

# Default target: build the executable.
all:$(PROGRAM)

# Link step, performed here with $(FC) (gfortran).
$(PROGRAM): $(OBJECTS)
		$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) $(FCFLAGS) -o $@ $^ -L$(CUDA)/lib64 $(LIB) -lstdc++
# Compile Fortran sources to objects.
%.o: %.f90 
		$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) -L$(CUDA)/lib64 $(LIB) -lstdc++ $(FCFLAGS) -c $<
# Compile CUDA sources to objects with nvcc.
%.o: %.cu
		$(CC) -I$(CUDA_HELPER) $(LIB) $(FLAGS) -c $<


# Utility targets
# NOTE(review): veryclean is declared phony but no such target is defined in
# the part of the Makefile shown here.
.PHONY: clean veryclean

# Remove objects, Fortran module files and the executable.
clean:
	rm *.o *.mod $(PROGRAM)

If you remove the calls to cublas_v2 and the -dc option from FLAGS, then the routine runs fine. In fact, it seems that it is the -dc option that causes the error.

test_fortran $ make
nvcc -I/usr/local/cuda/samples/common/inc -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -g -G -lineinfo -m64 -dc -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60 -c func.cu
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -lstdc++ -g -c test.f90
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -g -o test func.o test.o -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -lstdc++
func.o: In function `__sti____cudaRegisterAll_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5()':
/tmp/tmpxft_00001b98_00000000-4_func.compute_60.cudafe1.stub.c:2: undefined reference to `__cudaRegisterLinkedBinary_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5'
collect2: error: ld returned 1 exit status
Makefile:15: recipe for target 'test' failed
make: *** [test] Error 1

If a C program is used to call the same function, the compilation goes fine.
Does someone know what is wrong here?

Thanks for your help,
Marc Barbry

tera · March 2, 2017, 10:11am

I haven’t looked into your problem closely, but have you tried using nvcc to do the linking steps where CUDA code is involved?

mbarbry · March 2, 2017, 10:15am

When compiling the CUDA code with nvcc, I use the -dc option to do the linking (since it seems to be necessary in order to call cublas_v2).
But it then causes the error when linking the CUDA and the Fortran code together with gfortran.

tera · March 2, 2017, 10:25am

gfortran doesn’t know about the device-side linking, so I’m not surprised it fails. On the other hand, gfortran-produced code can easily be linked by gcc without special Fortran knowledge. So I’d suggest using nvcc for the final linking stage.

mbarbry · March 2, 2017, 10:33am

Doing the final linking stage with nvcc solved the problem, thank you for your help.

Here is the corrected makefile,

# Makefile for the CUDA + Fortran test program.
# .cu files are compiled with nvcc, .f90 files with gfortran, and the final
# link is done with nvcc so that the device-side link required by -dc is
# performed; the gfortran runtime is then added explicitly with -lgfortran.
CC=nvcc
FC=gfortran
# Target GPU architectures; also passed at link time for the device link.
ARCH = -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60
# nvcc compile flags (-dc: compile with relocatable device code).
FLAGS = -g -G -lineinfo -m64 -dc $(ARCH)
# Fortran compile flags.
FCFLAGS=-g
PROGRAM=test
OBJECTS=func.o test.o
CUDA=/usr/local/cuda
CUDA_HELPER=$(CUDA)/samples/common/inc
# Libraries needed at link time only.
LIB=-lcuda -lcudart -lcublas -lcublas_device -lcudadevrt

all:$(PROGRAM)

# Link with nvcc (host link + device link), adding the Fortran runtime.
$(PROGRAM): $(OBJECTS)
	$(CC) $(FCFLAGS) $(ARCH) -o $@ $^ -L$(CUDA)/lib64 $(LIB) -lstdc++ -lgfortran
# Compile-only rules: library and -L flags are not needed with -c.
%.o: %.f90
	$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) $(FCFLAGS) -c $<
%.o: %.cu
	$(CC) -I$(CUDA_HELPER) $(FLAGS) -c $<


# Utility targets
.PHONY: all clean

# -f: do not fail when some of the files do not exist (e.g. no *.mod).
clean:
	rm -f *.o *.mod $(PROGRAM)

When linking Fortran code with nvcc, one needs to link against the gfortran runtime library with -lgfortran.

Topic		Replies	Views
Problem using Cuda as a static library with C++ and Fortran on VS2012 CUDA Programming and Performance	3	720	February 23, 2017
openacc with cublas batched routine in fortran Legacy PGI Compilers	7	8218	January 27, 2017
How to call cublas library into my cuda fortran code? Legacy PGI Compilers	5	5934	December 8, 2011
CUDA 6.5 building problem CUDA Setup and Installation	21	10445	March 22, 2021
cuBLAS Dgemm "Could not Resolve Generic Procedure nvc, nvc++ and nvfortran cuda , hpc	7	1448	December 17, 2021
[openACC]nvfortran minloc/maxloc became unable after update to sdk22.7, cuda11.7 + libcudaforwraprand.so error on the execution nvc, nvc++ and nvfortran	5	649	August 17, 2022
NVCC forces c++ compilation of .cu files CUDA Programming and Performance	11	25537	December 11, 2011
Calling cuBlas from a Fortran program Legacy PGI Compilers	9	769	June 8, 2020
Calling CUBLAS' cublasDgetrfBatched proper procedure Legacy PGI Compilers	3	4843	January 30, 2017
fortran cuda interface passing pointer from fortran and allocating memory on device CUDA Programming and Performance	8	9934	May 14, 2010

Calling cuda with cublas_v2 from fortran

Related topics