Calling cuda with cublas_v2 from fortran

Dear Cuda developers,

I’m trying to write some complex functions in CUDA that call BLAS routines from device code.
These CUDA routines are then called from a Fortran program.
Unfortunately, I can’t get my program to build, so I have reproduced the problem in a small test case.

The cuda routines,

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <cublas_v2.h>


/* Host-side helper: returns the sum of the two integers a and b. */
int addition(int a, int b)
{
  return a + b;
}

/*
 * Element-wise integer addition: C[i] = A[i] + B[i] for every i < N.
 *
 * Each thread handles the single element selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is
 * >= N leave C untouched.
 *
 * Every thread also creates and then destroys a device-side cuBLAS handle.
 * The handle is not used for any computation in this kernel; it is only
 * there so that the kernel calls into the cuBLAS device API.
 *
 * Parameters:
 *   A, B  input arrays, read at index i
 *   C     output array, written at index i
 *   N     number of elements to add
 */
__global__ void addition_vect(int *A, int *B, int *C, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  cublasHandle_t cnpHandle;
  // cublasCreate can fail; keep the status so that a handle which was never
  // created is not passed to cublasDestroy below.
  cublasStatus_t status = cublasCreate(&cnpHandle);
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    printf("addition_vect: cublasCreate failed in thread %d (status %d)\n", i, (int)status);
  }

  // The addition itself does not depend on the handle, so it is performed
  // whether or not the handle could be created.
  if (i < N)
  {
    C[i] = A[i] + B[i];
  }

  if (status == CUBLAS_STATUS_SUCCESS)
  {
    cublasDestroy(cnpHandle);
  }
}

/*
 * Debug kernel: each thread prints at most one entry of arr.
 *
 * The thread's x launch index gives the first index and its y launch index
 * the second; the entry printed is arr[second + first * M]. Threads whose
 * first index is >= N or whose second index is >= M print nothing.
 */
__global__ void print_array_dev(float *arr, int N, int M)
{
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  const int col = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside the N x M index range have nothing to print.
  if (row >= N || col >= M)
  {
    return;
  }

  printf("arr[%d, %d] = %f\n", row, col, arr[col + row * M]);
}

/*
 * Host wrapper around the addition_vect kernel, callable from Fortran.
 *
 * Copies the host arrays A and B to the device, launches addition_vect to
 * compute C = A + B element-wise, copies the result back into the host
 * array C, and frees the device buffers.
 *
 * The function has C linkage and a trailing underscore in its name, and it
 * receives the length through a pointer; this is what lets the Fortran
 * statement `call addition_vect_cuda(A, B, C, N)` resolve to it.
 *
 * Parameters:
 *   A, B  host input arrays of *Np ints
 *   C     host output array of *Np ints
 *   Np    pointer to the number of elements
 *
 * A non-positive length is treated as "nothing to do". Any CUDA error is
 * reported through checkCudaErrors.
 */
extern "C" void addition_vect_cuda_(int *A, int *B, int *C, int *Np)
{
  const int N = *Np;

  // An empty vector needs no work, and a zero-thread launch would be an
  // invalid launch configuration.
  if (N <= 0)
  {
    return;
  }

  const size_t bytes = sizeof(int) * (size_t)N;

  // A single block cannot hold arbitrarily many threads, so cap the block
  // size and spread the elements over as many blocks as needed. Small N
  // still launches exactly N threads in one block.
  const int maxThreadsPerBlock = 256;
  const int threadsPerBlock = (N < maxThreadsPerBlock) ? N : maxThreadsPerBlock;
  const int numBlocks = (N - 1) / threadsPerBlock + 1;

  // device variables
  int *A_d, *B_d, *C_d;

  checkCudaErrors(cudaMalloc( (void **)&A_d, bytes));
  checkCudaErrors(cudaMalloc( (void **)&B_d, bytes));
  checkCudaErrors(cudaMalloc( (void **)&C_d, bytes));

  checkCudaErrors(cudaMemcpy( A_d, A, bytes, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy( B_d, B, bytes, cudaMemcpyHostToDevice));

  addition_vect<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, N);
  // A kernel launch returns no status itself; launch failures only show up
  // through cudaGetLastError.
  checkCudaErrors(cudaGetLastError());

  // This blocking copy also waits for the kernel to finish.
  checkCudaErrors(cudaMemcpy( C, C_d, bytes, cudaMemcpyDeviceToHost));

  checkCudaErrors(cudaFree(A_d));
  checkCudaErrors(cudaFree(B_d));
  checkCudaErrors(cudaFree(C_d));

}

The main fortran program

! Driver program: adds two small integer vectors on the host and through the
! external routine addition_vect_cuda, and prints both results.
program test

  implicit none
  ! A and B are the inputs; C receives the result of the external routine.
  integer, allocatable :: A(:), B(:), C(:)
  integer :: N
  integer :: i

  N = 5

  allocate(A(N), B(N), C(N))

  ! A(i) = i and B(i) = 2*i for i = 1..N; C is cleared before the call.
  A = [(i, i = 1, N)]
  B = [(2*i, i = 1, N)]
  C = 0

  ! Reference sum computed on the host.
  print*, 'fort add: ', A+B

  ! Same sum computed by the external routine, which stores it in C.
  call addition_vect_cuda(A, B, C, N)

  print*, 'cuda add: ', C

  deallocate(A, B, C)

end program test

and the Makefile,

CC=nvcc
FC=gfortran
ARCH = -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60
FLAGS = -g -G -lineinfo -m64 -dc $(ARCH)
FCFLAGS=-g
PROGRAM=test
OBJECTS=func.o test.o
CUDA=/usr/local/cuda
CUDA_HELPER=$(CUDA)/samples/common/inc
LIB=-lcuda -lcudart -lcublas -lcublas_device -lcudadevrt 

all:$(PROGRAM)

$(PROGRAM): $(OBJECTS)
		$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) $(FCFLAGS) -o $@ $^ -L$(CUDA)/lib64 $(LIB) -lstdc++
# NOTE: the rule above links func.o and test.o into $(PROGRAM) with gfortran.
# func.o is compiled with -dc (see FLAGS), and gfortran does not perform the
# device-side link that such objects need; this link step is the one that
# fails with the undefined __cudaRegisterLinkedBinary_* reference.

# Fortran sources -> objects, compiled with gfortran (-c: compile only).
%.o: %.f90 
		$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) -L$(CUDA)/lib64 $(LIB) -lstdc++ $(FCFLAGS) -c $<
# CUDA sources -> objects, compiled with nvcc using FLAGS (includes -dc and
# the -gencode list in ARCH).
%.o: %.cu
		$(CC) -I$(CUDA_HELPER) $(LIB) $(FLAGS) -c $<


# Utility targets
# 'veryclean' is declared phony here, but no such target is defined in this Makefile.
.PHONY: clean veryclean

# Remove object files, Fortran module files and the executable.
clean:
	rm *.o *.mod $(PROGRAM)

If you remove the calls to cublas_v2 and the -dc option from FLAGS, then the program builds and runs fine. In fact, it seems that it is the -dc option that causes the error.

test_fortran $ make
nvcc -I/usr/local/cuda/samples/common/inc -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -g -G -lineinfo -m64 -dc -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60 -c func.cu
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -lstdc++ -g -c test.f90
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -g -o test func.o test.o -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt  -lstdc++
func.o: In function `__sti____cudaRegisterAll_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5()':
/tmp/tmpxft_00001b98_00000000-4_func.compute_60.cudafe1.stub.c:2: undefined reference to `__cudaRegisterLinkedBinary_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5'
collect2: error: ld returned 1 exit status
Makefile:15: recipe for target 'test' failed
make: *** [test] Error 1

If a C program is used to call the same function, the build goes fine.
Does anyone know what is wrong here?

Thanks for your help,
Marc Barbry

I haven’t looked into your problem closely, but have you tried using nvcc to do the linking steps where CUDA code is involved?

When compiling the CUDA code with nvcc, I use the -dc option (relocatable device code), since it seems to be necessary in order to call cublas_v2 from device code.
But this then causes the error when linking the CUDA and Fortran objects together with gfortran.

gfortran doesn’t know about device-side linking, so I’m not surprised it fails. On the other hand, gfortran-produced code can easily be linked by gcc without special Fortran knowledge. So I’d suggest using nvcc for the final linking stage.

Doing the final link stage with nvcc solves the problem; thank you for your help.

Here is the corrected makefile,

# Builds $(PROGRAM) from a Fortran main program (test.f90, compiled with
# gfortran) and CUDA code (func.cu, compiled with nvcc).
#
# func.cu is compiled with -dc (relocatable device code), so the final link
# is done by nvcc, which also performs the device-side link. Because nvcc
# drives the link, the gfortran runtime is added explicitly with -lgfortran.
CC=nvcc
FC=gfortran
ARCH = -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60
FLAGS = -g -G -lineinfo -m64 -dc $(ARCH)
FCFLAGS=-g
PROGRAM=test
OBJECTS=func.o test.o
CUDA=/usr/local/cuda
CUDA_HELPER=$(CUDA)/samples/common/inc
LIB=-lcuda -lcudart -lcublas -lcublas_device -lcudadevrt

all: $(PROGRAM)

# Final link through nvcc. ARCH is repeated here so that the device link
# targets the same architectures the objects were compiled for. FCFLAGS is
# deliberately not passed: those are gfortran options, not nvcc options.
$(PROGRAM): $(OBJECTS)
	$(CC) -g $(ARCH) -o $@ $^ -L$(CUDA)/lib64 $(LIB) -lstdc++ -lgfortran

# Fortran sources: compile only, so no linker or CUDA flags are needed.
%.o: %.f90
	$(FC) $(FCFLAGS) -c $<

# CUDA sources: compile only; libraries are supplied at the link step.
%.o: %.cu
	$(CC) -I$(CUDA_HELPER) $(FLAGS) -c $<


# Utility targets
.PHONY: all clean

# -f keeps 'clean' from failing when a file is absent; in particular no
# *.mod file is produced because test.f90 defines no module.
clean:
	rm -f *.o *.mod $(PROGRAM)

When linking Fortran code with nvcc, one needs to link against the gfortran runtime library (-lgfortran).