Dear Cuda developers,
I’m trying to write some complex functions using cuda that use blas routines in the device code.
This cuda routines are then call from a fortran program.
Unfortunately, I don’t manage to compile my program, I reproduce a small test.
The cuda routines,
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <cublas_v2.h>
int addition(int a, int b)
{
int c;
c = a + b;
return c;
}
__global__ void addition_vect(int *A, int *B, int *C, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
cublasHandle_t cnpHandle;
cublasCreate(&cnpHandle);
if (i<N)
{
C[i] = A[i] + B[i];
}
cublasDestroy(cnpHandle);
}
__global__ void print_array_dev(float *arr, int N, int M)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i<N)
{
if (j<M)
{
printf("arr[%d, %d] = %f\n", i, j, arr[j+i*M]);
}
}
}
extern "C" void addition_vect_cuda_(int *A, int *B, int *C, int *Np)
{
int N = *Np;
// device variables
int *A_d, *B_d, *C_d;
checkCudaErrors(cudaMalloc( (void **)&A_d, sizeof(int) *N));
checkCudaErrors(cudaMalloc( (void **)&B_d, sizeof(int) *N));
checkCudaErrors(cudaMalloc( (void **)&C_d, sizeof(int) *N));
checkCudaErrors(cudaMemcpy( A_d, A, sizeof(int)*N, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy( B_d, B, sizeof(int)*N, cudaMemcpyHostToDevice));
addition_vect<<<1, N>>>(A_d, B_d, C_d, N);
checkCudaErrors(cudaMemcpy( C, C_d, sizeof(int)*N, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(A_d));
checkCudaErrors(cudaFree(B_d));
checkCudaErrors(cudaFree(C_d));
}
The main fortran program
program test
implicit none
integer, allocatable :: A(:), B(:), C(:)
integer :: N
integer :: i
N=5
allocate(A(N))
allocate(B(N))
allocate(C(N))
do i =1, N
A(i) = i
B(i) = 2*i
enddo
C = 0
print*, 'fort add: ', A+B
call addition_vect_cuda(A, B, C, N)
print*, 'cuda add: ', C
deallocate(A)
deallocate(B)
deallocate(C)
end program
and the Makefile,
CC=nvcc
FC=gfortran
ARCH = -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60
FLAGS = -g -G -lineinfo -m64 -dc $(ARCH)
FCFLAGS=-g
PROGRAM=test
OBJECTS=func.o test.o
CUDA=/usr/local/cuda
CUDA_HELPER=$(CUDA)/samples/common/inc
LIB=-lcuda -lcudart -lcublas -lcublas_device -lcudadevrt
all:$(PROGRAM)
$(PROGRAM): $(OBJECTS)
$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) $(FCFLAGS) -o $@ $^ -L$(CUDA)/lib64 $(LIB) -lstdc++
%.o: %.f90
$(FC) -I$(CUDA)/include -I$(CUDA_HELPER) -L$(CUDA)/lib64 $(LIB) -lstdc++ $(FCFLAGS) -c $<
%.o: %.cu
$(CC) -I$(CUDA_HELPER) $(LIB) $(FLAGS) -c $<
# Utility targets
.PHONY: clean veryclean
clean:
rm *.o *.mod $(PROGRAM)
If you remove the calls to the cublas_v2, and the -dc option from the FLAGS, then the routine is running fine. In fact it seems that it is the -dc option that causes the error.
test_fortran $ make
nvcc -I/usr/local/cuda/samples/common/inc -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt -g -G -lineinfo -m64 -dc -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_60,code=compute_60 -c func.cu
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt -lstdc++ -g -c test.f90
gfortran -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -g -o test func.o test.o -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt -lstdc++
func.o: In function `__sti____cudaRegisterAll_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5()':
/tmp/tmpxft_00001b98_00000000-4_func.compute_60.cudafe1.stub.c:2: undefined reference to `__cudaRegisterLinkedBinary_51_tmpxft_00001b98_00000000_15_func_compute_60_cpp1_ii_b582f6e5'
collect2: error: ld returned 1 exit status
Makefile:15: recipe for target 'test' failed
make: *** [test] Error 1
If using a C program to call the same function, then the compilation goes fine.
Does someone knows what is wrong here?
Thank for your helps,
Marc Barbry