Hello,
I’ve been stuck for two days on a simple test on CMAKE to build a “.cu” & link it to “.cpp”
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
SET(CMAKE_CXX_STANDARD 11)
SET(CMAKE_THREAD_PREFER_PTHREAD TRUE)
SET(THREADS_PREFER_PTHREAD_FLAG TRUE)
PROJECT(TEST_CUDA)
FIND_PACKAGE(OpenCV REQUIRED)
FIND_PACKAGE(Threads REQUIRED)
# CUDA PACKAGE
FIND_PACKAGE(CUDA REQUIRED)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
SET(LIBRARIES ${CUDA_LIBRARIES})
SET(CUDA_SEPARABLE_COMPILATION ON)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
SET(CUDA_HOST_COMPILER g++)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler")
FILE(GLOB CUDA_FILES "." *.cu)
LIST(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_53,code=sm_53; -std=c++11")
CUDA_ADD_EXECUTABLE(CU_O ${CUDA_FILES})
TARGET_LINK_LIBRARIES(CU_O ${CUDA_LIBRARIES})
ADD_EXECUTABLE(TEST_GPU main.cpp ${CU_O})
TARGET_LINK_LIBRARIES(TEST_GPU ${OpenCV_LIBS})
TARGET_LINK_LIBRARIES(TEST_GPU ${CMAKE_THREAD_LIBS_INIT})
TARGET_LINK_LIBRARIES(TEST_GPU ${CUDA_LIBRARIES})
Cmake output:
-- Configuring done
-- Generating done
-- Build files have been written to: /home/ubuntu/Desktop/TEST_GPU/build
[ 14%] Linking CXX executable TEST_GPU
CMakeFiles/TEST_GPU.dir/main.cpp.o: In function `Gpu_Dot_(cv::cuda::GpuMat, cv::cuda::GpuMat, float&)':
main.cpp:(.text+0x336c): undefined reference to `Vector_Dot_Product_(float const*, float const*, float*)'
CMakeFiles/TEST_GPU.dir/build.make:200: recipe for target 'TEST_GPU' failed
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/TEST_GPU.dir/all' failed
Makefile:83: recipe for target 'all' failed
collect2: error: ld returned 1 exit status
make[2]: *** [TEST_GPU] Error 1
make[1]: *** [CMakeFiles/TEST_GPU.dir/all] Error 2
make: *** [all] Error 2
10:32:52: The process "/usr/bin/make" exited with code 2.
Error while building/deploying project TEST_GPU(kit: TX1)
When executing step "Make"
(note that CMAKE output has been edited to remove personal information only)
with: *.cu file being:
#include "dot.h"
using namespace std ;
#define min(x,y) (x>y?x:y)
#define N 33*1024
#define ThreadPerBlock 256
//smallest multiple of threadsPerBlock that is greater than or equal to N
#define blockPerGrid min(32 , (N+ThreadPerBlock-1) / ThreadPerBlock )
__global__
void Vector_Dot_Product_ ( const float *V1 , const float *V2 , float *V3 )
{
__shared__ float chache[ThreadPerBlock] ;
float temp = 0 ;
int tid = blockDim.x * blockIdx.x + threadIdx.x ;
int chacheindex = threadIdx.x ;
while ( tid < N )
{
temp += V1[tid] * V2[tid] ;
tid += blockDim.x * gridDim.x ;
}
chache[chacheindex] = temp ;
__syncthreads () ;
int i = blockDim.x / 2 ;
while ( i!=0 )
{
if ( chacheindex < i )
chache[chacheindex] += chache [chacheindex + i] ;
__syncthreads();
i/=2 ;
}
if ( chacheindex == 0 )
V3[blockIdx.x] = chache [0] ;
}
dot.h:
#ifndef DOT_H
#define DOT_H
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
__global__ void Vector_Dot_Product_ ( const float *V1 , const float *V2 , float *V3 );
#endif
and.cpp file define:
void Gpu_Dot_(cv::cuda::GpuMat A, cv::cuda::GpuMat B, float &retval)
{
#if DEBUG
CV_Assert(A.type() == B.type() && A.type() == CV_32F);
CV_Assert(A.rows == B.rows && A.cols == B.cols);
#endif
float retval = 0;
for(int i = 0; i < A.rows; i++)
{
float res = 0;
/* __device__ */ Vector_Dot_Product_(A.ptr<float>(i), B.ptr<float>(i), &res);
retval += res;
}
}
The undefined error is definitely a linker error. But I can’t get it right.
Any help appreciated.