CUDA8.0 TX1 CMake file

Hello,

I’ve been stuck for two days on a simple CMake test that builds a “.cu” file and links it with a “.cpp” file:

# Build script for the TEST_CUDA project: locates OpenCV, Threads and CUDA,
# then defines two executables, CU_O (from the globbed .cu files) and
# TEST_GPU (from main.cpp).
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)

# Project-wide C++11 and a preference for pthreads; these are set before
# FIND_PACKAGE(Threads) so that the Threads lookup can see them.
SET(CMAKE_CXX_STANDARD 11)
SET(CMAKE_THREAD_PREFER_PTHREAD TRUE)
SET(THREADS_PREFER_PTHREAD_FLAG TRUE)

PROJECT(TEST_CUDA)
FIND_PACKAGE(OpenCV REQUIRED)
FIND_PACKAGE(Threads REQUIRED)

# CUDA PACKAGE
# Uses the legacy FindCUDA module (CUDA_* variables and CUDA_ADD_EXECUTABLE).
FIND_PACKAGE(CUDA REQUIRED)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
# LIBRARIES is not referenced again in this listing.
SET(LIBRARIES ${CUDA_LIBRARIES})
SET(CUDA_SEPARABLE_COMPILATION ON)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
SET(CUDA_HOST_COMPILER g++)
# NOTE(review): -Xcompiler is appended with no option after it, so whatever
# flag comes next on the nvcc command line would presumably be consumed as its
# argument -- confirm this is intended.
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler")

# NOTE(review): "." is passed as a second globbing expression here, not as a
# base directory -- confirm; *.cu alone already matches the .cu files in the
# current source directory.
FILE(GLOB CUDA_FILES "." *.cu)
# NOTE(review): this is one quoted string, so the list gains the element
# "-gencode arch=compute_53,code=sm_53" (with an embedded space) and the
# element " -std=c++11" (with a leading space) -- verify how nvcc receives them.
LIST(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_53,code=sm_53; -std=c++11")
# Creates an executable *target* named CU_O from the .cu files.
CUDA_ADD_EXECUTABLE(CU_O ${CUDA_FILES})
TARGET_LINK_LIBRARIES(CU_O ${CUDA_LIBRARIES})

# NOTE(review): ${CU_O} dereferences a *variable* named CU_O, and no such
# variable is set anywhere in this listing (CU_O above is only a target name).
# It therefore presumably expands to nothing, leaving TEST_GPU built from
# main.cpp alone with no object code from the .cu files.
ADD_EXECUTABLE(TEST_GPU main.cpp ${CU_O})
TARGET_LINK_LIBRARIES(TEST_GPU ${OpenCV_LIBS})
TARGET_LINK_LIBRARIES(TEST_GPU ${CMAKE_THREAD_LIBS_INIT})
TARGET_LINK_LIBRARIES(TEST_GPU ${CUDA_LIBRARIES})

Cmake output:

-- Configuring done
-- Generating done
-- Build files have been written to: /home/ubuntu/Desktop/TEST_GPU/build
[ 14%] Linking CXX executable TEST_GPU
CMakeFiles/TEST_GPU.dir/main.cpp.o: In function `Gpu_Dot_(cv::cuda::GpuMat, cv::cuda::GpuMat, float&)':
main.cpp:(.text+0x336c): undefined reference to `Vector_Dot_Product_(float const*, float const*, float*)'
CMakeFiles/TEST_GPU.dir/build.make:200: recipe for target 'TEST_GPU' failed
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/TEST_GPU.dir/all' failed
Makefile:83: recipe for target 'all' failed
collect2: error: ld returned 1 exit status
make[2]: *** [TEST_GPU] Error 1
make[1]: *** [CMakeFiles/TEST_GPU.dir/all] Error 2
make: *** [all] Error 2
10:32:52: The process "/usr/bin/make" exited with code 2.
Error while building/deploying project TEST_GPU(kit: TX1)
When executing step "Make"

(note that CMAKE output has been edited to remove personal information only)

with the *.cu file being:

#include "dot.h"
using namespace std ;

// Smaller of x and y. Arguments and result are fully parenthesized so the
// macro expands safely inside larger expressions; note that each argument may
// be evaluated twice.
#define min(x,y) ((x) < (y) ? (x) : (y))
// Number of elements processed by the kernel; parenthesized so that
// expressions such as a / N or a % N group as expected.
#define N (33*1024)

// Threads per block; also the length of the kernel's shared buffer.
#define ThreadPerBlock 256

// Blocks per grid: the number of ThreadPerBlock-sized blocks needed to cover
// N elements (rounded up), capped at 32.
#define blockPerGrid min(32 , (N+ThreadPerBlock-1) / ThreadPerBlock )

// Computes per-block partial dot products of V1 and V2 over the first N
// elements. Each thread accumulates products at indices spaced by the total
// number of threads in the grid, the block's per-thread sums are combined in
// shared memory, and thread 0 of each block writes the block's sum to
// V3[blockIdx.x].
// The halving loop only folds in every per-thread sum when blockDim.x is a
// power of two, and the shared buffer holds ThreadPerBlock entries, so
// blockDim.x must not exceed ThreadPerBlock.
__global__
void Vector_Dot_Product_ ( const float *V1 , const float *V2 , float *V3   )
{
    __shared__ float partial[ThreadPerBlock] ;

    const int lane = threadIdx.x ;

    // Per-thread accumulation over this thread's share of the input.
    float acc = 0 ;
    for ( int idx = blockDim.x * blockIdx.x + threadIdx.x ; idx < N ; idx += blockDim.x * gridDim.x )
    {
        acc += V1[idx] * V2[idx] ;
    }

    partial[lane] = acc ;

    // Every thread's sum must be in shared memory before the reduction starts.
    __syncthreads () ;

    // Pairwise reduction: each pass adds the upper half of the active range
    // onto the lower half, then halves the range.
    for ( int half = blockDim.x / 2 ; half != 0 ; half /= 2 )
    {
        if ( lane < half )
            partial[lane] += partial[lane + half] ;

        __syncthreads () ;
    }

    // One result per block.
    if ( lane == 0 )
        V3[blockIdx.x] = partial[0] ;

}

dot.h:

// dot.h -- declaration of the dot-product CUDA kernel.
#ifndef DOT_H
#define DOT_H

#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>

// Kernel prototype: takes two read-only float arrays (V1, V2) and an output
// float array (V3).
// NOTE(review): a __global__ function is normally started with the
// <<<grid, block>>> launch syntax from code compiled by nvcc -- confirm how
// callers that include this header invoke it.
__global__ void Vector_Dot_Product_ ( const float *V1 , const float *V2 , float *V3   );

#endif // DOT_H

and the .cpp file defines:

// Sums the per-row results of Vector_Dot_Product_ over the rows of A and B
// and returns the total through retval.
//   A, B   : matrices to combine; when DEBUG is set they are asserted to be
//            CV_32F and to have identical dimensions.
//   retval : output; reset to 0 and then accumulated row by row.
void Gpu_Dot_(cv::cuda::GpuMat A, cv::cuda::GpuMat B, float &retval)
{

#if DEBUG
    CV_Assert(A.type() == B.type() && A.type() == CV_32F);
    CV_Assert(A.rows == B.rows && A.cols == B.cols);
#endif

    // Write through the reference parameter. Declaring a new local
    // `float retval` here would redeclare the parameter, and the caller's
    // variable would never receive the result.
    retval = 0;
    for(int i = 0; i < A.rows; i++)
    {
        float res = 0;
        // NOTE(review): Vector_Dot_Product_ is declared __global__ in dot.h
        // but is invoked here like an ordinary function, with no launch
        // configuration and with &res pointing at a host stack variable --
        // this call is left exactly as posted and needs to be reworked.
        /* __device__ */ Vector_Dot_Product_(A.ptr<float>(i), B.ptr<float>(i), &res);
        retval += res;
    }

}

The undefined error is definitely a linker error. But I can’t get it right.

Any help appreciated.

It’s probably a good idea to ask TX1-specific questions in the TX1 sub-forum:

https://devtalk.nvidia.com/default/board/164/jetson-tx1/

It’s not so much related to the TX1 or the Jetson architecture; it’s a linker error, and thus a setup error in the CMake file.

The TX1 info is here just so it’s possible to check my arch config and other flag configs.

Regards,

You can’t call a kernel from a .cpp file:

/* __device__ */ Vector_Dot_Product_(A.ptr<float>(i), B.ptr<float>(i), &res);

You’ve obviously excerpted that file, so you have some sort of weirdness with your forward declaration for that function which you haven’t shown (and there is no way it could match the forward declaration in dot.h)

Furthermore, kernel calls must be configured for launch <<<…>>>

It appears you lack basic skills at writing a cuda program.

First study the cuda sample code vectorAdd.

Then learn how to get cmake working with vectorAdd.

Then go on to your own code.

@txbobb: I know, I was just trying to link for now.

I was able to fix the CMakeLists.txt with:

# Builds the preProcess executable from the C++ sources and the CUDA source
# dot.cu in a single target, linked against OpenCV.
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
# CMAKE_CXX_STANDARD is only honored by CMake 3.1 and newer.
set(CMAKE_CXX_STANDARD 11)


project( preProcess )
find_package( OpenCV REQUIRED )
find_package(CUDA REQUIRED)
# Generate device code for compute capability 5.3 (the arch used for the TX1
# in this thread).
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_53,code=sm_53)

# cuda_add_executable compiles dot.cu with nvcc and the .cpp files with the
# host compiler, then links everything into one executable, so the kernel's
# object code is part of the same link step as main.cpp. The CUDA source is
# listed explicitly instead of being collected with file(GLOB).
cuda_add_executable( preProcess main.cpp ECC_class.cpp ECC.cpp dot.cu )
target_link_libraries( preProcess ${OpenCV_LIBS} )

The code does compile, but as I (and you) expected, it crashes at runtime.

Do you have any link to said examples? (I ordered a CUDA C book, but it’s only expected to arrive this weekend.)

The CUDA samples are installed when you install CUDA on your machine. Please read the install guide for your particular operating system; it will give basic instructions for compiling and running sample codes.

CUDA documentation is available:

http://docs.nvidia.com