Using custom cuda-kernels within .cpp files using cmake

I have created a custom cuda kernel to use on a GpuMat.

__global__ void custom_kernel(unsigned char* input, unsigned char* output, int width, int height, int colorWidthStep, int outputWidthStep) {
	const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

	if ((xIndex < width) && (yIndex < height)) {
      /*Something to do*/
	}
}

There are different errors depending on the order I put lines in my CMakeLists.txt

First error is:

nvlink error   : Multiple definition of '_Z18custom_kernelPhS_iiii' in 'CMakeFiles/cuda_compile_1.dir/cu/cuda_compile_1_generated_kernel.cu.o', first defined in 'CMakeFiles/cuda_execs.dir/cu/cuda_execs_generated_customkernel.cu.o'
nvlink fatal   : merge_elf failed
CMakeFiles/cuda_execs.dir/build.make:727: recipe for target 'CMakeFiles/cuda_execs.dir/cmake_device_link.o' failed
make[2]: *** [CMakeFiles/cuda_execs.dir/cmake_device_link.o] Error 1
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/cuda_execs.dir/all' failed
make[1]: *** [CMakeFiles/cuda_execs.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2

when the order is as following (I put the whole cmakelists at the end)

CUDA_COMPILE(cuda_objs {CUDA_FILES} ) cuda_add_executable(cuda_execs {CUDA_FILES} {cuda_objs}) add_executable({PROJECT_NAME} {HDR_FILES} {SRC_FILES})
add_definitions(-std=c++14)

the second error is

CMakeFiles/ZED_Object_detection_image_viewer.dir/src/main.o: In function `main':
main.cpp:(.text+0x758): undefined reference to `convert_image'
collect2: error: ld returned 1 exit status
CMakeFiles/ZED_Object_detection_image_viewer.dir/build.make:191: recipe for target 'ZED_Object_detection_image_viewer' failed
make[2]: *** [ZED_Object_detection_image_viewer] Error 1
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/ZED_Object_detection_image_viewer.dir/all' failed
make[1]: *** [CMakeFiles/ZED_Object_detection_image_viewer.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2

which is the function that is used in the main.cpp that calls the kernel.
Now I thing i read several times, that the .cu files should be compiled first, so this might be the case in this scenario, as the order here is reversed

add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14) 
CUDA_COMPILE(cuda_objs ${CUDA_FILES} )
cuda_add_executable(cuda_execs ${CUDA_FILES} ${cuda_objs})

To summarize and maybe further explain the goal: I am trying to perform an operation on a whole image that uses the bgr-channels to create a gray-scale-image. This action should ideally be calculated on the GPU, as it should be faster than the CPU - I hope.
Unfortunately I have to use the nvcc-compiler for the .cu files and a standard g++ compiler for the other files included, therefore I have to link them together after compiling them separately. Most examples are using a main.cu which does not reflect my setup currently.

I am using the libraries for the Zed2-Camera, therefor there are some more things to compile, that’s why I dont compile everything with the nvcc.

Currently there is a kernel.h included in the main.cpp and in the kernel.cu which looks like this

#if !defined (kernel_h)
#define kernel_h
    #include <opencv2/core/core.hpp>
    #include <opencv2/core/cuda.hpp>
    #include <opencv2/highgui/highgui.hpp>
    extern "C" void convert_image(cv::cuda::GpuMat& input, cv::cuda::GpuMat& output);
#endif

The whole cmakelists.txt

cmake_minimum_required(VERSION 2.4)
PROJECT(ZED_Object_detection_image_viewer LANGUAGES CXX CUDA)
list(APPEND SAMPLE_LIST ${PROJECT_NAME})

option(LINK_SHARED_ZED "Link with the ZED SDK shared executable" ON)

if (NOT LINK_SHARED_ZED AND MSVC)
    message(FATAL_ERROR "LINK_SHARED_ZED OFF : ZED SDK static libraries not available on Windows")
endif()

if(COMMAND cmake_policy)
        cmake_policy(SET CMP0003 OLD)
        cmake_policy(SET CMP0004 OLD)
        cmake_policy(SET CMP0015 OLD)
endif(COMMAND cmake_policy)

SET(EXECUTABLE_OUTPUT_PATH ".")
SET(SPECIAL_OS_LIBS "")
set(CUDA_HOST_COMPILER "c++")
set(CUDA_PROPAGATE_HOST_FLAGS OFF)

find_package(ZED 3 REQUIRED)
find_package(OpenCV REQUIRED)
find_package(GLUT REQUIRED)
find_package(GLEW REQUIRED)
find_package(OpenGL REQUIRED)
find_package(CUDA REQUIRED)

IF(NOT WIN32)
    SET(SPECIAL_OS_LIBS "pthread" "X11")
    add_definitions(-Wno-write-strings)
ENDIF()

include_directories(${ZED_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${GLEW_INCLUDE_DIRS})
include_directories(${GLUT_INCLUDE_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)

link_directories(${ZED_LIBRARY_DIR})
link_directories(${OpenCV_LIBRARY_DIRS})
link_directories(${GLEW_LIBRARY_DIRS})
link_directories(${GLUT_LIBRARY_DIRS})
link_directories(${OpenGL_LIBRARY_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})

FILE(GLOB_RECURSE CUDA_FILES cu/*.cu)
FILE(GLOB_RECURSE SRC_FILES src/*.c*)
FILE(GLOB_RECURSE HDR_FILES include/*.h*)


message("SRC files: ${SRC_FILES}")
message("HDR files: ${HDR_FILES}")
message("Cuda files: ${CUDA_FILES}")

set(CUDA_ARCH_BIN " 30 " CACHE STRING "Specify 'real' GPU arch to build binaries for, BIN(PTX) format is supported. Example: 1.3 2.1(1.3) or 13 21(13)")
set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX arch to build PTX intermediate code for. Example: 1.0 1.2 or 10 12")    
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}  "-Xcompiler;-fPIC;-std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false; -rdc=true") 
      
add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14) 
CUDA_COMPILE(cuda_objs ${CUDA_FILES} )
cuda_add_executable(cuda_execs ${CUDA_FILES} ${cuda_objs})

if (LINK_SHARED_ZED)
    SET(ZED_LIBS ${ZED_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUDART_LIBRARY} ${CUDA_DEP_LIBRARIES_ZED})
else()
    SET(ZED_LIBS ${ZED_STATIC_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_LIBRARY})
endif()

target_link_libraries(${PROJECT_NAME}
                        ${SPECIAL_OS_LIBS}
                        ${ZED_LIBS}
                        ${OpenCV_LIBRARIES}
                        ${OPENGL_LIBRARIES}
                        ${GLUT_LIBRARY}
                        ${GLEW_LIBRARIES})

and at last the whole kernel.cu

#include <opencv2/core/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <cuda_runtime.h>
#include <iostream>
#include "../include/kernel.h"

/**
 * @brief      CUDA safe call.
 *
 * @param[in]  err          The error
 * @param[in]  msg          The message
 * @param[in]  file_name    The file name
 * @param[in]  line_number  The line number
 */

/// Safe call macro.
#define SAFE_CALL(call,msg) _safe_cuda_call((call),(msg),__FILE__,__LINE__)
#ifndef __KERNEL__
#define __KERNEL__
static inline void _safe_cuda_call(cudaError err, const char* msg, const char* file_name, const int line_number) {
	if(err!=cudaSuccess) {
		fprintf(stderr,"%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n",msg,file_name,line_number,cudaGetErrorString(err));
		std::cin.get();
		exit(EXIT_FAILURE);
	}
}


__global__ void custom_kernel(unsigned char* input, unsigned char* output, int width, int height, int colorWidthStep, int outputWidthStep) {
	const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

	if ((xIndex < width) && (yIndex < height)) {
		/* Something */
	}
}
#endif
#ifndef __INTERFACE__
#define __INTERFACE__
extern "C" void convert_image(cv::cuda::GpuMat& input, cv::cuda::GpuMat& output) {
	// Calculate total number of bytes of input and output image
	const int colorBytes = input.step * input.rows;
	const int outputBytes = output.step * output.rows;

	unsigned char *d_input, *d_output;

	// Allocate device memory
	SAFE_CALL(cudaMalloc<unsigned char>(&d_input,colorBytes),"CUDA Malloc Failed");
	SAFE_CALL(cudaMalloc<unsigned char>(&d_output,outputBytes ),"CUDA Malloc Failed");

	// Copy data from OpenCV input image to device memory
	SAFE_CALL(cudaMemcpy(d_input,input.ptr(),colorBytes,cudaMemcpyHostToDevice),"CUDA Memcpy Host To Device Failed");

	// Specify a reasonable block size
	const dim3 block(16,16);

	// Calculate grid size to cover the whole image
	const dim3 grid((input.cols + block.x - 1)/block.x, (input.rows + block.y - 1)/block.y);

	// Launch the color conversion kernel
	custom_kernel<<<grid,block>>>(d_input,d_output,input.cols,input.rows,input.step,output.step);

	// Synchronize to check for any kernel launch errors
	SAFE_CALL(cudaDeviceSynchronize(),"Kernel Launch Failed");

	// Copy back data from destination device meory to OpenCV output image
	SAFE_CALL(cudaMemcpy(output.ptr(),d_output,outputBytes ,cudaMemcpyDeviceToHost),"CUDA Memcpy Host To Device Failed");

	// Free the device memory
	SAFE_CALL(cudaFree(d_input),"CUDA Free Failed");
	SAFE_CALL(cudaFree(d_output),"CUDA Free Failed");
}

#endif
CUDA_COMPILE(cuda_objs ${CUDA_FILES})
cuda_add_executable(${PROJECT_NAME} ${CUDA_FILES})
add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14) 

This one brought me at least one step further. The code compiles and i can make it, but now there are almost a hundred “all CUDA-capable devices are busy” errors when using cuda-memcheck ./yourexecutable.

I have found a couple dozen threads on almost a dozen different sites. So I don’t know whether that’s an easy fix, but the CMakeLists error has been resolved.