I have created a custom cuda kernel to use on a GpuMat.
__global__ void custom_kernel(unsigned char* input, unsigned char* output, int width, int height, int colorWidthStep, int outputWidthStep) {
const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
if ((xIndex < width) && (yIndex < height)) {
/*Something to do*/
}
}
There are different errors depending on the order I put lines in my CMakeLists.txt
First error is:
nvlink error : Multiple definition of '_Z18custom_kernelPhS_iiii' in 'CMakeFiles/cuda_compile_1.dir/cu/cuda_compile_1_generated_kernel.cu.o', first defined in 'CMakeFiles/cuda_execs.dir/cu/cuda_execs_generated_customkernel.cu.o'
nvlink fatal : merge_elf failed
CMakeFiles/cuda_execs.dir/build.make:727: recipe for target 'CMakeFiles/cuda_execs.dir/cmake_device_link.o' failed
make[2]: *** [CMakeFiles/cuda_execs.dir/cmake_device_link.o] Error 1
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/cuda_execs.dir/all' failed
make[1]: *** [CMakeFiles/cuda_execs.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2
when the order is as following (I put the whole cmakelists at the end)
CUDA_COMPILE(cuda_objs ${CUDA_FILES} )
cuda_add_executable(cuda_execs ${CUDA_FILES} ${cuda_objs})
add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14)
the second error is
CMakeFiles/ZED_Object_detection_image_viewer.dir/src/main.o: In function `main':
main.cpp:(.text+0x758): undefined reference to `convert_image'
collect2: error: ld returned 1 exit status
CMakeFiles/ZED_Object_detection_image_viewer.dir/build.make:191: recipe for target 'ZED_Object_detection_image_viewer' failed
make[2]: *** [ZED_Object_detection_image_viewer] Error 1
CMakeFiles/Makefile2:67: recipe for target 'CMakeFiles/ZED_Object_detection_image_viewer.dir/all' failed
make[1]: *** [CMakeFiles/ZED_Object_detection_image_viewer.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2
which is the function that is used in the main.cpp that calls the kernel.
Now I thing i read several times, that the .cu files should be compiled first, so this might be the case in this scenario, as the order here is reversed
add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14)
CUDA_COMPILE(cuda_objs ${CUDA_FILES} )
cuda_add_executable(cuda_execs ${CUDA_FILES} ${cuda_objs})
To summarize and maybe further explain the goal: I am trying to perform an operation on a whole image that uses the bgr-channels to create a gray-scale-image. This action should ideally be calculated on the GPU, as it should be faster than the CPU - I hope.
Unfortunately I have to use the nvcc-compiler for the .cu files and a standard g++ compiler for the other files included, therefore I have to link them together after compiling them separately. Most examples are using a main.cu which does not reflect my setup currently.
I am using the libraries for the Zed2-Camera, therefor there are some more things to compile, that’s why I dont compile everything with the nvcc.
Currently there is a kernel.h included in the main.cpp and in the kernel.cu which looks like this
#if !defined (kernel_h)
#define kernel_h
#include <opencv2/core/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/highgui/highgui.hpp>
extern "C" void convert_image(cv::cuda::GpuMat& input, cv::cuda::GpuMat& output);
#endif
The whole cmakelists.txt
cmake_minimum_required(VERSION 2.4)
PROJECT(ZED_Object_detection_image_viewer LANGUAGES CXX CUDA)
list(APPEND SAMPLE_LIST ${PROJECT_NAME})
option(LINK_SHARED_ZED "Link with the ZED SDK shared executable" ON)
if (NOT LINK_SHARED_ZED AND MSVC)
message(FATAL_ERROR "LINK_SHARED_ZED OFF : ZED SDK static libraries not available on Windows")
endif()
if(COMMAND cmake_policy)
cmake_policy(SET CMP0003 OLD)
cmake_policy(SET CMP0004 OLD)
cmake_policy(SET CMP0015 OLD)
endif(COMMAND cmake_policy)
SET(EXECUTABLE_OUTPUT_PATH ".")
SET(SPECIAL_OS_LIBS "")
set(CUDA_HOST_COMPILER "c++")
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
find_package(ZED 3 REQUIRED)
find_package(OpenCV REQUIRED)
find_package(GLUT REQUIRED)
find_package(GLEW REQUIRED)
find_package(OpenGL REQUIRED)
find_package(CUDA REQUIRED)
IF(NOT WIN32)
SET(SPECIAL_OS_LIBS "pthread" "X11")
add_definitions(-Wno-write-strings)
ENDIF()
include_directories(${ZED_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${GLEW_INCLUDE_DIRS})
include_directories(${GLUT_INCLUDE_DIR})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
link_directories(${ZED_LIBRARY_DIR})
link_directories(${OpenCV_LIBRARY_DIRS})
link_directories(${GLEW_LIBRARY_DIRS})
link_directories(${GLUT_LIBRARY_DIRS})
link_directories(${OpenGL_LIBRARY_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
FILE(GLOB_RECURSE CUDA_FILES cu/*.cu)
FILE(GLOB_RECURSE SRC_FILES src/*.c*)
FILE(GLOB_RECURSE HDR_FILES include/*.h*)
message("SRC files: ${SRC_FILES}")
message("HDR files: ${HDR_FILES}")
message("Cuda files: ${CUDA_FILES}")
set(CUDA_ARCH_BIN " 30 " CACHE STRING "Specify 'real' GPU arch to build binaries for, BIN(PTX) format is supported. Example: 1.3 2.1(1.3) or 13 21(13)")
set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX arch to build PTX intermediate code for. Example: 1.0 1.2 or 10 12")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;-std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false; -rdc=true")
add_executable(${PROJECT_NAME} ${HDR_FILES} ${SRC_FILES})
add_definitions(-std=c++14)
CUDA_COMPILE(cuda_objs ${CUDA_FILES} )
cuda_add_executable(cuda_execs ${CUDA_FILES} ${cuda_objs})
if (LINK_SHARED_ZED)
SET(ZED_LIBS ${ZED_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUDART_LIBRARY} ${CUDA_DEP_LIBRARIES_ZED})
else()
SET(ZED_LIBS ${ZED_STATIC_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_LIBRARY})
endif()
target_link_libraries(${PROJECT_NAME}
${SPECIAL_OS_LIBS}
${ZED_LIBS}
${OpenCV_LIBRARIES}
${OPENGL_LIBRARIES}
${GLUT_LIBRARY}
${GLEW_LIBRARIES})
and at last the whole kernel.cu
#include <opencv2/core/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <cuda_runtime.h>
#include <iostream>
#include "../include/kernel.h"
/**
* @brief CUDA safe call.
*
* @param[in] err The error
* @param[in] msg The message
* @param[in] file_name The file name
* @param[in] line_number The line number
*/
/// Safe call macro.
#define SAFE_CALL(call,msg) _safe_cuda_call((call),(msg),__FILE__,__LINE__)
#ifndef __KERNEL__
#define __KERNEL__
static inline void _safe_cuda_call(cudaError err, const char* msg, const char* file_name, const int line_number) {
if(err!=cudaSuccess) {
fprintf(stderr,"%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n",msg,file_name,line_number,cudaGetErrorString(err));
std::cin.get();
exit(EXIT_FAILURE);
}
}
__global__ void custom_kernel(unsigned char* input, unsigned char* output, int width, int height, int colorWidthStep, int outputWidthStep) {
const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
if ((xIndex < width) && (yIndex < height)) {
/* Something */
}
}
#endif
#ifndef __INTERFACE__
#define __INTERFACE__
extern "C" void convert_image(cv::cuda::GpuMat& input, cv::cuda::GpuMat& output) {
// Calculate total number of bytes of input and output image
const int colorBytes = input.step * input.rows;
const int outputBytes = output.step * output.rows;
unsigned char *d_input, *d_output;
// Allocate device memory
SAFE_CALL(cudaMalloc<unsigned char>(&d_input,colorBytes),"CUDA Malloc Failed");
SAFE_CALL(cudaMalloc<unsigned char>(&d_output,outputBytes ),"CUDA Malloc Failed");
// Copy data from OpenCV input image to device memory
SAFE_CALL(cudaMemcpy(d_input,input.ptr(),colorBytes,cudaMemcpyHostToDevice),"CUDA Memcpy Host To Device Failed");
// Specify a reasonable block size
const dim3 block(16,16);
// Calculate grid size to cover the whole image
const dim3 grid((input.cols + block.x - 1)/block.x, (input.rows + block.y - 1)/block.y);
// Launch the color conversion kernel
custom_kernel<<<grid,block>>>(d_input,d_output,input.cols,input.rows,input.step,output.step);
// Synchronize to check for any kernel launch errors
SAFE_CALL(cudaDeviceSynchronize(),"Kernel Launch Failed");
// Copy back data from destination device meory to OpenCV output image
SAFE_CALL(cudaMemcpy(output.ptr(),d_output,outputBytes ,cudaMemcpyDeviceToHost),"CUDA Memcpy Host To Device Failed");
// Free the device memory
SAFE_CALL(cudaFree(d_input),"CUDA Free Failed");
SAFE_CALL(cudaFree(d_output),"CUDA Free Failed");
}
#endif