How to add .cu to ROS package? How to call function in .cu?

Could you please help me with the .cpp and .cu in ROS? I wish to call the function in .cu file. I use some functions in https://answers.ros.org/question/162646/how-to-catkin_make-cuda-code/

However there must be something wrong with the CUDA it seems couldn’t link the cuda. The code is :

CMakeLists
cmake_minimum_required(VERSION 2.8.3)
project(pointcloud_from_camera)
add_compile_options(-std=c++11)
find_package(catkin REQUIRED COMPONENTS 
  roscpp
  rospy
  std_msgs
  sensor_msgs
  cv_bridge
  image_transport
  pcl_conversions
  pcl_ros
)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})

 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -DMY_DEF=1")
 set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMY_DEF=1" )
 set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMY_DEF=1" )

 # only necessary if you wish to create a lib
   set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
   set(BUILD_SHARED_LIBS ON)

 # cuda compile GPU_CODE for executable
     cuda_compile(  TRANSFORM_O src/transform.cu )

     cuda_add_library( ${TRANS} src/lib_transform.cu )

catkin_package(
    CATKIN_DEPENDS
    roscpp
)
include_directories(
include
  ${catkin_INCLUDE_DIRS}
  ${PCL_INCLUDE_DIRS}
)
add_executable (sensordata src/sensordata.cpp)

target_include_directories(sensordata
    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
    PUBLIC ${CUDA_INCLUDE_DIRS}
)

target_link_libraries(sensordata
    LINK_PUBLIC ${catkin_LIBRARIES}
LINK_PUBLIC ${CUDA_LIBRARIES}
${CUDA_CUDA_LIBRARY} ${CUDA_CUDART_LIBRARY} ${CUDA_NPP_LIBRARIES_ZED}
${TRANS}
)

The .cu https://stackoverflow.com/questions/14054153/how-can-multiply-vector-by-a-matrix-using-cuda-c

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <curand_kernel.h>

__global__ void kernel(float *vec, float *mat, float *out, const int N, const int M){
    int tid=threadIdx.x+blockIdx.x*blockDim.x;
        float sum=0;
        if(tid<M){
        for(int i=0; i<N; i++)
            sum += vec[i]*mat[(i*M)+tid];
        out[tid]=sum;
    }
}
extern "C"  int testmain (void) {
        srand( time(NULL) );

float *a, *b, *c;
    float *dev_a, *dev_b, *dev_c

cudaMemcpy(dev_a, a, sizeof(float)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);
    kernel<<<M/256+1, 256>>>(dev_a, dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);
};

The .cpp

#include "transform.cu"
#include <cuda.h>

extern "C" int testmain();

void callbackfunction(){
testmain();
}

int main (int argc, char** argv)
  ros::init (argc, argv, "get_pointcloud_into_gpuvoxel");
  ros::NodeHandle nh;
}

It seems the function in .cpp is wrong.

The error is:

[ 20%] Linking CXX shared library /home/Downloads/map/devel/lib/libsrc/lib_transform.cu.so
/usr/bin/ld: cannot open output file /home/sDownloads/map/devel/lib/libsrc/lib_transform.cu.so: No such file or directory
collect2: error: ld returned 1 exit status
pointcloud_from_camera/CMakeFiles/src/lib_transform.cu.dir/build.make:70: recipe for target '/home/shupeng/Downloads/lyx/probability_map/devel/lib/libsrc/lib_transform.cu.so' failed
make[2]: *** [/home/shupeng/Downloads/lyx/probability_map/devel/lib/libsrc/lib_transform.cu.so] Error 1
CMakeFiles/Makefile2:3043: recipe for target 'pointcloud_from_camera/CMakeFiles/src/lib_transform.cu.dir/all' failed
make[1]: *** [pointcloud_from_camera/CMakeFiles/src/lib_transform.cu.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....

Checkout this pull request from the jets on hacks author he uses cu files and you can look at the changes he made to the CMakeLists.txt file
https://github.com/dusty-nv/ros_deep_learning/pull/8