hi iam trying to create a CMakeLists.txt for cpp file and cu file
my main file main.cpp
#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <vector_types.h>
#include "01_vector_addition.h"
// Check vector add result
void verify_result(std::vector<int> &a, std::vector<int> &b,
std::vector<int> &c)
{
int j = 0;
for (size_t i = 0; i < a.size(); i++) {
// assert(c[i] == a[i] + b[i])
std::cout<<"c "<<c[i]<<" i"<<i<<"\n";
if(c[i] != a[i] + b[i])
{
j++;
}
}
std::cout<<" "<<j<<"\n";
}
int main() {
// Array size of 2^16 (65536 elements)
const int N = 100;
const size_t bytes = sizeof(int) * N;
// Vectors for holding the host-side (CPU-side) data
std::vector<int> a;
a.reserve(N);
std::vector<int> b;
b.reserve(N);
std::vector<int> c;
c.reserve(N);
// Initialize random numbers in each array
for (int i = 0; i < N; i++) {
a.push_back(i);
b.push_back(i);
}
std::cout<<"a size"<<a.size();
start(N,bytes,a,b,c);
verify_result(a,b,c);
std::cout << "COMPLETED SUCCESSFULLY\n";
cudaDeviceReset();
// exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
return 0;
}
and my separate cu file
#include "01_vector_addition.h"
#include<stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <Common/helper_cuda.h>
#include <Common/helper_functions.h>
// #include "cuPrintf.cu"
// CUDA kernel for vector addition
// __global__ means this is called from the CPU, and runs on the GPU
__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,int *__restrict c, int N)
{
printf("Hello from GPU");
// Calculate global thread ID
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
printf("c value %d \n", tid);
// Boundary check
if (tid < N) {
c[tid] = a[tid] + b[tid];
printf("c value %d \n", c[tid]);
}
# if __CUDA_ARCH__>=200
#endif
__syncthreads();
}
void start(const int N , const size_t bytes,std::vector<int> &a,std::vector<int> &b,std::vector<int> &c)
{
std::cout<<"\n\n\n\nhello\n";
// Allocate memory on the device
int *d_a, *d_b, *d_c;
checkCudaErrors(cudaMalloc(&d_a, bytes));
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
// Copy data from the host to the device (CPU -> GPU)
checkCudaErrors(cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice));
// Threads per CTA (1024)
int NUM_THREADS = 1 << 10;
// CTAs per Grid
// We need to launch at LEAST as many threads as we have elements
// This equation pads an extra CTA to the grid if N cannot evenly be divided
// by NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024)
int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
// Launch the kernel on the GPU
// Kernel calls are asynchronous (the CPU program continues execution after
// call, but no necessarily before the kernel finishes)
vectorAdd <<< NUM_BLOCKS, NUM_THREADS >>> (d_a, d_b, d_c, N);
cudaDeviceSynchronize();
// Copy sum vector from device to host
// cudaMemcpy is a synchronous operation, and waits for the prior kernel
// launch to complete (both go to the default stream in this case).
// Therefore, this cudaMemcpy acts as both a memcpy and synchronization
// barrier.
// c.clear();
cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost);
// Check result for errors
// verify_result(a, b, c);
// Free memory on device
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
and my .h file for cu file
// #include <cuda_runtime_api.h>
// #include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <vector>
#include <cublas_v2.h>
void start(const int N ,const size_t bytes,std::vector<int> &a,std::vector<int> &b,std::vector<int> &c);
__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,int *__restrict c, int N);
how to create my Cmakelists
cmake_minimum_required(VERSION 2.8)
project( preProcess )
find_package( OpenCV REQUIRED )
find_package(CUDA REQUIRED)
include(FindCUDA)
include_directories(/usr/local/cuda/include)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61)
include_directories(${YAML_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR} ${CUDA_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS}/Common ${CUDA_CUBLAS_DIRS} src)
file( GLOB cu *.cu)
add_executable( preProcess src/main.cpp )
CUDA_ADD_EXECUTABLE(test ${preProcess} src/01_vector_addition.cu)
target_link_libraries(preProcess /usr/local/cuda/lib64/libcudart.so ${OpenCV_LIBS} )
I used this now I am getting error
25%] Building NVCC (Device) object CMakeFiles/test.dir/src/test_generated_01_vector_addition.cu.o
Scanning dependencies of target test
[ 50%] Linking CXX executable test
/usr/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x24): undefined reference to `main'
collect2: error: ld returned 1 exit status
I need help with linking the cu file with cpp