Linux cpp and cu file compilation and usage

Hi, I am trying to create a CMakeLists.txt for a project with a .cpp file and a .cu file.
My main file, main.cpp:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>
#include <iostream>

#include <cuda_runtime.h>
#include <vector_types.h>

#include "01_vector_addition.h"

// Check vector add result.
// Walks every index i in [0, a.size()), prints c[i] together with i, and
// prints j whenever c[i] differs from a[i] + b[i].
// NOTE(review): the braces of this function (and the closing braces of the
// loop) are missing in the snippet as posted, and no visible line ever changes
// j from 0 - presumably j was meant to count mismatches; confirm against the
// real file.
void verify_result(std::vector<int> &a, std::vector<int> &b,
                   std::vector<int> &c)
                    int j = 0;
  for (size_t i = 0; i < a.size(); i++) {
    // assert(c[i] == a[i] + b[i])
      // Debug trace: value of c[i] followed by its index.
      std::cout<<"c "<<c[i]<<" i"<<i<<"\n";

    // Mismatch between the computed sum and the expected sum.
    if(c[i] != a[i] + b[i])

      std::cout<<" "<<j<<"\n";


// Host entry point: sets up the problem size and the three host vectors,
// then reports completion.
// NOTE(review): the snippet as posted is truncated - the body of the
// initialization loop, the call into the .cu file (presumably
// start(N, bytes, a, b, c) - confirm) and the closing braces are not shown.
int main() {
  // Number of elements per vector (100 here, not 2^16)
  const int N = 100;
  // Size in bytes of one vector of N ints
  const size_t bytes = sizeof(int) * N;

  // Vectors for holding the host-side (CPU-side) data.
  // They are default-constructed, i.e. empty, until something fills them.
  std::vector<int> a;
  std::vector<int> b;
  std::vector<int> c;

  // Initialize random numbers in each array
  // NOTE(review): the statements that actually fill a, b and c are missing
  // from the posted loop body.
  for (int i = 0; i < N; i++) {
  std::cout<<"a size"<<a.size();
  std::cout << "COMPLETED SUCCESSFULLY\n";
  //  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
  return 0;

And my separate .cu file:

#include "01_vector_addition.h"

#include <cuda.h>
#include <cuda_runtime.h>

#include <Common/helper_cuda.h>
#include <Common/helper_functions.h>
// #include ""
// CUDA kernel for vector addition: c[tid] = a[tid] + b[tid] for tid < N.
// __global__ means this is called from the CPU, and runs on the GPU.
// Indexing uses only the x dimension of the block and thread indices.
// NOTE(review): the braces of the kernel body are missing in the snippet as
// posted.

__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,int *__restrict c, int N)
     // Debug output; executed by every launched thread.
     printf("Hello from GPU");
    // Calculate global thread ID
    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    // NOTE(review): the label says "c value" but the value printed is tid.
    printf("c value %d \n", tid);
    // Boundary check: threads with tid >= N must not touch the arrays
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
         printf("c value %d \n", c[tid]);

    // NOTE(review): this conditional has no matching #endif (and no guarded
    // code) in the visible snippet.
    # if __CUDA_ARCH__>=200
 // Host-side driver: allocates three device buffers of `bytes` bytes, copies
 // the inputs to the device, launches vectorAdd and copies the result back.
 // Parameters: N - element count passed to the kernel; bytes - size of each
 // device buffer; a, b - host input vectors; c - host output vector.
 // NOTE(review): the braces of this function are missing in the snippet as
 // posted, and several arguments were lost (see notes below).
 void start(const int N , const size_t bytes,std::vector<int> &a,std::vector<int> &b,std::vector<int> &c)
      // Allocate memory on the device
  int *d_a, *d_b, *d_c;
  checkCudaErrors(cudaMalloc(&d_a, bytes));
  // NOTE(review): unlike d_a, these two allocations are not wrapped in
  // checkCudaErrors, so a failure here would go unnoticed.
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  // Copy data from the host to the device (CPU -> GPU)
  // NOTE(review): the host source pointer (second argument) is missing in both
  // calls - presumably a.data() and b.data(); confirm against the real file.
  checkCudaErrors(cudaMemcpy(d_a,, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b,, bytes, cudaMemcpyHostToDevice));

  // Threads per CTA (1024)
  int NUM_THREADS = 1 << 10;

  // CTAs per Grid
  // We need to launch at LEAST as many threads as we have elements
  // This equation pads an extra CTA to the grid if N cannot evenly be divided
  // by NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024)
  // NOTE(review): the NUM_BLOCKS definition these comments describe is not
  // present in the visible snippet, yet NUM_BLOCKS is used in the launch below.
  // Launch the kernel on the GPU
  // Kernel calls are asynchronous (the CPU program continues execution after
  // call, but not necessarily before the kernel finishes)
  // NOTE(review): the launch is not followed by any error check.
  vectorAdd <<< NUM_BLOCKS, NUM_THREADS >>> (d_a, d_b, d_c, N);
  // Copy sum vector from device to host
  // cudaMemcpy is a synchronous operation, and waits for the prior kernel
  // launch to complete (both go to the default stream in this case).
  // Therefore, this cudaMemcpy acts as both a memcpy and synchronization
  // barrier.
    // c.clear();
  // NOTE(review): the host destination pointer (first argument) is missing -
  // presumably c.data(); the return value is also not checked.
  cudaMemcpy(, d_c, bytes, cudaMemcpyDeviceToHost);

  // Check result for errors
  // verify_result(a, b, c);

  // Free memory on device
  // NOTE(review): no cudaFree calls for d_a, d_b, d_c are visible after this
  // comment.


And my .h file for the .cu file:

// #include <cuda_runtime_api.h>
// #include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <vector>
#include <cublas_v2.h>
 // Host-callable driver defined in the .cu file; takes the element count, the
 // buffer size in bytes, the two input vectors and the output vector.
 void start(const int N ,const size_t bytes,std::vector<int> &a,std::vector<int> &b,std::vector<int> &c);
// Kernel declaration matching the definition in the .cu file.
// NOTE(review): this header is also included by main.cpp, so the __global__
// qualifier is seen by whichever compiler builds main.cpp - confirm that is
// intended, or keep the kernel declaration private to the .cu file.
__global__ void vectorAdd(const int *__restrict a, const int *__restrict b,int *__restrict c, int N);

How do I create my CMakeLists.txt? This is what I have so far:

# Build script for the main.cpp + .cu project, using the legacy FindCUDA module.
cmake_minimum_required(VERSION 2.8)
project( preProcess )
find_package( OpenCV REQUIRED )
find_package(CUDA REQUIRED)

# Generate device code for compute capability 6.1.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61)
# NOTE(review): this globs *.cu relative to the current source directory
# (the other sources are referenced under src/), and the resulting variable
# `cu` is not used by any command below.
file( GLOB  cu  *.cu)
# Target 1: a plain C++ executable that contains main.cpp only.
add_executable( preProcess src/main.cpp )
# Target 2: a separate CUDA executable named `test`.
# NOTE(review): no visible command sets a variable named `preProcess`, so
# ${preProcess} expands to nothing; `test` therefore gets only the src/ entry
# and no main.cpp, which matches an "undefined reference to `main'" link
# error. The .cu file name and the closing parenthesis are missing in the
# snippet as posted.
CUDA_ADD_EXECUTABLE(test ${preProcess} src/
# NOTE(review): /usr/local/cuda/lib64/ is a directory, not a library.
target_link_libraries(preProcess /usr/local/cuda/lib64/  ${OpenCV_LIBS} )

I used this, and now I am getting this error:

[ 25%] Building NVCC (Device) object CMakeFiles/test.dir/src/
Scanning dependencies of target test
[ 50%] Linking CXX executable test
/usr/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x24): undefined reference to `main'
collect2: error: ld returned 1 exit status

I need help linking the .cu file with the .cpp file.

CUDA 11.4 had a problem; I installed CUDA 11.2 and the problem was solved.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.