Linux cpp and cu file compilation and usage

Hi, I am trying to create a CMakeLists.txt for a .cpp file and a .cu file.
My main file, main.cpp:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <vector_types.h>


#include "01_vector_addition.h"


// Check the vector-add result on the host and count mismatches
void verify_result(std::vector<int> &a, std::vector<int> &b,
                   std::vector<int> &c) {
  int mismatches = 0;
  for (size_t i = 0; i < a.size(); i++) {
    // assert(c[i] == a[i] + b[i]);
    std::cout << "c " << c[i] << " i " << i << "\n";
    if (c[i] != a[i] + b[i]) {
      mismatches++;
    }
  }
  std::cout << "mismatches: " << mismatches << "\n";
}

int main() {
  // Number of elements in each vector
  const int N = 100;
  const size_t bytes = sizeof(int) * N;

  // Vectors for holding the host-side (CPU-side) data
  std::vector<int> a;
  a.reserve(N);
  std::vector<int> b;
  b.reserve(N);
  // c receives the result via cudaMemcpy into c.data(), so it needs a real
  // size (reserve alone leaves size() == 0)
  std::vector<int> c(N);

  // Initialize random numbers in each array
  for (int i = 0; i < N; i++) {
    a.push_back(i);
    b.push_back(i);
  }
  std::cout<<"a size"<<a.size();
  
  start(N,bytes,a,b,c);
  verify_result(a,b,c);
  std::cout << "COMPLETED SUCCESSFULLY\n";
 cudaDeviceReset();
  //  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
  return 0;
}

And here is my separate .cu file:

#include "01_vector_addition.h"

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <Common/helper_cuda.h>
#include <Common/helper_functions.h>
// CUDA kernel for vector addition
// __global__ means this is called from the CPU, and runs on the GPU

__global__ void vectorAdd(const int *__restrict__ a, const int *__restrict__ b,
                          int *__restrict__ c, int N) {
  printf("Hello from GPU\n");
  // Calculate global thread ID
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  printf("tid %d\n", tid);
  // Boundary check
  if (tid < N) {
    c[tid] = a[tid] + b[tid];
    printf("c value %d\n", c[tid]);
  }
}
void start(const int N, const size_t bytes, std::vector<int> &a,
           std::vector<int> &b, std::vector<int> &c) {
  std::cout << "\n\n\n\nhello\n";

  // Allocate memory on the device
  int *d_a, *d_b, *d_c;
  checkCudaErrors(cudaMalloc(&d_a, bytes));
  checkCudaErrors(cudaMalloc(&d_b, bytes));
  checkCudaErrors(cudaMalloc(&d_c, bytes));

  // Copy data from the host to the device (CPU -> GPU)
  checkCudaErrors(cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice));

  // Threads per CTA (1024)
  int NUM_THREADS = 1 << 10;

  // CTAs per Grid
  // We need to launch at LEAST as many threads as we have elements
  // This equation pads an extra CTA to the grid if N cannot evenly be divided
  // by NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024)
  int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS;
    
  // Launch the kernel on the GPU
  // Kernel calls are asynchronous (the CPU program continues execution after
  // the call, but not necessarily after the kernel finishes)
  vectorAdd<<<NUM_BLOCKS, NUM_THREADS>>>(d_a, d_b, d_c, N);
  cudaDeviceSynchronize();
  // Copy sum vector from device to host
  // cudaMemcpy is a synchronous operation, and waits for the prior kernel
  // launch to complete (both go to the default stream in this case).
  // Therefore, this cudaMemcpy acts as both a memcpy and synchronization
  // barrier.
  checkCudaErrors(cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost));

  // Check result for errors
  // verify_result(a, b, c);

  // Free memory on device
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

}


And my .h file for the .cu file:

#pragma once

#include <iostream>
#include <stdio.h>
#include <vector>
#include <cublas_v2.h>

void start(const int N, const size_t bytes, std::vector<int> &a,
           std::vector<int> &b, std::vector<int> &c);

#ifdef __CUDACC__
// Only nvcc defines __CUDACC__, so the host compiler never sees __global__
__global__ void vectorAdd(const int *__restrict__ a, const int *__restrict__ b,
                          int *__restrict__ c, int N);
#endif

How should I create my CMakeLists.txt? Here is what I have:

cmake_minimum_required(VERSION 2.8)
project( preProcess )
find_package( OpenCV REQUIRED )
find_package(CUDA REQUIRED)
include(FindCUDA)

include_directories(/usr/local/cuda/include)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61)
include_directories(${YAML_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}  ${CUDA_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS}/Common ${CUDA_CUBLAS_DIRS} src)
file( GLOB  cu  *.cu)
add_executable( preProcess src/main.cpp )
CUDA_ADD_EXECUTABLE(test ${preProcess} src/01_vector_addition.cu)
target_link_libraries(preProcess /usr/local/cuda/lib64/libcudart.so  ${OpenCV_LIBS} )

I used this, and now I am getting this error:

[ 25%] Building NVCC (Device) object CMakeFiles/test.dir/src/test_generated_01_vector_addition.cu.o
Scanning dependencies of target test
[ 50%] Linking CXX executable test
/usr/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x24): undefined reference to `main'
collect2: error: ld returned 1 exit status

I need help with linking the .cu file with the .cpp file.
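The "undefined reference to `main'" comes from how the targets are set up: ${preProcess} is never set as a variable (preProcess is only the project/target name), so CUDA_ADD_EXECUTABLE(test ...) receives nothing but the .cu file, and the resulting test executable has no main(). The separate preProcess target, in turn, never sees the .cu file, and the file(GLOB cu *.cu) result is never used. Both sources need to end up in the same target. A minimal sketch of a corrected CMakeLists.txt, assuming both files live in src/ and keeping the FindCUDA style from above (the cuda-samples path for helper_cuda.h is a placeholder to adjust):

cmake_minimum_required(VERSION 2.8)
project( preProcess )

find_package( OpenCV REQUIRED )
find_package( CUDA REQUIRED )

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61)

# Headers: project sources, the CUDA toolkit, and the directory that
# contains the samples' Common/ folder (placeholder path, adjust it)
include_directories(${PROJECT_SOURCE_DIR} ${CUDA_INCLUDE_DIRS} /path/to/cuda-samples src)

# One target that owns BOTH the .cpp and the .cu file, so main() and the
# kernel are compiled and linked into the same executable
CUDA_ADD_EXECUTABLE( preProcess src/main.cpp src/01_vector_addition.cu )

# CUDA_ADD_EXECUTABLE already links cudart; add the rest explicitly
# (cuBLAS is only needed once cuBLAS calls are actually made)
target_link_libraries( preProcess ${OpenCV_LIBS} ${CUDA_CUBLAS_LIBRARIES} )

With CMake 3.8 or newer, an alternative is to skip FindCUDA, declare CUDA as a project language (project(preProcess LANGUAGES CXX CUDA)), and list both source files in a plain add_executable.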

CUDA 11.4 had a problem; I installed 11.2 and the problem was solved.
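If several toolkit versions are installed side by side, FindCUDA can be pinned to a specific one; a small sketch, assuming the default /usr/local/cuda-<version> install path:

# Pin FindCUDA to a specific toolkit before find_package(CUDA);
# the path assumes the default /usr/local/cuda-11.2 install location
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-11.2)
find_package(CUDA REQUIRED)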
