Compiling a device function from another class with CUDA and C++

I cannot get my program to compile. I am trying to call a device function that is declared in a class from a global (kernel) function, but I keep receiving different errors. I think the problem is in the makefile. I followed the nvcc documentation and the CUDA developer documentation. Here is the code.
makefile:

# Builds the mixed C++/CUDA demo using separate compilation of device code:
# each .cu file is compiled with -dc (relocatable device code), the device
# code is linked with -dlink, and g++ performs the final host link.
NVCC=nvcc
CUDAFLAGS=-arch=sm_60
RM=/bin/rm -f

# Objects that contain device code and must take part in the device link.
CUDA_OBJS=open_acc_map_cuda.o Vec4Simple.o

# These targets do not name files.
.PHONY: all clean

all: main

# g++ does the host link, so the device-link object and the CUDA libraries
# have to be passed explicitly.
main: main.o wrapperCaller.o ${CUDA_OBJS} link.o
	g++ main.o wrapperCaller.o ${CUDA_OBJS} link.o -o main -L/usr/local/cuda/lib64 -lcuda -lcudart

# Device-link step: resolves device symbols referenced across .cu files.
link.o: ${CUDA_OBJS}
	${NVCC} ${CUDAFLAGS} -dlink ${CUDA_OBJS} -o link.o

main.o: main.cpp open_acc_map_header.cuh
	g++ -std=c++11 -c main.cpp

wrapperCaller.o: wrapperCaller.cpp open_acc_map_header.cuh
	g++ -std=c++11 -c wrapperCaller.cpp

# One rule per .cu file, so every object listed as a prerequisite of `main`
# has a rule that produces it and is rebuilt only when its own source changes.
open_acc_map_cuda.o: open_acc_map_cuda.cu open_acc_map_header.cuh
	${NVCC} ${CUDAFLAGS} -dc open_acc_map_cuda.cu

Vec4Simple.o: Vec4Simple.cu open_acc_map_header.cuh
	${NVCC} ${CUDAFLAGS} -dc Vec4Simple.cu

clean:
	${RM} *.o main

Then header file open_acc_map_header.cuh

// Shared header included by both the host (.cpp) and CUDA (.cu) sources.

// CUDA_HOSTDEV expands to the CUDA execution-space qualifiers only when
// __CUDACC__ is defined; otherwise it expands to nothing, so the declarations
// below remain valid for a plain C++ compiler.
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif

// DIMS is not referenced by any of the code shown in this post.
#define DIMS 1
// Grid and block sizes used for the cuda_global launch in wrapper().
#define BLOCKS 4
#define THREADS 32
// Number of int elements in the host and device arrays.
#define CUDASIZE 10

// Defined in wrapperCaller.cpp.
extern void wrapperCaller(int a);
// Defined in open_acc_map_cuda.cu.
extern void wrapper(int b);
// Declares getSquare as a free function. NOTE(review): the only definition
// shown is a member function of the class template Vec4Simple in
// Vec4Simple.cu, which this prototype does not match.
extern CUDA_HOSTDEV int getSquare(int c);

File open_acc_map_cuda.cu

 #include "open_acc_map_header.cuh"
//#include "Vec4Simple.cu"
#include "device_launch_parameters.h"
#include "cuda.h"

#include <cuda_runtime.h>
#include <utility>
#include <type_traits>
#include <stdio.h>
#include <stdlib.h>

// Kernel: every thread whose threadIdx.x is below CUDASIZE stores
// dev_a[i] + dev_b[i] into dev_c[i] and prints the result of getSquare(25).
// The index is built from threadIdx.x only, so each block of the launch
// handles the same elements.
//__global__ void cuda_global(int *dev_a, int *dev_b, Vec4Simple<int> *dev_c)
__global__ void cuda_global(int *dev_a, int *dev_b, int *dev_c)
{
  int i = threadIdx.x;
  // Guard: threads at or beyond CUDASIZE do nothing.
  if (i < CUDASIZE)
  {
    dev_c[i] = dev_a[i] + dev_b[i];
    // Calls the free function declared in open_acc_map_header.cuh.
    // NOTE(review): no free-function definition of getSquare is shown; the
    // one in Vec4Simple.cu is a class member.
    int s = getSquare(25);
    printf("square of 25 = %d\n", s);
  }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: value returned by the runtime call
//   what:   short description of the call, used in the message
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host-side driver: copies two CUDASIZE-element int arrays to the device,
// launches cuda_global on them with BLOCKS x THREADS threads, and copies the
// element-wise sums back into a host array.
//   z: value handed down from the caller; it is only printed.
void wrapper(int z)
{
  printf("STAGE 3\n");
  printf("z = %d\n", z);

  const size_t bytes = CUDASIZE * sizeof(int);

  int a[CUDASIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int b[CUDASIZE] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
  int c[CUDASIZE];
  //Vec4Simple<int> c[CUDASIZE];

  int *dev_a = nullptr;
  int *dev_b = nullptr;
  int *dev_c = nullptr;
  //Vec4Simple<int> *dev_c;

  checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");
  //cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(Vec4Simple<int>));

  checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)");
  checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

  cuda_global<<<BLOCKS, THREADS>>>(dev_a, dev_b, dev_c);
  // A kernel launch returns no status; launch errors are reported by
  // cudaGetLastError().
  checkCuda(cudaGetLastError(), "cuda_global launch");

  // Blocking copy: it waits for the kernel, so execution errors surface here.
  checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  //cudaMemcpy(c, dev_c, CUDASIZE*sizeof(Vec4Simple<int>), cudaMemcpyDeviceToHost);
  // The computed result in c is currently not printed; a fixed marker is.
  //printf("after: c = %d\n", c[CUDASIZE-3]);
  printf("after: c = %d\n", 2021);

  checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
  checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
  checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
}

File Vec4Simple.cu

#include "open_acc_map_header.cuh"

#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>

// Small wrapper around a raw T pointer. All members carry CUDA_HOSTDEV, so
// they are host+device functions when the header macro expands to the CUDA
// qualifiers.
template <typename T>
class Vec4Simple
{
  public:
    // Default construction leaves the wrapped pointer null.
    CUDA_HOSTDEV Vec4Simple() : _p(nullptr) {}

    // Stores p as the wrapped pointer.
    CUDA_HOSTDEV Vec4Simple(T *p) : _p(p) {}

    // Returns b multiplied by itself; does not read any member state.
    CUDA_HOSTDEV int getSquare(int b)
    {
      const int squared = b * b;
      return squared;
    }

  private:
    T *_p;
};

File wrapperCaller.cpp

#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>

// Intermediate host function: reports that stage 2 was reached, echoes its
// argument and forwards it unchanged to wrapper().
void wrapperCaller(int b)
{
  fputs("STAGE 2\n", stdout);
  fprintf(stdout, "b = %d\n", b);
  wrapper(b);
}

File main.cpp

#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>

// Program entry point: reports stage 1, prints the start value and passes it
// on to wrapperCaller().
int main()
{
  const int a = 1000;

  fputs("STAGE 1\n", stdout);
  fprintf(stdout, "a = %d\n", a);
  wrapperCaller(a);

  return 0;
}

Please use the available formatting tools to format the code and makefile in your post. A simple approach is to edit your post, select the code you want to format, and then press the </> button at the top of the edit window. Do this for each code listing and for the makefile, then save your edits. Thanks.

  1. Your getSquare member function is defined as a member of the class Vec4Simple. This is not the correct prototype:

    extern CUDA_HOSTDEV int getSquare(int c);
    

    That would be the correct prototype if it were a bare function. But since it is defined as a class member function, the prototype must reference the class somehow, e.g. something like this:

     CUDA_HOSTDEV int Vec4Simple<T>::getSquare(int c);
    

    This is a C++ concept, not unique or specific to CUDA. However, declaring the prototype this way immediately requires a declaration of the class Vec4Simple itself, which is entirely missing from your header files.

  2. The next problem we will run into if we fix that, is that the class Vec4Simple is a templated class, meaning it is not actually instantiated by the compiler until it is used. However it is not used in the compilation unit it is in, which means the compiler will have no actual definition/implementation for use in any other compilation unit. A typical method to resolve this is to force template instantiation for the desired type(s). Again, this is a C++ concept, not unique or specific to CUDA in any way.

The net of this is that we must provide the declaration of the Vec4Simple class in the header file, and refactor the implementation accordingly. The following were the minimum changes I could make to get your code to compile:

file open_acc_map_header.cuh:

// Shared header included by both the host (.cpp) and CUDA (.cu) sources.
// The include guard is required because the header defines a class template,
// which must not be seen twice in one translation unit.
#ifndef OPEN_ACC_MAP_HEADER_CUH
#define OPEN_ACC_MAP_HEADER_CUH

// CUDA_HOSTDEV expands to the CUDA execution-space qualifiers only when
// __CUDACC__ is defined; otherwise it expands to nothing, so the declarations
// below remain valid for a plain C++ compiler.
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif

#define DIMS 1
// Grid and block sizes used for the cuda_global launch in wrapper().
#define BLOCKS 4
#define THREADS 32
// Number of int elements in the host and device arrays.
#define CUDASIZE 10

// Defined in wrapperCaller.cpp.
extern void wrapperCaller(int a);
// Defined in open_acc_map_cuda.cu.
extern void wrapper(int b);

// Wrapper around a raw T pointer. Only declarations live here; the member
// definitions and the explicit instantiation are in Vec4Simple.cu.
template <typename T>
class Vec4Simple
{
  // Null unless a pointer is supplied, so a default-constructed object never
  // holds an indeterminate pointer.
  T *_p = nullptr;
  public:
    CUDA_HOSTDEV Vec4Simple();
    CUDA_HOSTDEV Vec4Simple(T *p);
    // Returns b * b. Static, so it can be called without an object, e.g.
    // Vec4Simple<int>::getSquare(25).
    static CUDA_HOSTDEV T getSquare(T b);
};

#endif // OPEN_ACC_MAP_HEADER_CUH

file open_acc_map_cuda.cu:

#include "open_acc_map_header.cuh"
//#include "Vec4Simple.cu"
#include "device_launch_parameters.h"
#include "cuda.h"

#include <cuda_runtime.h>
#include <utility>
#include <type_traits>
#include <stdio.h>
#include <stdlib.h>

// Kernel: element-wise sum of two CUDASIZE-element int arrays, plus a demo
// call of the static device function Vec4Simple<int>::getSquare.
// Launch layout: 1-D grid of 1-D blocks. Any configuration that supplies at
// least CUDASIZE threads in total covers every element exactly once.
//   dev_a, dev_b: device input arrays of CUDASIZE ints
//   dev_c:        device output array of CUDASIZE ints
//__global__ void cuda_global(int *dev_a, int *dev_b, Vec4Simple<int> *dev_c)
__global__ void cuda_global(int *dev_a, int *dev_b, int *dev_c)
{
  // Global thread index. threadIdx.x alone repeats in every block, which made
  // each block of a multi-block launch redo (and re-print) the same elements.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard: the launch normally supplies more threads than elements.
  if (i < CUDASIZE)
  {
    dev_c[i] = dev_a[i] + dev_b[i];
    int s = Vec4Simple<int>::getSquare(25);
    printf("square of 25 = %d\n", s);
  }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: value returned by the runtime call
//   what:   short description of the call, used in the message
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host-side driver: copies two CUDASIZE-element int arrays to the device,
// launches cuda_global on them with BLOCKS x THREADS threads, and copies the
// element-wise sums back into a host array.
//   z: value handed down from the caller; it is only printed.
void wrapper(int z)
{
  printf("STAGE 3\n");
  printf("z = %d\n", z);

  const size_t bytes = CUDASIZE * sizeof(int);

  int a[CUDASIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int b[CUDASIZE] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
  int c[CUDASIZE];
  //Vec4Simple<int> c[CUDASIZE];

  int *dev_a = nullptr;
  int *dev_b = nullptr;
  int *dev_c = nullptr;
  //Vec4Simple<int> *dev_c;

  checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");
  //cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(Vec4Simple<int>));

  checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)");
  checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

  cuda_global<<<BLOCKS, THREADS>>>(dev_a, dev_b, dev_c);
  // A kernel launch returns no status; launch errors are reported by
  // cudaGetLastError().
  checkCuda(cudaGetLastError(), "cuda_global launch");

  // Blocking copy: it waits for the kernel, so execution errors surface here.
  checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  //cudaMemcpy(c, dev_c, CUDASIZE*sizeof(Vec4Simple<int>), cudaMemcpyDeviceToHost);
  // The computed result in c is currently not printed; a fixed marker is.
  //printf("after: c = %d\n", c[CUDASIZE-3]);
  printf("after: c = %d\n", 2021);

  checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
  checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
  checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
}

file Vec4Simple.cu:

#include "open_acc_map_header.cuh"

#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
    // Out-of-class definitions of the Vec4Simple members declared in
    // open_acc_map_header.cuh.

    // Default constructor: sets the wrapped pointer to null explicitly so it
    // is never left indeterminate.
    template <typename T>
    CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple() : _p(nullptr) { }

    // Stores p as the wrapped pointer.
    template <typename T>
    CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T *p) : _p(p) {}

    // Returns b multiplied by itself.
    template <typename T>
    CUDA_HOSTDEV T Vec4Simple<T>::getSquare(T b)
    { return b*b; }

// Explicit instantiation: the template is not otherwise used in this
// translation unit, so without it no Vec4Simple<int> code would be emitted
// for other translation units to link against.
template class Vec4Simple<int>;

Thanks for the advice.

Thank you very much Robert!