Compile device operator from other class CUDA and C++

Recently, I asked about device function compilation and now cannot go through compilation of operator. Here is the code:
FILE MAKEFILE:

NVCC=nvcc
CUDAFLAGS=-arch=sm_60
RM=/bin/rm -f

all: main

main: main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o
	${NVCC} ${CUDAFLAGS} -dlink open_acc_map_cuda.o Vec4Simple.o -o link.o
	g++ main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o link.o -o main -L/usr/local/cuda/lib64 -lcuda -lcudart

main.o: main.cpp open_acc_map_header.cuh
	g++ -std=c++11 -c main.cpp

wrapperCaller.o: wrapperCaller.cpp open_acc_map_header.cuh
	g++ -std=c++11 -c wrapperCaller.cpp

open_acc_map_cuda.o: open_acc_map_cuda.cu Vec4Simple.cu open_acc_map_header.cuh
	${NVCC} ${CUDAFLAGS} -dc open_acc_map_cuda.cu Vec4Simple.cu

clean:
	${RM} *.o main

FILE main.cpp

#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>

int main()
{
  printf("STAGE 1\n");
  int a = 1000;
  printf("a = %d\n", a);
  wrapperCaller(a);
  return 0;
}

FILE wrapperCaller.cpp

#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>

void wrapperCaller(int b)
{
  printf("STAGE 2\n");
  printf("b = %d\n", b);
  wrapper(b);
}

FILE openacc_map_header.cuh

#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif

#define DIMS 1
#define BLOCKS 4
#define THREADS 32
#define CUDASIZE 10

extern void wrapperCaller(int a);
extern void wrapper(int b);

template <typename T>
class Vec4Simple
{
  T *_p;
  //T val[4];
  T val[4] __attribute__((aligned(32)));
  public:
    CUDA_HOSTDEV Vec4Simple();
    CUDA_HOSTDEV Vec4Simple(T *p);
    CUDA_HOSTDEV Vec4Simple(T x);
    CUDA_HOSTDEV Vec4Simple(T a,T b,T c,T d);
    CUDA_HOSTDEV Vec4Simple(Vec4Simple const &x);
    CUDA_HOSTDEV Vec4Simple<T> & load(T const * p);
    CUDA_HOSTDEV Vec4Simple<T> & load_a(T const * p);
    CUDA_HOSTDEV Vec4Simple<T> & insert(int i,T const &x);
    CUDA_HOSTDEV void store(T * p) const;
    CUDA_HOSTDEV void store_a(T * p) const;
    CUDA_HOSTDEV Vec4Simple<T> & operator = (T const & r);
    CUDA_HOSTDEV T operator [](int i) const;
    CUDA_HOSTDEV Vec4Simple<T> operator++ (int);
    static CUDA_HOSTDEV T getSquare(T b);
};

FILE openacc_map_cuda.cuh

#include "open_acc_map_header.cuh"
//#include "Vec4Simple.cu"
#include "device_launch_parameters.h"
#include "cuda.h"

#include <cuda_runtime.h>
#include <utility>
#include <type_traits>
#include <stdio.h>
#include <stdlib.h>

//__global__ void cuda_global(int *dev_a, int *dev_b, Vec4Simple<int> *dev_c)
__global__ void cuda_global(int *dev_a, int *dev_b, int *dev_c)
{
  int i = threadIdx.x;
  if (i < CUDASIZE)
  {
    dev_c[i] = dev_a[i] + dev_b[i];
    int s = Vec4Simple<int>::getSquare(25);
    printf("square of 25 = %d\n", s);
  }
}

void wrapper(int z)
{
  printf("STAGE 3\n");
  printf("z = %d\n", z);

  int a[CUDASIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int b[CUDASIZE] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
  int c[CUDASIZE];
  //Vec4Simple<int> c[CUDASIZE];

  int *dev_a;
  int *dev_b;
  int *dev_c;

  //Vec4Simple<int> *dev_c;
  cudaMalloc((void**)&dev_a, CUDASIZE*sizeof(int));
  cudaMalloc((void**)&dev_b, CUDASIZE*sizeof(int));
  cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(int));
  //cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(Vec4Simple<int>));

  cudaMemcpy(dev_a, a, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);

  cuda_global<<<BLOCKS, THREADS>>>(dev_a, dev_b, dev_c);

  cudaMemcpy(c, dev_c, CUDASIZE*sizeof(int), cudaMemcpyDeviceToHost);
  //cudaMemcpy(c, dev_c, CUDASIZE*sizeof(Vec4Simple<int>), cudaMemcpyDeviceToHost);
  //printf("after: c = %d\n", c[CUDASIZE-3]);
  printf("after: c = %d\n", 2021);

  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
}

FILE Vec4Simple.cu

#include "open_acc_map_header.cuh"

#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>

template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple() { }
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T *p) : _p(p) {}
template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::getSquare(T b)
{
  return b*b;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T x)
{
  for(unsigned int i=0;i<4;i++)
    val[i]=x;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T a,T b,T c,T d)
{
   val[0]=a;
   val[1]=b;
   val[2]=c;
   val[3]=d;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(Vec4Simple const &x)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=x.val[i];
}
// Member function to load from array (unaligned)
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & load(T const * p)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=p[i];
   return *this;
}
// Member function to load from array, aligned by 32
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & load_a(T const * p)
{
   return this->load(p);
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & insert(int i,T const &x)
{
   val[i]=x;
   return *this;
}
// Member function to store into array (unaligned)
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store(T * p) const
{
   for(unsigned int i=0;i<4;i++)
      p[i]=val[i];
}
// Member function to store into array, aligned by 32
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store_a(T * p) const
{
   this->store(p);
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::operator= (T const & r)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=r.val[i];
   return *this;
}
template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::operator [](int i) const
{
   return val[i];
}

CUDA_HOSTDEV Vec4Simple<T> Vec4Simple<T>::operator++ (int)
{
   Vec4Simple<T> temp (*this);
   for(unsigned int i=0;i<4;i++)
      val[i]++;
   return temp;
}

template class Vec4Simple<int>;

Thanks

My colleague helped me with this. Key points are in following files:
FILE open_acc_map_header.cuh

#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif

#define DIMS 1
#define BLOCKS 4
#define THREADS 32
#define CUDASIZE 10

extern void wrapperCaller(int a);
extern void wrapper(int b);

template <typename T>
class Vec4Simple
{
  T *_p;
  //T val[4];
  T val[4] __attribute__((aligned(32)));
  public:
CUDA_HOSTDEV Vec4Simple();
CUDA_HOSTDEV Vec4Simple(T *p);
CUDA_HOSTDEV Vec4Simple(T x);
CUDA_HOSTDEV Vec4Simple(T a,T b,T c,T d);
CUDA_HOSTDEV Vec4Simple(Vec4Simple const &x);
CUDA_HOSTDEV Vec4Simple<T> & load(T const * p);
CUDA_HOSTDEV Vec4Simple<T> & load_a(T const * p);
CUDA_HOSTDEV Vec4Simple<T> & insert(int i,T const &x);
CUDA_HOSTDEV void store(T * p) const;
CUDA_HOSTDEV void store_a(T * p) const;
CUDA_HOSTDEV Vec4Simple<T> & operator = (Vec4Simple<T> const & r);
CUDA_HOSTDEV T operator [](int i) const;
CUDA_HOSTDEV Vec4Simple<T> operator++ (int);
static CUDA_HOSTDEV T getSquare(T b);
};
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> truncate_to_int(Vec4Simple<T> const & a);

FILE Vec4Simple.cu

#include "open_acc_map_header.cuh"

#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>

template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple() { }
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T *p) : _p(p) {}
template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::getSquare(T b)
{ return b*b; }
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T x)
{
  for(unsigned int i=0;i<4;i++)
    val[i]=x;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T a,T b,T c,T d)
{
   val[0]=a;
   val[1]=b;
   val[2]=c;
   val[3]=d;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(Vec4Simple const &x)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=x.val[i];
}

// Member function to load from array (unaligned)
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::load(T const * p)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=p[i];
   return *this;
}

// Member function to load from array, aligned by 32
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::load_a(T const * p)
{
   return this->load(p);
}

template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::insert(int i,T const &x)
{
   val[i]=x;
   return *this;
}

// Member function to store into array (unaligned)
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store(T * p) const
{
   for(unsigned int i=0;i<4;i++)
      p[i]=val[i];
}

// Member function to store into array, aligned by 32
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store_a(T * p) const
{
   this->store(p);
}

template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::operator = (Vec4Simple<T> const & r)
{
   for(unsigned int i=0;i<4;i++)
      val[i]=r.val[i];
   return *this;
}

template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::operator [](int i) const
{
   return val[i];
}

template <typename T>
CUDA_HOSTDEV Vec4Simple<T> Vec4Simple<T>::operator++ (int)
{
   Vec4Simple<T> temp (*this);
   for(unsigned int i=0;i<4;i++)
      val[i]++;
   return temp;
}

template class Vec4Simple<int>;

template <class T>
static inline Vec4Simple<T> truncate_to_int(Vec4Simple<T> const & a)
{
  return Vec4Simple<int>(a.val[0], a.val[1], a.val[2], a.val[3]);
}