Recently, I asked about device function compilation and now cannot go through compilation of operator. Here is the code:
FILE MAKEFILE:
NVCC=nvcc
CUDAFLAGS=-arch=sm_60
RM=/bin/rm -f
all: main
main: main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o
${NVCC} ${CUDAFLAGS} -dlink open_acc_map_cuda.o Vec4Simple.o -o link.o
g++ main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o link.o -o main -L/usr/local/cuda/lib64 -lcuda -lcudart
main.o: main.cpp open_acc_map_header.cuh
g++ -std=c++11 -c main.cpp
wrapperCaller.o: wrapperCaller.cpp open_acc_map_header.cuh
g++ -std=c++11 -c wrapperCaller.cpp
open_acc_map_cuda.o: open_acc_map_cuda.cu Vec4Simple.cu open_acc_map_header.cuh
${NVCC} ${CUDAFLAGS} -dc open_acc_map_cuda.cu Vec4Simple.cu
clean:
${RM} *.o main
FILE main.cpp
#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>
int main()
{
printf("STAGE 1\n");
int a = 1000;
printf("a = %d\n", a);
wrapperCaller(a);
return 0;
}
FILE wrapperCaller.cpp
#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>
void wrapperCaller(int b)
{
printf("STAGE 2\n");
printf("b = %d\n", b);
wrapper(b);
}
FILE openacc_map_header.cuh
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif
#define DIMS 1
#define BLOCKS 4
#define THREADS 32
#define CUDASIZE 10
extern void wrapperCaller(int a);
extern void wrapper(int b);
template <typename T>
class Vec4Simple
{
T *_p;
//T val[4];
T val[4] __attribute__((aligned(32)));
public:
CUDA_HOSTDEV Vec4Simple();
CUDA_HOSTDEV Vec4Simple(T *p);
CUDA_HOSTDEV Vec4Simple(T x);
CUDA_HOSTDEV Vec4Simple(T a,T b,T c,T d);
CUDA_HOSTDEV Vec4Simple(Vec4Simple const &x);
CUDA_HOSTDEV Vec4Simple<T> & load(T const * p);
CUDA_HOSTDEV Vec4Simple<T> & load_a(T const * p);
CUDA_HOSTDEV Vec4Simple<T> & insert(int i,T const &x);
CUDA_HOSTDEV void store(T * p) const;
CUDA_HOSTDEV void store_a(T * p) const;
CUDA_HOSTDEV Vec4Simple<T> & operator = (T const & r);
CUDA_HOSTDEV T operator [](int i) const;
CUDA_HOSTDEV Vec4Simple<T> operator++ (int);
static CUDA_HOSTDEV T getSquare(T b);
};
FILE openacc_map_cuda.cuh
#include "open_acc_map_header.cuh"
//#include "Vec4Simple.cu"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <cuda_runtime.h>
#include <utility>
#include <type_traits>
#include <stdio.h>
#include <stdlib.h>
//__global__ void cuda_global(int *dev_a, int *dev_b, Vec4Simple<int> *dev_c)
__global__ void cuda_global(int *dev_a, int *dev_b, int *dev_c)
{
int i = threadIdx.x;
if (i < CUDASIZE)
{
dev_c[i] = dev_a[i] + dev_b[i];
int s = Vec4Simple<int>::getSquare(25);
printf("square of 25 = %d\n", s);
}
}
void wrapper(int z)
{
printf("STAGE 3\n");
printf("z = %d\n", z);
int a[CUDASIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int b[CUDASIZE] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
int c[CUDASIZE];
//Vec4Simple<int> c[CUDASIZE];
int *dev_a;
int *dev_b;
int *dev_c;
//Vec4Simple<int> *dev_c;
cudaMalloc((void**)&dev_a, CUDASIZE*sizeof(int));
cudaMalloc((void**)&dev_b, CUDASIZE*sizeof(int));
cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(int));
//cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(Vec4Simple<int>));
cudaMemcpy(dev_a, a, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);
cuda_global<<<BLOCKS, THREADS>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, CUDASIZE*sizeof(int), cudaMemcpyDeviceToHost);
//cudaMemcpy(c, dev_c, CUDASIZE*sizeof(Vec4Simple<int>), cudaMemcpyDeviceToHost);
//printf("after: c = %d\n", c[CUDASIZE-3]);
printf("after: c = %d\n", 2021);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
FILE Vec4Simple.cu
#include "open_acc_map_header.cuh"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple() { }
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T *p) : _p(p) {}
template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::getSquare(T b)
{
return b*b;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T x)
{
for(unsigned int i=0;i<4;i++)
val[i]=x;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(T a,T b,T c,T d)
{
val[0]=a;
val[1]=b;
val[2]=c;
val[3]=d;
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T>::Vec4Simple(Vec4Simple const &x)
{
for(unsigned int i=0;i<4;i++)
val[i]=x.val[i];
}
// Member function to load from array (unaligned)
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & load(T const * p)
{
for(unsigned int i=0;i<4;i++)
val[i]=p[i];
return *this;
}
// Member function to load from array, aligned by 32
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & load_a(T const * p)
{
return this->load(p);
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & insert(int i,T const &x)
{
val[i]=x;
return *this;
}
// Member function to store into array (unaligned)
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store(T * p) const
{
for(unsigned int i=0;i<4;i++)
p[i]=val[i];
}
// Member function to store into array, aligned by 32
template <typename T>
CUDA_HOSTDEV void Vec4Simple<T>::store_a(T * p) const
{
this->store(p);
}
template <typename T>
CUDA_HOSTDEV Vec4Simple<T> & Vec4Simple<T>::operator= (T const & r)
{
for(unsigned int i=0;i<4;i++)
val[i]=r.val[i];
return *this;
}
template <typename T>
CUDA_HOSTDEV T Vec4Simple<T>::operator [](int i) const
{
return val[i];
}
CUDA_HOSTDEV Vec4Simple<T> Vec4Simple<T>::operator++ (int)
{
Vec4Simple<T> temp (*this);
for(unsigned int i=0;i<4;i++)
val[i]++;
return temp;
}
template class Vec4Simple<int>;
Thanks