I could not compile a program. I am trying to achieve a device function declared in a class to be called from a global function. I keep receiving different errors. I think the problem is in makefile. I followed nvcc documentation and cuda developer documentation. Here is the code.
makefile:
NVCC=nvcc
CUDAFLAGS=-arch=sm_60
RM=/bin/rm -f
all: main
main: main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o
${NVCC} ${CUDAFLAGS} -dlink open_acc_map_cuda.o Vec4Simple.o -o link.o
g++ main.o wrapperCaller.o open_acc_map_cuda.o Vec4Simple.o link.o -o main -L/usr/local/cuda/lib64 -lcuda -lcudart
main.o: main.cpp open_acc_map_header.cuh
g++ -std=c++11 -c main.cpp
wrapperCaller.o: wrapperCaller.cpp open_acc_map_header.cuh
g++ -std=c++11 -c wrapperCaller.cpp
open_acc_map_cuda.o: open_acc_map_cuda.cu Vec4Simple.cu open_acc_map_header.cuh
${NVCC} ${CUDAFLAGS} -dc open_acc_map_cuda.cu Vec4Simple.cu
#Vec4Simple.o: Vec4Simple.cu open_acc_map_header.cuh
# ${NVCC} ${CUDAFLAGS} -dc Vec4Simple.cu
clean:
${RM} *.o main
Then header file open_acc_map_header.cuh
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#else
#define CUDA_HOSTDEV
#endif
#define DIMS 1
#define BLOCKS 4
#define THREADS 32
#define CUDASIZE 10
extern void wrapperCaller(int a);
extern void wrapper(int b);
extern CUDA_HOSTDEV int getSquare(int c);
File open_acc_map_cuda.cu
#include "open_acc_map_header.cuh"
//#include "Vec4Simple.cu"
#include "device_launch_parameters.h"
#include "cuda.h"
#include <cuda_runtime.h>
#include <utility>
#include <type_traits>
#include <stdio.h>
#include <stdlib.h>
//__global__ void cuda_global(int *dev_a, int *dev_b, Vec4Simple<int> *dev_c)
__global__ void cuda_global(int *dev_a, int *dev_b, int *dev_c)
{
int i = threadIdx.x;
if (i < CUDASIZE)
{
dev_c[i] = dev_a[i] + dev_b[i];
int s = getSquare(25);
printf("square of 25 = %d\n", s);
}
}
void wrapper(int z)
{
printf("STAGE 3\n");
printf("z = %d\n", z);
int a[CUDASIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int b[CUDASIZE] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
int c[CUDASIZE];
//Vec4Simple<int> c[CUDASIZE];
int *dev_a;
int *dev_b;
int *dev_c;
//Vec4Simple<int> *dev_c;
cudaMalloc((void**)&dev_a, CUDASIZE*sizeof(int));
cudaMalloc((void**)&dev_b, CUDASIZE*sizeof(int));
cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(int));
//cudaMalloc((void**)&dev_c, CUDASIZE*sizeof(Vec4Simple<int>));
cudaMemcpy(dev_a, a, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, CUDASIZE*sizeof(int), cudaMemcpyHostToDevice);
cuda_global<<<BLOCKS, THREADS>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, CUDASIZE*sizeof(int), cudaMemcpyDeviceToHost);
//cudaMemcpy(c, dev_c, CUDASIZE*sizeof(Vec4Simple<int>), cudaMemcpyDeviceToHost);
//printf("after: c = %d\n", c[CUDASIZE-3]);
printf("after: c = %d\n", 2021);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
File Vec4Simple.cu
#include "open_acc_map_header.cuh"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
template <typename T>
class Vec4Simple
{
T *_p = nullptr;
public:
CUDA_HOSTDEV Vec4Simple() { }
CUDA_HOSTDEV Vec4Simple(T *p) : _p(p) {}
CUDA_HOSTDEV int getSquare(int b)
{ return b*b; }
};
File wrapperCaller.cpp
#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>
void wrapperCaller(int b)
{
printf("STAGE 2\n");
printf("b = %d\n", b);
wrapper(b);
}
File main.cpp
#include "open_acc_map_header.cuh"
#include <stdio.h>
#include <stdlib.h>
int main()
{
printf("STAGE 1\n");
int a = 1000;
printf("a = %d\n", a);
wrapperCaller(a);
return 0;
}