Linking CUDA and C++

I’m new to CUDA and have working CUDA code. I now want to read input data from a file and send it to the kernel, and I have a .cpp file that does this. I’ve created a header file that declares my CUDA function (the host function that launches the kernel) as extern "C" (I have also tried it without extern "C"), but when I call the function from my .cpp file I get the following linker error:

Undefined symbols:

“_pcfkernel”, referenced from:

  _main in cc04vKDX.o

ld: symbol(s) not found

I can’t figure out what I’m doing wrong, as this should be really simple, and the .cu and .cpp files each work on their own when I don’t try to link them together.

I’ve included the cpp and header file contents below. My function in the cu file is pcfkernel.

#include <cstdio>
#include <fstream>
#include <iostream>
#include <vector>

#include "cu2pcf.h"

using std::cout;

using std::cerr;

using std::endl;

using namespace std;

/**
 * Result of datain(): a non-owning view of the values that were read.
 */
struct dataArray
{
	long	length;	///< Number of floats actually stored in `array`.
	float*	array;	///< The caller's buffer (not owned by this struct).
};

/**
 * Reads whitespace-separated floats from a text file into a caller-owned buffer.
 *
 * @param inArray   Destination buffer. It must have room for every value that
 *                  will be stored (see `maxCount`).
 * @param filedata  Path of the text file to read.
 * @param maxCount  Maximum number of values to store. A negative value (the
 *                  default) means "no limit", which matches the original
 *                  two-argument behaviour; pass the buffer capacity to make
 *                  overrunning `inArray` impossible.
 * @return `array` is `inArray`; `length` is the number of values successfully
 *         parsed and stored. `length` is 0 if the file cannot be opened.
 */
dataArray datain(float* inArray, char* filedata, long maxCount = -1)
{
	dataArray retData;
	retData.length = 0;
	retData.array = inArray;

	// Input stream: opened for reading (the default mode of std::ifstream).
	std::ifstream myfile(filedata);
	if (!myfile.is_open())
	{
		std::cerr << "datain: cannot open " << filedata << std::endl;
		return retData;
	}

	// Test the extraction itself rather than eof(): eof() only becomes true
	// after a read has already failed, which counts one element too many when
	// the file ends in whitespace and never becomes true at all when the
	// stream is in a failed state (unopened file, non-numeric token).
	float value;
	while ((maxCount < 0 || retData.length < maxCount) && (myfile >> value))
	{
		inArray[retData.length] = value;
		++retData.length;
	}

	// The stream is closed by its destructor.
	return retData;
}

int main(int argc, char* argv[]) {

dataArray allData;

long	ii;

//ifstream myfile;

float* datatot=NULL;

char* datafile="boxout.txt";

int	datasize=1024*1024*16;

datatot= new float[datasize];

//  myfile.close();

allData= datain(datatot,datafile);

	printf("%f, %f, %f, %ld \n",allData.array[0],allData.array[1],allData.array[2],allData.length);

	delete []datatot;

	pcfkernel(1);

  return 0;

}
#include <stdio.h>

/*
 * Tuning constants for the correlation kernel.
 *
 * Every macro whose expansion is an expression is fully parenthesized so that
 * it keeps its value when used inside a larger expression (for example
 * `x % INPUTSIZE` or `1 / LOG_BIN_SIZE`).
 */

#define PI ((float) 3.14159)

#define BLOCKX 1

#define BLOCKY 1

/* Architecture dependent.
 * NOTE(review): CUDA's built-in warpSize is 32 on NVIDIA GPUs; 16 may be meant
 * as a half-warp - confirm against the kernel code. */
#define WARP_SIZE 16

#define TILE_LENGTH 256 /* Architecture dependent */

#define TILE_WIDTH 3 /* For 2-D correlation - x-y component plus magnitude and correlation distance */

#define INPUTSIZE (512*2)

#define GPU_MAX_THREADS 512

#define HIS_SIZE 64

/* Size in bits of histogram bins. Use HIS_SIZE, GPU_MAX_THREADS and the L1
 * cache size to calculate it. */
#define HIS_LOC_BIT 6

#define HIS_LOC_WRITE 64

/* Number of threads that share a histogram. Lowering this number reduces the
 * number of conflicts with atomicAdd but increases the shared memory needed.
 * Shared memory needed is GPU_MAX_THREADS*HIS_SIZE/HIS_SHARE_SIZE*sizeof(float). */
#define HIS_SHARE_SIZE 32

#define BIN_START ((float) (PI/1000000))

#define BIN_END PI

/* Expands to a call to log2(), so a declaration of log2 (e.g. from <math.h>)
 * must be visible wherever this macro is used. */
#define LOG_BIN_SIZE (((float) log2(BIN_END/BIN_START))/(HIS_SIZE))

/*
 * Host-callable entry point defined in the .cu file.
 * NOTE(review): the meaning of `k` is not visible from this header - confirm
 * against the definition.
 *
 * The declaration has C linkage, so the symbol is not name-mangled. The
 * definition in the .cu file must be declared with the same linkage (include
 * this header there, or mark the definition extern "C"), and the object file
 * produced from the .cu file must be passed to the linker along with the .cpp
 * object; otherwise the linker reports the symbol as undefined.
 */
#ifdef __cplusplus
extern "C" {
#endif

void pcfkernel(int k);

#ifdef __cplusplus
}
#endif