I'm new to CUDA and have working CUDA code. I now want to read data from a file and send it to the kernel, and I have a .cpp file that does this. I've created a header file that declares my CUDA function (the one that calls the kernel) as extern "C" (I have also tried it without), but when I call the function in my .cpp file I get the following linker error:
Undefined symbols:
“_pcfkernel”, referenced from:
_main in cc04vKDX.o
ld: symbol(s) not found
I can't figure out what I'm doing wrong, as this should be really simple, and the .cu and .cpp files both work when I don't try to link them together.
I’ve included the cpp and header file contents below. My function in the cu file is pcfkernel.
#include <iostream>
#include <fstream>
#include "cu2pcf.h"
using std::cout;
using std::cerr;
using std::endl;
using namespace std;
// A float buffer paired with an element count.
// datain() returns one of these to report how many values it stored in the
// buffer it was handed.
struct dataArray
{
    long   length;  // element count associated with `array`
    float *array;   // first element of the buffer; the struct has no
                    // destructor, so it never frees this memory itself
};
/**
 * Reads whitespace-separated floating-point values from a text file into a
 * caller-supplied buffer.
 *
 * @param inArray  Destination buffer. Its capacity is not passed in, so the
 *                 caller must make it large enough for every value in the
 *                 file; nothing here can detect an overrun.
 * @param filedata Path of the text file to read.
 * @return A dataArray whose `array` is `inArray` and whose `length` is the
 *         number of values that were successfully parsed and stored. The
 *         length is 0 when the file cannot be opened; reading stops at the
 *         first token that is not a number.
 */
dataArray datain(float* inArray, char* filedata)
{
    dataArray retData;
    retData.length = 0;
    retData.array = inArray;

    // Open for reading only. Passing ios::out (as this function once did)
    // also asks for write access, which fails on read-only files.
    std::ifstream myfile(filedata);
    if (!myfile.is_open())
    {
        std::cerr << "datain: could not open " << filedata << std::endl;
        return retData;
    }

    // Test the extraction itself rather than eof(): eof() only becomes true
    // after a read has already failed, which counts a phantom extra element
    // when the file ends in a newline, and it never becomes true at all when
    // the stream is in a failed state (unopened file, malformed token).
    // Reading into a temporary keeps a failed extraction from touching the
    // slot one past the last valid element.
    float value;
    while (myfile >> value)
    {
        inArray[retData.length] = value;
        ++retData.length;
    }

    // myfile is closed by its destructor.
    return retData;
}
/**
 * Loads the float values stored in "boxout.txt", prints the first three of
 * them together with the number of values read, then calls pcfkernel(1).
 *
 * The command-line arguments are not used.
 *
 * @return 0 in all cases.
 */
int main(int argc, char* argv[]) {
    // Upper bound on the number of floats the input file may contain;
    // datain() is not told this limit, so the file must not exceed it.
    const int datasize = 1024*1024*16;

    // A writable array rather than a pointer to a string literal, because
    // datain() takes a non-const char*.
    char datafile[] = "boxout.txt";

    float* datatot = new float[datasize];
    dataArray allData = datain(datatot, datafile);

    // The buffer from new[] is uninitialised, so only print elements that
    // were actually read from the file.
    if (allData.length >= 3) {
        printf("%f, %f, %f, %ld \n",allData.array[0],allData.array[1],allData.array[2],allData.length);
    } else {
        fprintf(stderr, "%s: expected at least 3 values, read %ld\n", datafile, allData.length);
    }

    delete [] datatot;

    pcfkernel(1);
    return 0;
}
// cu2pcf.h - constants and the host-callable entry point shared between the
// host .cpp file and the CUDA .cu file.
#ifndef CU2PCF_H
#define CU2PCF_H

#include <math.h>   // log2(), used by LOG_BIN_SIZE
#include <stdio.h>

// Every macro that expands to an expression is fully parenthesised so that it
// keeps its value when used inside a larger expression (e.g. x / INPUTSIZE).
#define PI ((float) 3.14159)
#define BLOCKX 1
#define BLOCKY 1
#define WARP_SIZE 16 // Architecture dependent
#define TILE_LENGTH 256 // Architecture dependent
#define TILE_WIDTH 3 // For 2-D correlation - x-y component plus magnitude and correlation distance
#define INPUTSIZE (512*2)
#define GPU_MAX_THREADS 512
#define HIS_SIZE 64
#define HIS_LOC_BIT 6 // Size in bits of histogram bins. Use HIS_SIZE, GPU_MAX_THREADS and L1 cache size to calculate
#define HIS_LOC_WRITE 64
// Number of threads that share a histogram. Lowering this number reduces the
// number of conflicts with atomicAdd but increases the shared memory needed.
// Shared memory needed is GPU_MAX_THREADS*HIS_SIZE/HIS_SHARE_SIZE*sizeof(float)
#define HIS_SHARE_SIZE 32
#define BIN_START ((float) (PI/1000000))
#define BIN_END PI
// log2(BIN_END/BIN_START) divided by the number of histogram bins.
#define LOG_BIN_SIZE ((float) log2(BIN_END/BIN_START)/(HIS_SIZE))

// Host-callable entry point, declared with C linkage (unmangled symbol name).
// NOTE(review): the definition presumably lives in the .cu file. It must have
// C linkage as well (e.g. by including this header there), and the object
// built from the .cu file must be passed to the linker along with the .cpp
// object; otherwise the symbol is reported as undefined at link time.
#ifdef __cplusplus
extern "C" {
#endif
void pcfkernel(int k);
#ifdef __cplusplus
}
#endif

#endif // CU2PCF_H