I'm new to CUDA programming ^-^. I wrote a template kernel function, and when I called it from my main.cpp I got an "undefined" (unresolved symbol) error. My code is quite simple and is as follows:
////////////////////////////////////////////////////////////////////////
//kernel.cu
// Element-wise addition on the device: result[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: any 1D or 2D grid of 1D or 2D blocks (the z dimension is not
// used for indexing). Each thread flattens its global (x, y) position in
// row-major order over the whole grid and then advances by the number of
// threads in the grid's x/y plane, so all n elements are processed even when
// the launch contains fewer than n threads.
//
// a, b, result: pointers dereferenced on the device, each holding at least n
//               elements of T.
// n:            number of elements; values <= 0 make the kernel a no-op.
// No shared memory is used.
template <typename T>
__global__ void addKernel(const T* a, const T* b, int n, T* result)
{
    // Global coordinates of this thread; widened to size_t so the products
    // cannot overflow 32-bit arithmetic on large grids.
    const size_t nIndexX = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
    const size_t nIndexY = threadIdx.y + static_cast<size_t>(blockIdx.y) * blockDim.y;

    // Number of threads in one row of the grid, and in the whole x/y plane.
    const size_t nRowWidth = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t nStride = nRowWidth * blockDim.y * gridDim.y;

    const size_t nCount = (n > 0) ? static_cast<size_t>(n) : 0;

    // The loop bound doubles as the tail guard for threads past the end.
    for (size_t nIndex = nIndexY * nRowWidth + nIndexX; nIndex < nCount; nIndex += nStride)
    {
        result[nIndex] = a[nIndex] + b[nIndex];
    }
}
// Adds two host arrays on the GPU: result[i] = a[i] + b[i] for i in [0, n).
//
// a, b:   host input arrays with at least n elements each.
// n:      number of elements to add.
// result: host output array with room for at least n elements.
//
// Returns immediately (without touching the GPU) when any pointer is NULL or
// n <= 0. If a CUDA call fails, the error string is written to std::cerr, the
// remaining GPU steps are skipped and `result` may be left unwritten; the
// function returns void, so the failure is not otherwise reported.
// Device buffers are always released, and the device is reset before
// returning, as in the original implementation.
template <typename T>
void addByGPU(const T* a, const T* b, int n, T* result)
{
    if (a == NULL || b == NULL || result == NULL || n <= 0)
        return;
    cout << "GPU execution starts." << endl;

    // size_t arithmetic: sizeof(T) * n can exceed INT_MAX for large n.
    const size_t nSize = sizeof(T) * static_cast<size_t>(n);

    // Initialized to NULL so the unconditional cudaFree calls below are safe
    // even when an allocation was never reached.
    T* nA_d = NULL;
    T* nB_d = NULL;
    T* nC_d = NULL;

    // Each step runs only if every previous step succeeded.
    cudaError_t status = cudaMalloc(&nA_d, nSize);
    if (status == cudaSuccess)
        status = cudaMalloc(&nB_d, nSize);
    if (status == cudaSuccess)
        status = cudaMalloc(&nC_d, nSize);
    if (status == cudaSuccess)
        status = cudaMemcpy(nA_d, a, nSize, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(nB_d, b, nSize, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
    {
        // 16x16 threads per block; the grid is sized from n (ceil-div) so it
        // holds at least one thread per element, folded into two dimensions
        // with at most kMaxGridX blocks per row.
        const unsigned int kBlockEdge = 16;
        const unsigned int kMaxGridX = 1024;
        const unsigned int nThreads = kBlockEdge * kBlockEdge;
        const unsigned int nBlocks = (static_cast<unsigned int>(n) + nThreads - 1) / nThreads;
        const unsigned int nGridX = (nBlocks < kMaxGridX) ? nBlocks : kMaxGridX;
        const unsigned int nGridY = (nBlocks + nGridX - 1) / nGridX;

        dim3 blocksPerGrid(nGridX, nGridY, 1);
        dim3 threadsPerBlock(kBlockEdge, kBlockEdge, 1);
        addKernel<T><<<blocksPerGrid, threadsPerBlock>>>(nA_d, nB_d, n, nC_d);

        // A launch does not return an error itself; bad configurations are
        // reported through cudaGetLastError().
        status = cudaGetLastError();
    }
    // Wait for the kernel so execution faults are reported here rather than
    // being attributed to the copy below.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status == cudaSuccess)
        status = cudaMemcpy(result, nC_d, nSize, cudaMemcpyDeviceToHost);

    if (status != cudaSuccess)
        std::cerr << "addByGPU failed: " << cudaGetErrorString(status) << std::endl;

    // Cleanup runs on success and failure alike; cudaFree(NULL) is a no-op.
    cudaFree(nA_d);
    cudaFree(nB_d);
    cudaFree(nC_d);

    // NOTE(review): cudaDeviceReset() destroys all device state of the
    // process, not just this function's buffers. Kept to preserve the
    // original behavior; consider moving it to program shutdown.
    cudaDeviceReset();
    return;
}

// Explicit instantiation: emits addByGPU<int> in this translation unit so that
// other source files which only see a declaration of the template can link
// against it. Add one such line for every element type used outside this file.
template void addByGPU<int>(const int* a, const int* b, int n, int* result);
//main.cpp
#include <stdlib.h>
#include <time.h>
#include <iostream>
using namespace std;
//template extern void addByGPU(const Ta,const Tb,int n,T* result);
//extern void addByGPU(const inta,const intb,int n,int* result);
int main(void)
{
// MathOp mathOp;
int nCount=4*4*256*256;
int* a=new int[nCount];
int* b=new int[nCount];
srand(time(0));
for(int i=0;i<nCount;i++)
{
a[i]=rand();
b[i]=rand();
}
int*c=new int[nCount];
addByGPU<int>(a,b,nCount,c);
//int* c=mathOp.add(a,b,nCount);
for(int i=0;i<nCount;i++)
{
if(a[i]+b[i]!=c[i])
{
cout<<"Error!"<<endl;
break;
}
}
cout<<"Completed..."<<endl;
delete [] a;
delete [] b;
delete [] c;
return 0;
}
////////////////////////////////////////////////////////////////
I tried to add an "extern ..." declaration line at the beginning of the main.cpp file, but it still doesn't work.
It turns out that the declaration and the definition of a template function in C++ must be in the same file (translation unit) so that the compiler can determine the exact type while compiling. I figured out that if I move the main function into the .cu file, everything works. But I would like to know whether any alternative exists that avoids doing that, because it makes the code structure so weird. Any suggestions would be appreciated.