I have a `__device__` function that is defined in a header file. It lives in a header because it is called by a `__global__` template kernel, and that kernel must itself be defined in a header because it is a template.
When this header file is included in two or more .cu files, I get an LNK2005 (symbol already defined) error for getCurThreadIdx during linking.
I checked the generated .ptx file and found that fooKernel does not even call getCurThreadIdx; it has been inlined! If that is the case, why is there a multiple-definition error?
I can make the error go away by adding the `__forceinline__` qualifier to the `__device__` function. But my question about the normal inlining of the `__device__` function still remains.
Another reason I am concerned is that this would mean every `__device__` function used by a template kernel must always be `__forceinline__`! That severely limits the amount of functionality that can be implemented inside a template kernel, especially on the Fermi architecture!
Here is sample code to produce the above error on CUDA 3.2 and Visual Studio 2008:
FooDevice.h:
#ifndef FOO_DEVICE_H
#define FOO_DEVICE_H
// Returns blockIdx.x * blockDim.x + threadIdx.x for the calling thread,
// converted to int.
//
// NOTE(review): this is deliberately left as a non-inline definition in the
// header, because that is what reproduces the LNK2005 error when the header is
// included by more than one .cu file. Qualifying it with __forceinline__ makes
// the error go away.
__device__ int getCurThreadIdx()
{
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    return blockStart + threadIdx.x;
}
// Copies the first `num` elements of inArr into outArr.
//
// Each thread starts at the index returned by getCurThreadIdx() and advances
// by gridDim.x * blockDim.x, so any 1D launch configuration covers all `num`
// elements.
template< typename T >
__global__ void fooKernel( const T* inArr, int num, T* outArr )
{
    const int stride = ( gridDim.x * blockDim.x );

    int pos = getCurThreadIdx();
    while ( pos < num )
    {
        outArr[ pos ] = inArr[ pos ];
        pos += stride;
    }
}
// Non-template kernel that also uses getCurThreadIdx(); defined in FooDevice.cu.
__global__ void fooKernel2( const int* inArr, int num, int* outArr );
#endif // FOO_DEVICE_H
FooDevice.cu:
#include "FooDevice.h"
// A second, non-template kernel that uses getCurThreadIdx().
// Copies the first `num` ints of inArr into outArr; each thread starts at its
// own index and advances by gridDim.x * blockDim.x.
__global__ void fooKernel2( const int* inArr, int num, int* outArr )
{
    const int stride = ( gridDim.x * blockDim.x );

    int pos = getCurThreadIdx();
    while ( pos < num )
    {
        outArr[ pos ] = inArr[ pos ];
        pos += stride;
    }
}
Main.cu:
#include "FooDevice.h"
// Host entry point: allocates two device int buffers, zero-fills the input,
// launches the template kernel fooKernel on them, and frees the buffers.
//
// Returns 0 when every CUDA runtime call succeeded, 1 otherwise.
int main()
{
    const int num = 10;
    int* dInArr = NULL;
    int* dOutArr = NULL;
    // Byte counts are size_t, matching what the CUDA runtime calls take.
    const size_t arrSize = num * sizeof( *dInArr );

    cudaError_t status = cudaMalloc( &dInArr, arrSize );
    if ( status == cudaSuccess )
        status = cudaMalloc( &dOutArr, arrSize );

    // Give the input defined contents so the kernel does not copy
    // uninitialized device memory.
    if ( status == cudaSuccess )
        status = cudaMemset( dInArr, 0, arrSize );

    if ( status == cudaSuccess )
    {
        // Using template kernel
        fooKernel<<< 10, 10 >>>( dInArr, num, dOutArr );
        // A kernel launch has no return value; query its status explicitly.
        status = cudaGetLastError();
    }

    // Always release both buffers; freeing a NULL pointer is a no-op, so this
    // is safe even when an allocation above failed.
    const cudaError_t freeInStatus = cudaFree( dInArr );
    const cudaError_t freeOutStatus = cudaFree( dOutArr );
    if ( status == cudaSuccess )
        status = ( freeInStatus != cudaSuccess ) ? freeInStatus : freeOutStatus;

    return ( status == cudaSuccess ) ? 0 : 1;
}