problem using templates and friend functions in CUDA I can't compile templated friend functions

EDIT #2:

I have finally come up with the test case I’m actually stymied by. My original post (below) was a matter of a syntax error, but wasn’t fully reproducing the problem I’m seeing with a larger piece of code. The “solution” to my compilation problem in my original post (removing the template< typename > declaration at the top of the friend function) doesn’t work in this case.

#include <cuda.h>

#include <iostream>

template < typename T >

class xyzVec

{

private:

  T x_, y_, z_;

public:

  __device__

  xyzVec() : x_(T(0)), y_(T(0)), z_(T(0)) {}

  __device__

  xyzVec( T x, T y, T z ) : x_(x), y_(y), z_(z) {}

friend

  __device__

  xyzVec

  operator + ( xyzVec const & a, xyzVec const & b )

  {

    return xyzVec( a.x_ + b.x_, a.y_ + b.y_, a.z_ + b.z_ );

  }

__device__ T length() const { return sqrt( x_*x_ + y_*y_ + z_*z_ ); }

__device__ T dot( xyzVec const & other ) const { return x_*other.x_ + y_*other.y_ + z_*other.z_; }

friend

  __device__

  T cos_of( xyzVec const & a, xyzVec const & b ) {

    T const mag = a.length() * b.length();

    if ( mag > T( 0 )) {

      return  a.dot( b ) / mag;

    } else {

      return T( 1 );

    }

  }

};

//template< typename T >

//__device__

//xyzVec< T >

//operator + ( xyzVec<T> const & a, xyzVec<T> const & b );

template < typename T >

__device__

T cos_of( xyzVec<T> const & a, xyzVec<T> const & b );

template< class T >

__global__

void

device_add( T * d_dest, T * d_src1, T * d_src2, unsigned int n ) {

  unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;

  if ( index <= n ) {

    xyzVec< T > a(1,2,3);

    xyzVec< T > b(2,3,4);

    xyzVec< T > c( a+b );

    d_dest[ index ] = d_src1[ index ] + d_src2[ index ] + cos_of( a, b );

  }

}

void

add_vectors_on_device(

  float * h_dest,

  float * h_src1,

  float * h_src2,

  unsigned int n

)

{

  std::cout << "arrived in add_vectors_on_device" << std::endl;

float * d_dest, * d_src1, * d_src2;

  int err1 = cudaMalloc( & d_dest, sizeof( float ) * n );

  int err2 = cudaMalloc( & d_src1, sizeof( float ) * n );

  int err3 = cudaMalloc( & d_src2, sizeof( float ) * n );

if ( err1 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_dest.  Err: " << err1 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err2 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src1.  Err: " << err2 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err3 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src2.  Err: " << err3 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

cudaMemcpy( d_src1, h_src1, sizeof( float ) * n, cudaMemcpyHostToDevice );

  cudaMemcpy( d_src2, h_src2, sizeof( float ) * n, cudaMemcpyHostToDevice );

unsigned int n_blocks = ( n + 255 ) / 256;

  device_add<<< n_blocks, 256 >>>( d_dest, d_src1, d_src2, n );

cudaMemcpy( h_dest, d_dest, sizeof( float ) * n, cudaMemcpyDeviceToHost );

  int err4 = cudaFree( d_dest ); d_dest = 0;

  int err5 = cudaFree( d_src1 ); d_src1 = 0;

  int err6 = cudaFree( d_src2 ); d_src2 = 0;

  if ( err4 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_dest.  Err: " << err4 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err5 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src1.  Err: " << err5 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err6 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src2.  Err: " << err6 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

}

int

main()

{

bool error( false );

  unsigned int num_floats = 1000;

  float * h_dest; float * h_src1; float * h_src2;

  h_dest = (float*) malloc( sizeof( float ) * num_floats );

  h_src1 = (float*) malloc( sizeof( float ) * num_floats );

  h_src2 = (float*) malloc( sizeof( float ) * num_floats );

  for ( int ii = 0; ii < num_floats; ++ii ) {

    h_src1[ ii ] = ii;

    h_src2[ ii ] = num_floats - ii;

  }

  add_vectors_on_device( h_dest, h_src1, h_src2, num_floats );

  for ( int ii = 0; ii < num_floats; ++ii ) {

    if( std::abs( h_dest[ ii ] - num_floats) > 1e-5 ) {

      std::cout << "Error with position " << ii << " in add_vectors_on_device: " << std::abs( h_dest[ ii ] - (num_floats)) << std::endl;

      error = true;

    }

  }

  free( (void*) h_dest );

  free( (void*) h_src1 );

  free( (void*) h_src2 );

  if ( ! error ) {

    std::cout << "SUCCESS" << std::endl;

  }

}

which produces this error:

w$ nvcc -o test  -I/usr/local/cuda/include -lcuda main.cu

main.cu(30): error: calling a __device__ function("length") from a __host__ function("cos_of") is not allowed

main.cu(30): error: calling a __device__ function("length") from a __host__ function("cos_of") is not allowed

main.cu(32): error: calling a __device__ function("dot") from a __host__ function("cos_of") is not allowed

3 errors detected in the compilation of "/tmp/tmpxft_00003cd8_00000000-9_main.cpp4.ii".

but if I put in the “template< typename >” preface to the function,

template< typename >

  friend

  __device__

  T cos_of( xyzVec const & a, xyzVec const & b ) {

    T const mag = a.length() * b.length();

    if ( mag > T( 0 )) {

      return  a.dot( b ) / mag;

    } else {

      return T( 1 );

    }

  }

I get this error:

./main.cu(61): Error: External calls are not supported (found non-inlined call to _Z6cos_ofIfET_RK6xyzVecIS0_ES4_)

If I comment out the “external” declaration of this function, I get this error:

$ nvcc -o test  -I/usr/local/cuda/include -lcuda main.cu

main.cu(61): error: no instance of "cos_of" matches the argument list

            argument types are: (xyzVec<float>, xyzVec<float>)

          detected during instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(88): here

1 error detected in the compilation of "/tmp/tmpxft_00003d61_00000000-4_main.cpp1.ii".

I’m not sure why “operator +” and “cos_of” should have a different syntax, but I can work around that. What’s got me most puzzled is that I cannot seem to get this “cos_of” function implemented at all.

Any help would be appreciated.

ORIGINAL POST BELOW

I have boiled my problem down to a pretty small test case. I want to define a friend function for a templated class and have this friend function run on the device. I can’t get past the compiler errors, though I’ve tried very many things.

Here is my example code:

#include <cuda.h>

#include <iostream>

template < typename T >

class xyzVec

{

private:

  T x_, y_, z_;

public:

  __device__ xyzVec() : x_(T(0)), y_(T(0)), z_(T(0)) {}

  __device__ xyzVec( T x, T y, T z ) : x_(x), y_(y), z_(z) {}

template< typename >

  friend

  __device__

  xyzVec

  operator + ( xyzVec const & a, xyzVec const & b ) {

    return xyzVec( a.x_ + b.x_, a.y_ + b.y_, a.z_ + b.z_ );

  }

};

template< typename T >

__device__

xyzVec<T>

operator + ( xyzVec<T> const & a, xyzVec<T> const & b );

template< class T >

__global__

void

device_add( T * d_dest, T * d_src1, T * d_src2, unsigned int n ) {

  unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;

  if ( index <= n ) {

    //xyzVec< T > a(1,2,3);

    //xyzVec< T > b(2,3,4);

    //xyzVec< T > c( a+b );

    d_dest[ index ] = d_src1[ index ] + d_src2[ index ];// + cos_of( a, b );

  }

}

void

add_vectors_on_device(

  float * h_dest,

  float * h_src1,

  float * h_src2,

  unsigned int n

)

{

  std::cout << "arrived in add_vectors_on_device" << std::endl;

float * d_dest, * d_src1, * d_src2;

  int err1 = cudaMalloc( & d_dest, sizeof( float ) * n );

  int err2 = cudaMalloc( & d_src1, sizeof( float ) * n );

  int err3 = cudaMalloc( & d_src2, sizeof( float ) * n );

if ( err1 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_dest.  Err: " << err1 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err2 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src1.  Err: " << err2 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err3 != 0 ) { std::cerr << "Could not allocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src2.  Err: " << err3 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

cudaMemcpy( d_src1, h_src1, sizeof( float ) * n, cudaMemcpyHostToDevice );

  cudaMemcpy( d_src2, h_src2, sizeof( float ) * n, cudaMemcpyHostToDevice );

unsigned int n_blocks = ( n + 255 ) / 256;

  device_add<<< n_blocks, 256 >>>( d_dest, d_src1, d_src2, n );

cudaMemcpy( h_dest, d_dest, sizeof( float ) * n, cudaMemcpyDeviceToHost );

  int err4 = cudaFree( d_dest ); d_dest = 0;

  int err5 = cudaFree( d_src1 ); d_src1 = 0;

  int err6 = cudaFree( d_src2 ); d_src2 = 0;

  if ( err4 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_dest.  Err: " << err4 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err5 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src1.  Err: " << err5 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

  if ( err6 != 0 ) { std::cerr << "Could not deallocate block of size " << sizeof( float ) * n << " bytes on device when allocating space for d_src2.  Err: " << err6 << " from " << __FILE__ << " " << __LINE__ << std::endl; exit(1); }

}

int

main()

{

bool error( false );

  unsigned int num_floats = 1000;

  float * h_dest; float * h_src1; float * h_src2;

  h_dest = (float*) malloc( sizeof( float ) * num_floats );

  h_src1 = (float*) malloc( sizeof( float ) * num_floats );

  h_src2 = (float*) malloc( sizeof( float ) * num_floats );

  for ( int ii = 0; ii < num_floats; ++ii ) {

    h_src1[ ii ] = ii;

    h_src2[ ii ] = num_floats - ii;

  }

  add_vectors_on_device( h_dest, h_src1, h_src2, num_floats );

  for ( int ii = 0; ii < num_floats; ++ii ) {

    if( std::abs( h_dest[ ii ] - num_floats) > 1e-5 ) {

      std::cout << "Error with position " << ii << " in add_vectors_on_device: " << std::abs( h_dest[ ii ] - (num_floats)) << std::endl;

      error = true;

    }

  }

  free( (void*) h_dest );

  free( (void*) h_src1 );

  free( (void*) h_src2 );

  if ( ! error ) {

    std::cout << "SUCCESS" << std::endl;

  }

}

which I can compile with:

nvcc -o test  -I/usr/local/cuda/include -lcuda main.cu

and it compiles and runs. But if I uncomment the use of the xyzVec class in “device_add”, I am unable to work past the compilation problems. That is, if I change device_add as follows:

template< class T >

__global__

void

device_add( T * d_dest, T * d_src1, T * d_src2, unsigned int n ) {

  unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;

  if ( index <= n ) {

    xyzVec< T > a(1,2,3);

    xyzVec< T > b(2,3,4);

    xyzVec< T > c( a+b );

    d_dest[ index ] = d_src1[ index ] + d_src2[ index ];// + cos_of( a, b );                                                                                                                    

  }

}

I get this error messages:

$ nvcc -o test  -I/usr/local/cuda/include -lcuda main.cu

./main.cu(38): Error: External calls are not supported (found non-inlined call to _ZplIfE6xyzVecIT_ERKS2_S4_)

If I comment out the outside-of-the-class declaration of operator + (lines 25–28), I get this error message:

main.cu(38): error: no operator "+" matches these operands

            operand types are: xyzVec<float> + xyzVec<float>

          detected during instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(66): here

1 error detected in the compilation of "/tmp/tmpxft_00003bc0_00000000-4_main.cpp1.ii".

If I move the function body outside of the class, and simply forward-declare “operator +” (replacing the old class declaration and the function declaration) as shown in the code below,

template < typename T >

class xyzVec

{

private:

  T x_, y_, z_;

public:

  __device__

  xyzVec() : x_(T(0)), y_(T(0)), z_(T(0)) {}

  __device__

  xyzVec( T x, T y, T z ) : x_(x), y_(y), z_(z) {}

template< typename >

  friend

  __device__

  xyzVec

  operator + ( xyzVec const & a, xyzVec const & b );

};

template< typename T >

__device__

xyzVec< T >

operator + ( xyzVec<T> const & a, xyzVec<T> const & b )

{

  return xyzVec< T >( a.x_ + b.x_, a.y_ + b.y_, a.z_ + b.z_ );

}

then I get this error message:

$ nvcc -o test  -I/usr/local/cuda/include -lcuda main.cu

main.cu(28): error: member "xyzVec<T>:<img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/bloated.gif' class='bbc_emoticon' alt=':X' />_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

main.cu(28): error: member "xyzVec<T>:<img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/bloated.gif' class='bbc_emoticon' alt=':X' />_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

main.cu(28): error: member "xyzVec<T>::y_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

main.cu(28): error: member "xyzVec<T>::y_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

main.cu(28): error: member "xyzVec<T>::z_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

main.cu(28): error: member "xyzVec<T>::z_ [with T=float]"

(8): here is inaccessible

          detected during:

            instantiation of "xyzVec<T> operator+(const xyzVec<T> &, const xyzVec<T> &) [with T=float]" 

(40): here

            instantiation of "void device_add(T *, T *, T *, unsigned int) [with T=float]" 

(68): here

6 errors detected in the compilation of "/tmp/tmpxft_00003bd3_00000000-4_main.cpp1.ii".

… and at this point, I have exhausted all permutations of this code that I can think of.

Any help would be appreciated.

EDIT:

It seems like one more permutation and it works:

template < typename T >

class xyzVec

{

private:

  T x_, y_, z_;

public:

  __device__

  xyzVec() : x_(T(0)), y_(T(0)), z_(T(0)) {}

  __device__

  xyzVec( T x, T y, T z ) : x_(x), y_(y), z_(z) {}

friend

  __device__

  xyzVec

  operator + ( xyzVec const & a, xyzVec const & b )

  {

    return xyzVec( a.x_ + b.x_, a.y_ + b.y_, a.z_ + b.z_ );

  }

};

//template< typename T >                                                                                                                                                                        

//__device__                                                                                                                                                                                    

//xyzVec< T >                                                                                                                                                                                   

//operator + ( xyzVec<T> const & a, xyzVec<T> const & b );

I thought I had tried this, so, I’m a little puzzled at the moment. Sorry for the confusion.