Overloading between device and host?

Hi, I’d like to overload C++ operators so they work both on the device and on the host, with a different implementation for each. Is this possible? As an example, I am trying to use polymorphism so that operator+ runs a GPU implementation on the device, while the host version calls a kernel that uses this operator.
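
For reference, here is a minimal sketch of the interval type the snippets below assume; only the two-value constructor and the lower()/upper() accessors matter here, everything else about my real class is omitted:

// Minimal sketch of the interval type assumed below; only the
// two-value constructor and lower()/upper() are actually used.
template <class T>
class interval_gpu
{
public:
    __host__ __device__ interval_gpu() {}
    __host__ __device__ interval_gpu(T const &l, T const &u) : low(l), up(u) {}
    __host__ __device__ T const &lower() const { return low; }
    __host__ __device__ T const &upper() const { return up; }
private:
    T low;
    T up;
};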

// This function runs on the device
template<class T> inline __device__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
    rounded_arith<T> rnd;
    return interval_gpu<T>(rnd.add_down(x.lower(), y.lower()),
                           rnd.add_up(x.upper(), y.upper()));
}

// kernel to call in the host implementation of the operator
template <class T>
__global__ void add(T a, T b, T *c){
    *c = a + b;
}

template<class T> inline __host__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
    interval_gpu<T> c;
    interval_gpu<T> *d_c;
    // Run the addition in a single-thread kernel, then copy the result back
    cudaMalloc((void**)&d_c, sizeof(interval_gpu<T>));
    add<<<1,1>>>(x, y, d_c);
    cudaDeviceSynchronize();
    cudaMemcpy(&c, d_c, sizeof(interval_gpu<T>), cudaMemcpyDeviceToHost);
    cudaFree(d_c);
    return c;
}
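
The goal is that ordinary host code like this (hypothetical values) would pick up the kernel-launching overload:

// Hypothetical host-side usage that should route through
// the kernel-launching overload above
interval_gpu<float> a(1.0f, 2.0f);
interval_gpu<float> b(3.0f, 4.0f);
interval_gpu<float> c = a + b;   // launches add<<<1,1>>> under the hood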

So far I get errors about redeclaring operator+. Is there a workaround?

I’d be ok with a solution such as:

template<class T> inline __host__ __device__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
  //if on gpu execute A (device)
  //else execute B (host)
  return result;
}

But can I find out at runtime if I am on the device or on the host? How would I do that?
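
The closest thing I’ve found so far is __CUDA_ARCH__, which as far as I understand splits the two paths at compile time rather than at runtime; something like this sketch, though I’m not sure a compile-time split covers my case:

// Compile-time split: __CUDA_ARCH__ is only defined while nvcc is
// compiling the device pass, so each pass keeps exactly one branch.
template<class T> inline __host__ __device__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
#ifdef __CUDA_ARCH__
    // device pass: compute directly with rounded arithmetic
    rounded_arith<T> rnd;
    return interval_gpu<T>(rnd.add_down(x.lower(), y.lower()),
                           rnd.add_up(x.upper(), y.upper()));
#else
    // host pass: launch the kernel and copy the result back
    interval_gpu<T> c;
    interval_gpu<T> *d_c;
    cudaMalloc((void**)&d_c, sizeof(interval_gpu<T>));
    add<<<1,1>>>(x, y, d_c);
    cudaDeviceSynchronize();
    cudaMemcpy(&c, d_c, sizeof(interval_gpu<T>), cudaMemcpyDeviceToHost);
    cudaFree(d_c);
    return c;
#endif
}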

I just tried the following code, with no idea whether it would work, but I get a message that I can’t call a kernel from a function declared on the device with my architecture (compute capability 2.1); launching kernels from device code needs compute capability 3.5 or above. Can anyone think of another way I could do this on my 2.1 device?

// Binary operators
template <class T>
__global__ void add(T a, T b, T *c){
    *c = a + b;
}

template<class T> inline __device__ __host__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
    // Attempt at runtime detection: threadIdx and cudaGetLastError
    // were my guesses at telling device and host apart
    int i = threadIdx.x;
    cudaError_t error = cudaGetLastError();
    if(error == cudaSuccess){
        // assume we are on the device: compute directly
        rounded_arith<T> rnd;
        return interval_gpu<T>(rnd.add_down(x.lower(), y.lower()),
                               rnd.add_up(x.upper(), y.upper()));
    }
    else{
        // assume we are on the host: launch the kernel and copy back
        interval_gpu<T> c;
        interval_gpu<T> *d_c;
        cudaMalloc((void**)&d_c, sizeof(interval_gpu<T>));
        add<<<1,1>>>(x, y, d_c);
        cudaDeviceSynchronize();
        cudaMemcpy(&c, d_c, sizeof(interval_gpu<T>), cudaMemcpyDeviceToHost);
        cudaFree(d_c);
        return c;
    }
}
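
For what it’s worth, from what I’ve read, even on a 3.5+ card a kernel launch from device code needs relocatable device code and the device runtime library, e.g. something like (interval.cu is just a placeholder name):

nvcc -arch=sm_35 -rdc=true interval.cu -o interval -lcudadevrt   # interval.cu is a placeholder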