Hi, I’d like to overload the c++ operators to work both on devices and on host with different implementations each. Is this possible? As an example, I am trying to use polimorphism to run the operator+ in the device with a gpu implementation and on the host, by calling a kernel that uses this operator.
// This function runs on the device
template<class T> inline __device__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
rounded_arith<T> rnd;
return interval_gpu<T>(rnd.add_down(x.lower(), y.lower()),
rnd.add_up(x.upper(), y.upper()));
}
// kernel to call in the host implementation of the operator
template <class T>
__global__ void add(T a, T b, T *c){
*c = a + b;
}
template<class T> inline __host__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
interval_gpu<T> c;
interval_gpu<T> *d_c;
cudaMalloc((void**)&d_c, sizeof(interval_gpu<T>));
add<<<1,1>>>(x, y, d_c);
cudaDeviceSynchronize();
cudaMemcpy(&c, d_c, sizeof(interval_gpu<T>), cudaMemcpyDeviceToHost);
cudaFree(d_c);
return c;
}
So far I get errors for redeclaring the operator+, is there a workaround?
I’d be ok with a solution such as:
template<class T> inline __host__ __device__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
//if on gpu execute A (device)
//else execute B (host)
return result
}
But can I find out at runtime if I am on the device or on the host? How would I do that?
I just tried the following code, with no idea if it would work, but get the message I cant call a Kernel from a function declared on the device in my architecture (2.1), this would need architecture 3.5 or above (compute capability)… Anyone can think of another way i could use on my 2.1?
// Binary operators
template <class T>
__global__ void add(T a, T b, T *c){
*c = a + b;
}
template<class T> inline __device__ __host__
interval_gpu<T> operator+(interval_gpu<T> const &x, interval_gpu<T> const &y)
{
int i = threadIdx.x;
cudaError_t error = cudaGetLastError();
if(error == cudaSuccess){
rounded_arith<T> rnd;
return interval_gpu<T>(rnd.add_down(x.lower(), y.lower()),
rnd.add_up(x.upper(), y.upper()));
}
else{
interval_gpu<T> c;
interval_gpu<T> *d_c;
cudaMalloc((void**)&d_c, sizeof(interval_gpu<T>));
add<<<1,1>>>(x, y, d_c);
cudaDeviceSynchronize();
cudaMemcpy(&c, d_c, sizeof(interval_gpu<T>), cudaMemcpyDeviceToHost);
cudaFree(d_c);
return c;
}
}