Converting Python code to a Numba CUDA implementation

I am attempting to convert the following code to run on a GPU, on 64-bit Debian Linux.

def loess_point(x,h,xp,yp):
	b=sum(w*xp)*sum(w*yp) - sum(w)*sum(w*xp*yp)
	b /= sum(w*xp)**2 - sum(w)*sum(x*xp**2)
	return a+b*x

One issue is the line of code that contains (x-xp) where x is an integer and xp is an array of floats.

The JIT compiler complains with the following error:

Failed at nopython (nopython frontend)
Invalid usage of - with parameters (uint32, array(float32, 1d, A)).

Not sure how to implement this differently. The code below is the full version of what I am attempting to compile/run.

import numpy as np
import math
from pylab import *
from numbapro import cuda
from numba import *

@cuda.jit(restype=float32,argtypes=[float32[:]], device = True)
def sum(x):
     sum_tmp = float32(0)
     for k in range(0,len(x)):
        sum_tmp += x[k]
     return sum_tmp

@cuda.jit(restype=float32, argtypes=[uint32, f8, float32[:],float32[:]], device=True)
def loess_point(x,h,xp,yp):
        b = float32(0)
        a = float32(0)
	w = math.exp(-0.5*(((x-xp)/h)**2)/math.sqrt(2*pi*h**2))
	b = sum(w*xp)*sum(w*yp) - sum(w)*sum(w*xp*yp)
	b /= sum(w*xp)**2 - sum(w)*sum(x*xp**2)
	a = (sum(w*yp)-b*sum(w*xp))/sum(w)
	return a+b*x

@cuda.jit(argtypes=[float32[:], float32[:], float32[:]])
def loess_kernel(X,Y, Result):
       startX = cuda.grid(1)
       gridX = cuda.gridDim.x * cuda.blockDim.x;
       #gridY = cuda.gridDim.y * cuda.blockDim.y;
       h = 50
       for k in range(startX, len(X), gridX):