The following is a script I put together for anyone to follow. The dependencies are NumPy, cuPy and Numba.
import numpy as np
import cupy as cp
from numba import cuda
import math
@cuda.jit
def elementwise_matrix_multiplication_3D(A_d,B_d,C_d,N,M,K):
'''
INPUTS:
A_d : np array, shape (N,M,K) matrix.
B_d : np array, shape (N,M,K) matrix.
N : int, number of rows.
M : int, number of columns.
K : int, number of depth.
OUTPUTS:
C_d : np array, shape (N,M,K) matrix.
DETAILS:
GPU Accelerated elementwise matrix multiplication.
'''
thidx = cuda.blockDim.x*cuda.blockIdx.x + cuda.threadIdx.x
thidy = cuda.blockDim.y*cuda.blockIdx.y + cuda.threadIdx.y
thidz = cuda.blockDim.z*cuda.blockIdx.z + cuda.threadIdx.z
if thidx < N and thidy < M and thidz < K:
prod = A_d[thidx,thidy,thidz]*B_d[thidx,thidy,thidz]
#print('A:(',thidx,thidy,thidz,'):',A_d[thidx,thidy,thidz],'B:(',thidx,thidy,thidz,'):',B_d[thidx,thidy,thidz],'Prod:(',thidx,thidy,thidz,'):',prod)
#cuda.atomic.add(C_d,(thidx,thidy,thidz),prod)
C_d[thidx,thidy,thidz] = prod
cuda.syncthreads()
@cuda.jit
def FWF_FeatureMap_DEBUG(X_d,X_vec_d,X_e,X_p,X_n,x_d,N,L,Order,sigma,factorial_list_d):
thidx = cuda.blockDim.x*cuda.blockIdx.x + cuda.threadIdx.x
thidy = cuda.blockDim.y*cuda.blockIdx.y + cuda.threadIdx.y
thidz = cuda.blockDim.z*cuda.blockIdx.z + cuda.threadIdx.z
if thidx < N and thidy < L and thidz < Order:
exp_part = math.exp((-x_d[thidx, thidy]**2)/(2*sigma**2))
pow_part = x_d[thidx, thidy]**thidz
norm_part = (sigma**thidz) * math.sqrt(factorial_list_d[thidz])
X_e[thidx,thidy] = exp_part
X_p[thidx,thidy,thidz] = pow_part
X_n[thidz] = norm_part
X_d[thidx,thidy,thidz] = exp_part * pow_part / norm_part
X_vec_d[thidx,Order*thidy+thidz] = exp_part * pow_part / norm_part
cuda.syncthreads()
def FWF_ACC_CPUOnly_DEBUG(x,d,xtest,Order,sigma,rcond=1e-15):
'''
x: np array, shape (N,L), input time-series data.
d: np array, shape (N,), output time-series data.
xtest: np array, shape (Ntest,L), input test time-series data.
dtest: np array, shape (Ntest,), output test time-series data.
Order: int, maximum order considered.
sigma: float, kernel spread.
rcond: float, regularization constant.
'''
#region Define the constants
N,L = x.shape
Ntest = xtest.shape[0]
O = np.arange(Order)
sigma_pow = np.power(sigma,O)
factorial_sqrt = np.sqrt(np.array([math.factorial(o) for o in O],dtype=float))
#endregion
#region Vectorized feature map
#print(x.shape)
x_e = x[:,:,np.newaxis]
#print(x_e.shape)
xtest_e = xtest[:,:,np.newaxis]
X_e = np.exp(-x_e**2/(2*sigma**2))
X_p = x_e**O
X_n = sigma_pow*factorial_sqrt
X = (X_e * X_p/X_n).reshape(N,L,Order)
X_vec = (np.exp(-x_e**2/(2*sigma**2)) * (x_e**O)/(sigma_pow*factorial_sqrt)).reshape(N,-1)
Xtest = (np.exp(-xtest_e**2/(2*sigma**2)) * (xtest_e**O)/(sigma_pow*factorial_sqrt)).reshape(Ntest,L,Order)
Xtest_vec = (np.exp(-xtest_e**2/(2*sigma**2)) * (xtest_e**O)/(sigma_pow*factorial_sqrt)).reshape(Ntest,-1)
D = np.repeat(d[:, np.newaxis], Order, axis=1)
D_e = np.repeat(D[:, np.newaxis, :], L, axis=1)
#endregion
#region Calculate predictions
V = (X_vec.T@X_vec)/N
Vinv = np.linalg.pinv(V,rcond=rcond)
Ptemp = X*D_e
P = np.mean(Ptemp,axis=0).flatten()
W = Vinv@P
Y = X_vec@W
Ytest = Xtest_vec@W
#endregion
return Y,Ytest,X,X_vec,X_e,X_p,X_n,D_e,V,Ptemp,P,W
def FWF_ACC_DEBUG(x,d,xtest,Order,sigma,rcond=1e-15):
#region Define the constants
N = x.shape[0]
L = x.shape[1]
Nt = xtest.shape[0]
OL = Order*L
#endregion
#region Allocate the host memory
f_list = np.array([np.math.factorial(i) for i in range(Order)],dtype=np.float64)
#endregion
#region Allocate the device memory
X_d = cp.zeros((N,L,Order),dtype=cp.float64)
X_vec_d = cp.zeros((N,OL),dtype=cp.float64)
X_e_d = cp.zeros((N,L),dtype=cp.float64)
X_p_d = cp.zeros((N,L,Order),dtype=cp.float64)
X_n_d = cp.zeros((Order,),dtype=cp.float64)
Xt_d = cp.zeros((Nt,L,Order),dtype=cp.float64)
Xt_vec_d = cp.zeros((Nt,OL),dtype=cp.float64)
Xt_e_d = cp.zeros((N,L),dtype=cp.float64)
Xt_p_d = cp.zeros((N,L,Order),dtype=cp.float64)
Xt_n_d = cp.zeros((Order,),dtype=cp.float64)
#endregion
#region Copy the data to the device
x_d = cp.asarray(x)
xt_d = cp.asarray(xtest)
d_d = cp.asarray(d)
f_list_d = cp.asarray(f_list)
#endregion
#region Initialize the device memory
D_d = cp.repeat(d_d[:, cp.newaxis], Order, axis=1)
D_e_d = cp.repeat(D_d[:, cp.newaxis, :], L, axis=1)
D = cp.asnumpy(D_e_d)
#endregion
#region Create the feature map for the training data
FWF_FeatureMap_DEBUG[(N//32+1,L//8+1,Order//4+1),(32,8,4)](X_d,X_vec_d,X_e_d,X_p_d,X_n_d,x_d,N,L,Order,sigma,f_list_d)
FWF_FeatureMap_DEBUG[(Nt//32+1,L//8+1,Order//4+1),(32,8,4)](Xt_d,Xt_vec_d,Xt_e_d,Xt_p_d,Xt_n_d,xt_d,Nt,L,Order,sigma,f_list_d)
X = cp.asnumpy(X_d)
X_vec = cp.asnumpy(X_vec_d)
X_e = cp.asnumpy(X_e_d)
X_p = cp.asnumpy(X_p_d)
X_n = cp.asnumpy(X_n_d)
#endregion
#region Calculate the covariance matrix
V_d = X_vec_d.T@X_vec_d/N
#endregion
#region Get the inverse of the covariance matrix
V = cp.asnumpy(V_d)
Vinv_d = cp.linalg.pinv(V_d,rcond=rcond)
#endregion
#region Calculate the projection matrix
#Calculate the projection matrix
Ptemp_d = X_d*D_e_d
Ptemp = cp.asnumpy(Ptemp_d)
#Calculate the column mean
P_d = cp.mean(Ptemp_d,axis=0).flatten()
P = cp.asnumpy(P_d)
#endregion
#region Calculate the weights
W_d = Vinv_d@P_d
W = cp.asarray(W_d)
#endregion
#region Calculate the predictions
Y_d = X_vec_d@W_d
Yt_d = Xt_vec_d@W_d
Y = cp.asnumpy(Y_d)
Yt = cp.asnumpy(Yt_d)
#endregion
#region Free the memory
del X_d,X_vec_d,Xt_d,Xt_vec_d,V_d,Vinv_d,D_d,Ptemp_d,P_d,W_d,Y_d,Yt_d,f_list_d
#endregion
return Y,Yt,X,X_vec,X_e,X_p,X_n,D,V,Ptemp,P,W
if __name__ == '__main__':
#Load the x's & d's
x = np.loadtxt('x.txt')
d = np.loadtxt('d.txt')
xtest = np.loadtxt('xtest.txt')
test_N,test_L = x.shape
test_Order = 8
test_sigma = 0.125
test_rcond = 2**(-25)
pTrain_FWF,pTest_FWF,X_FWF,X_vec_FWF,X_e_FWF,X_p_FWF,X_n_FWF,D_FWF,V_FWF,Ptemp_FWF,P_FWF,W_FWF=FWF_ACC_CPUOnly_DEBUG(x,d,xtest,test_Order,test_sigma,test_rcond)
pTrain_FWFE,pTest_FWFE,X_FWFE,X_vec_FWFE,X_e_FWFE,X_p_FWFE,X_n_FWFE,D_FWFE,V_FWFE,Ptemp_FWFE,P_FWFE,W_FWFE=FWF_ACC_DEBUG(x,d,xtest,test_Order,test_sigma,test_rcond)
#region checking elementwise matrix multiplication
test_mat = np.arange(test_N*test_L*test_Order).reshape(test_N,test_L,test_Order)
test_mat2 = (np.arange(test_N*test_L*test_Order)*2+1).reshape(test_N,test_L,test_Order)
test_mult = test_mat*test_mat2
test_mat_d = cuda.to_device(test_mat)
test_mat2_d = cuda.to_device(test_mat2)
test_mult_d = cuda.device_array((test_N,test_L,test_Order),dtype=np.float64)
elementwise_matrix_multiplication_3D[(2,1,1),(16,8,8)](test_mat_d,test_mat2_d,test_mult_d,test_N,test_L,test_Order)
test_mult_device = test_mult_d.copy_to_host()
print('Synthetic Test Matrix Multiplication')
print(test_mult.shape)
print(np.allclose(test_mult,test_mult_device))
test_Ptemp_d = cp.zeros((test_N,test_L,test_Order),dtype=cp.float64)
test_X_d = cp.array(X_FWFE)
test_D_d = cp.array(D_FWFE)
elementwise_matrix_multiplication_3D[(2,1,1),(16,8,8)](test_X_d,test_D_d,test_Ptemp_d,test_N,test_L,test_Order)
test_Ptemp = cp.asnumpy(test_Ptemp_d)
print('Compare with Function Output')
print(np.allclose(Ptemp_FWF,Ptemp_FWFE))
print('Compare with Externally Computed Output')
print(np.allclose(Ptemp_FWF,test_Ptemp))
#endregion