from __future__ import annotations

import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt


# Solves the regularized least squares (RLS) problem for the given X_train, Y_train.
# alpha is the regularization coefficient.
# Output is the estimated y for both X_train and X_test.
def solve_ols(
    X_train: npt.NDArray[np.float64],
    Y_train: npt.NDArray[np.float64],
    X_test: npt.NDArray[np.float64],
    alpha: float,
):
    beta = alpha * np.identity(np.shape(X_train)[1])
    # Closed-form solution: W = (X^T X + alpha * I)^{-1} X^T Y.
    W = np.dot(
        np.linalg.pinv(np.dot(X_train.T, X_train) + beta),
        np.dot(X_train.T, Y_train),
    )
    # print("Optimal W is ", W.flatten())
    return np.dot(X_train, W), np.dot(X_test, W)


# Solves the kernelized regularized least squares problem for the given K_train, Y_train.
# K_train is an n-by-n kernel matrix for the train data (n := #training_points).
# K_test is an n-by-m kernel matrix (n := #training_points, m := #test_points).
# It assumes that K_train + alpha * I is invertible (guaranteed for alpha > 0,
# since a kernel matrix is positive semidefinite).
# alpha is the regularization coefficient.
# Output is the estimated y for both the training and the test points.
def solve_kernel_ls(
    K_train: npt.NDArray[np.float64],
    Y_train: npt.NDArray[np.float64],
    K_test: npt.NDArray[np.float64],
    alpha: float,
):
    # Dual solution: a = (K + alpha * I)^{-1} Y; predictions are K a.
    a = np.dot(
        np.linalg.pinv(K_train + alpha * np.identity(np.shape(K_train)[0])),
        Y_train,
    )
    return np.dot(K_train, a), np.dot(K_test.T, a)


# If X1 and X2 are the same, this just computes the kernel matrix of X1.
# In general, it computes the kernel matrix corresponding to the rows of X1 and
# the rows of X2: entry (i, j) is exp(-||X1[i] - X2[j]||^2 / sigma^2).
# sigma is the bandwidth of the kernel.
def gaussian_kernel(X1: npt.NDArray[np.float64], X2: npt.NDArray[np.float64], sigma: float):
    m, n = np.shape(X2)[0], np.shape(X1)[0]
    # Vectorized form of computing the kernel matrix, using
    # ||x - x'||^2 = ||x||^2 + ||x'||^2 - 2 <x, x'>.
    norms_X1 = np.sum(np.multiply(X1, X1), axis=1)
    norms_X2 = np.sum(np.multiply(X2, X2), axis=1)
    K0 = np.tile(norms_X2, (n, 1)) + np.tile(norms_X1, (m, 1)).T - 2 * (X1 @ X2.T)
    return np.power(np.exp(-1.0 / sigma**2), K0)


def run_ols(
    X_train: npt.NDArray[np.float64],
    Y_train: npt.NDArray[np.float64],
    X_test: npt.NDArray[np.float64],
    Y_test: npt.NDArray[np.float64],
    alpha: float,
    plot_X_train: npt.NDArray[np.float64],
    plot_X_test: npt.NDArray[np.float64],
    description: str,
    kernel: bool = False,
):
    # When kernel is True, X_train and X_test are interpreted as the kernel
    # matrices K_train and K_test rather than raw feature matrices.
    # plot_X_train / plot_X_test carry the raw one-dimensional inputs for
    # plotting the fits (unused here; a plotting sketch appears at the end of
    # the script).
    if not kernel:
        Y_LS_train, Y_LS_test = solve_ols(X_train, Y_train, X_test, alpha)
    else:
        Y_LS_train, Y_LS_test = solve_kernel_ls(X_train, Y_train, X_test, alpha)
    print(description)
    print('Mean Squared Error (MSE) of train data:', np.square(np.linalg.norm(Y_LS_train - Y_train)) / Y_train.size)
    print('Mean Squared Error (MSE) of test data:', np.square(np.linalg.norm(Y_LS_test - Y_test)) / Y_test.size)


# Generate n data points based on a combination of sinusoidal and polynomial functions.
def generate_data(n: int):
    X = np.random.rand(n, 1) * 5
    X = np.sort(X, axis=0)
    Y = 10 + (X - 0.1) * (X - 0.1) * (X - 0.1) - 5 * (X - 0.5) * (X - 0.5) + 10 * X + 5 * np.sin(5 * X)
    # Adding noise.
    Y += 0.1 * np.random.randn(n, 1)
    return X, Y
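
# A minimal sanity-check sketch, not part of the original pipeline: with the
# linear kernel K = X X^T, solve_kernel_ls reproduces solve_ols exactly, by the
# push-through identity (X^T X + aI)^{-1} X^T = X^T (X X^T + aI)^{-1}.
# The helper name and the sizes below are illustrative; invoke it by hand
# (e.g. in a REPL) to verify that the primal and dual solvers agree.
def _check_linear_kernel_equivalence(n: int = 20, d: int = 3, alpha: float = 0.5):
    rng = np.random.default_rng(0)
    X = rng.standard_normal((n, d))
    Y = rng.standard_normal((n, 1))
    K = X @ X.T  # linear kernel matrix of the training points
    # Reuse the training points as test points, so K_test is the same n-by-n matrix.
    ols_train, ols_test = solve_ols(X, Y, X, alpha)
    ker_train, ker_test = solve_kernel_ls(K, Y, K, alpha)
    assert np.allclose(ols_train, ker_train) and np.allclose(ols_test, ker_test)
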
if __name__ == '__main__':
    # Number of training and test points.
    n_train = 50
    n_test = 1000
    # Regularization coefficient in RLS (lambda in the class).
    alpha = 0.1
    # Bandwidth of the Gaussian kernel.
    sigma = 0.1

    # Generating train and test data.
    X_train, Y_train = generate_data(n_train)
    X_test, Y_test = generate_data(n_test)

    # Homogeneous line/hyperplane (goes through the origin).
    run_ols(X_train, Y_train, X_test, Y_test, alpha, X_train, X_test, 'Homogeneous line')

    # Non-homogeneous line/hyperplane.
    # First we augment the data with an all-ones column/feature.
    X_augmented_train = np.concatenate((X_train, np.ones((n_train, 1))), axis=1)
    X_augmented_test = np.concatenate((X_test, np.ones((n_test, 1))), axis=1)
    # OLS on the augmented data.
    run_ols(X_augmented_train, Y_train, X_augmented_test, Y_test, alpha, X_train, X_test, 'Non-homogeneous line')

    # Kernelized least squares with the Gaussian kernel.
    K = gaussian_kernel(X_train, X_train, sigma)
    K_test = gaussian_kernel(X_train, X_test, sigma)
    run_ols(K, Y_train, K_test, Y_test, alpha, X_train, X_test, 'Gaussian kernel', True)
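
    # Visualization: a minimal sketch, assuming the matplotlib import and the
    # plot_X_* arguments of run_ols were intended for drawing the fits. It
    # overlays the Gaussian-kernel RLS predictions on the (sorted) test inputs.
    _, Y_hat_test = solve_kernel_ls(K, Y_train, K_test, alpha)
    plt.scatter(X_test, Y_test, s=4, alpha=0.4, label='test data')
    plt.plot(X_test, Y_hat_test, color='red', linewidth=2, label='Gaussian-kernel fit')
    plt.title('Kernelized regularized least squares')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.show()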