Regression problems are those where we try to fit a real-valued function to given data, so that we can interpolate or extrapolate its values at new inputs.
In particular, linear regression is simply fitting a line to a collection of data points.
Let's show an example of "linear-like" data that would benefit from linear regression:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
xs = np.linspace(-10, 10, 100)
m = 5
n = 10
# Create some linear-like data
ys = (m * xs + n) + np.random.normal(0, 4, xs.size) # add random noise
The code above generates noisy linear data with a given slope $m$ and intercept $n$, following $y = m x + n$.
Since we have 2D (x,y) pairs of points, we can display them using a scatter plot:
plt.scatter(xs, ys)
plt.grid()
As we can see, this data clearly has a line-like nature. NumPy's np.polyfit
function fits a polynomial of a given degree to data, so we can use it with degree 1 to get a line estimate:
mhat, nhat = np.polyfit(xs, ys, 1)
print(mhat, nhat)
4.909841138001781 10.431102318932654
The estimates are close to the true values $m = 5$ and $n = 10$. Let's show the estimated line alongside our data:
plt.plot(xs, xs * mhat + nhat, color='tab:orange')
plt.scatter(xs, ys)
plt.grid()
As we can see, visually this looks like a good fit. However, we would also like to assign the line some kind of score that quantifies how good the fit is, and there are several metrics we can use.
One is the mean squared error (MSE), which measures the average squared difference between the actual $y$ values and the predicted $\hat{y}$ values from our linear regression model; the closer it is to zero, the better the fit. There's also the root mean squared error (RMSE), which is just the square root of the MSE. Finally, there is the $R^2$ metric, which signals a better fit as its score approaches 1.0.
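Written out, the standard definitions are (with $y_i$ the actual values, $\hat{y}_i$ the model's predictions, $\bar{y}$ the mean of the actual values, and $N$ the number of points):

$$\mathrm{MSE} = \frac{1}{N}\sum_{i=1}^{N}(y_i - \hat{y}_i)^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad R^2 = 1 - \frac{\sum_{i=1}^{N}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{N}(y_i - \bar{y})^2}$$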
We can quickly copy the definitions from the textbook:
def MSE(estimated_y, correct_y):
    """
    Mean Squared Error between the estimated_y and correct_y values.
    Inputs are both numpy arrays.
    """
    result = 0
    N = estimated_y.size
    for i in range(N):
        result += (estimated_y[i] - correct_y[i]) ** 2
    return result / N

def RMSE(estimated_y, correct_y):
    """
    Root Mean Squared Error between the estimated_y and correct_y values.
    Inputs are both numpy arrays.
    """
    return MSE(estimated_y, correct_y) ** 0.5

def R_squared(estimated_y, correct_y):
    """
    R^2 score between the estimated_y and correct_y values.
    Inputs are both numpy arrays.
    """
    MSE_of_estimates = MSE(estimated_y, correct_y)
    mean_y = correct_y.mean()
    mean_y_array = np.full(correct_y.size, mean_y)  # an array filled with mean-y
    MSE_wrt_mean_of_y = MSE(estimated_y, mean_y_array)
    result = 1 - MSE_of_estimates / MSE_wrt_mean_of_y
    return result
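As an aside, since the inputs are NumPy arrays, the explicit loop in MSE isn't strictly necessary; a minimal vectorized equivalent (the name MSE_vectorized is just for illustration) could look like this:
def MSE_vectorized(estimated_y, correct_y):
    # Same result as MSE above, computed with NumPy's element-wise operations
    return np.mean((estimated_y - correct_y) ** 2)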
MSE(ys, (xs * mhat + nhat))
17.03008891929832
RMSE(ys, (xs * mhat + nhat))
4.126752829925524
R_squared(ys, (xs * mhat + nhat))
0.9796489148199724
As we see, the MSE is relatively low and the $R^2$ score is close to 1, meaning that the fit is quite good quantitatively as well. Note that since our data is noisy, we can't find any line that would have an MSE of 0.0; with noise of standard deviation 4, the best we can hope for is an MSE of around $4^2 = 16$, which is roughly what we got.
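As a quick sanity check, we could also compute the MSE of the noisy data against the true underlying line $y = 5x + 10$; we'd expect a value near 16 as well, though the exact number varies with the random noise draw:
MSE(ys, m * xs + n)  # compare against the true, noise-free line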
Here's an example of some data which is not linear, but still polynomial in nature:
a = 1
b = 4
c = 4
ys = (a * xs * xs + b * xs + c) + np.random.normal(0, 4, xs.size) # add random noise
plt.scatter(xs, ys)
plt.grid()
Let's see what happens when we try to fit a line:
m, n = np.polyfit(xs, ys, 1)
plt.plot(xs, xs * m + n, label='estimated line', color='orange')
plt.scatter(xs, ys)
plt.legend()
As we see, the fit is visually not very good. The MSE makes this even more obvious:
MSE(ys, (xs * m + n))
939.2141146961674
Now let's try to fit a second-degree polynomial instead:
ahat, bhat, chat = np.polyfit(xs, ys, 2)
plt.plot(xs, ahat * xs * xs + bhat * xs + chat, color='orange')
plt.scatter(xs, ys)
MSE(ys, (ahat * xs * xs + bhat * xs + chat))
15.052760650909258
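We could also compute the $R^2$ score of the parabola the same way as before; given how closely the curve tracks the data, we'd expect a score close to 1, with the exact value depending on the noise draw:
R_squared(ys, ahat * xs * xs + bhat * xs + chat)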
Much better. Finally, let's see what we can do with some non-linear data from a different family:
ys = 4 * np.sin(xs) - 5 + np.random.randn(xs.size)
plt.scatter(xs, ys)
By looking at the data, we can see that it is wave-like and periodic, so we would like to fit some kind of sinusoid to it. To do this, we can use SciPy's curve_fit
and pass it a sine-shaped model function:
# Import optimize module
from scipy import optimize

def sinelike(x, a, b):
    return a * np.sin(x) + b

# Fit the sine model to the data:
solution, _ = optimize.curve_fit(sinelike, xs, ys, method='lm')
print("The estimated solution is: ", solution)
The estimated solution is: [ 4.14373463 -5.03095775]
plt.scatter(xs, ys)
yhat = [sinelike(x, solution[0], solution[1]) for x in xs]
plt.plot(xs, yhat, color='orange')
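Note that since np.sin operates element-wise on NumPy arrays, the list comprehension above isn't strictly needed; an equivalent vectorized call would be:
yhat = sinelike(xs, *solution)  # same values, returned directly as a NumPy array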
MSE(np.array(ys), np.array(yhat))
0.9337464287047195
R_squared(np.array(ys), np.array(yhat))
0.8974051269093574
Again, both the plot and the metrics indicate that this is a good fit for the generated data, and the estimated parameters are close to the true values of $4$ and $-5$ that we used to generate it.