import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 100 evenly spaced sample positions on the interval [-10, 10].
xs = np.linspace(start=-10, stop=10, num=100)

# Slope and intercept of the line the synthetic data is generated from.
m, n = 5, 10

# Noisy observations: the exact line plus Gaussian noise (mean 0, std 4).
ys = (m * xs + n) + np.random.normal(loc=0, scale=4, size=xs.size)

# Show the raw samples.
plt.scatter(xs, ys)
plt.grid()

# Least-squares fit of a degree-1 polynomial; np.polyfit returns the
# coefficients highest power first, i.e. (slope, intercept).
mhat, nhat = np.polyfit(xs, ys, deg=1)
print(mhat, nhat)

5.067523508424096 10.338403758908862

# Draw the fitted line in orange, then the samples it was fitted to.
plt.plot(xs, mhat * xs + nhat, color='tab:orange')
plt.scatter(xs, ys)
plt.grid()

def MSE(estimated_y, correct_y):
    """
    Mean Squared Error between the estimated_y and correct_y values.

    Parameters:
        estimated_y: array-like of predicted values (numpy array, list, ...).
        correct_y: array-like of reference values, same shape as estimated_y.

    Returns:
        The mean of the squared element-wise differences (a numpy float).

    Raises:
        ValueError: if the inputs have different shapes or are empty.
    """
    estimated = np.asarray(estimated_y)
    correct = np.asarray(correct_y)

    # Refuse mismatched inputs instead of silently ignoring extra elements
    # or relying on accidental broadcasting.
    if estimated.shape != correct.shape:
        raise ValueError(
            "estimated_y and correct_y must have the same shape, got "
            f"{estimated.shape} and {correct.shape}"
        )
    if estimated.size == 0:
        raise ValueError("MSE is undefined for empty inputs")

    # Vectorized: one numpy expression instead of a Python-level loop.
    return np.mean((estimated - correct) ** 2)

def RMSE(estimated_y, correct_y):
  """
    Root Mean Squared Error: the square root of MSE(estimated_y, correct_y).
    Both arguments are forwarded unchanged to MSE.
  """
  mean_squared = MSE(estimated_y, correct_y)
  return mean_squared ** 0.5

def R_squared(estimated_y, correct_y):
    """
    Coefficient of determination (R^2) for predictions and observed values.

    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    residuals and SS_tot is the sum of squared deviations of the observed
    values from their mean (the "always predict the mean" baseline).

    The score is NOT symmetric in its arguments: the baseline is built from
    correct_y, so predictions must be passed first and observations second.

    Parameters:
        estimated_y: array-like of model predictions.
        correct_y: array-like of observed values, same shape as estimated_y.

    Returns:
        The R^2 score as a numpy float. When the observed values are constant
        (SS_tot == 0) the ratio is undefined; 1.0 is returned for a perfect
        prediction and 0.0 otherwise.

    Raises:
        ValueError: if the inputs have different shapes or are empty.
    """
    estimated = np.asarray(estimated_y, dtype=float)
    correct = np.asarray(correct_y, dtype=float)

    if estimated.shape != correct.shape:
        raise ValueError(
            "estimated_y and correct_y must have the same shape, got "
            f"{estimated.shape} and {correct.shape}"
        )
    if correct.size == 0:
        raise ValueError("R_squared is undefined for empty inputs")

    # The ratio of sums equals the ratio of the two MSEs (the 1/N cancels).
    ss_res = np.sum((correct - estimated) ** 2)
    ss_tot = np.sum((correct - correct.mean()) ** 2)

    # Constant observations: avoid 0/0 (NaN) or x/0 (-inf).
    if ss_tot == 0:
        return np.float64(1.0 if ss_res == 0 else 0.0)

    return 1 - ss_res / ss_tot

# Mean squared error of the fitted line (predictions first, observations second).
MSE(mhat * xs + nhat, ys)

np.float64(13.890365494547748)

# Root mean squared error of the fitted line, in the same units as ys.
RMSE(mhat * xs + nhat, ys)

np.float64(3.7269780646722013)

# R^2 of the fitted line. R_squared(estimated_y, correct_y) is not symmetric:
# the baseline variance must come from the observations, so the predictions
# go first and the observed ys second.
R_squared(xs * mhat + nhat, ys)

np.float64(0.9840941381977046)

# Coefficients of the quadratic a*x^2 + b*x + c used to generate new data.
a, b, c = 1, 4, 4
# Rebind ys to noisy samples of that quadratic (Gaussian noise: mean 0, std 4).
ys = (a * xs * xs + b * xs + c) + np.random.normal(loc=0, scale=4, size=xs.size)
plt.scatter(xs, ys)
plt.grid()

# Fit a straight line (degree 1) to the quadratic data. Note that this
# rebinds m and n to the fitted slope and intercept.
m, n = np.polyfit(xs, ys, deg=1)

# Fitted line over the samples, with a legend entry for the line.
plt.plot(xs, m * xs + n, label='estimated line', color='orange')
plt.scatter(xs, ys)
plt.legend()

<matplotlib.legend.Legend at 0x10ead6ba0>

# Mean squared error of the straight-line fit on the quadratic data.
MSE(m * xs + n, ys)

np.float64(938.017200528655)

# Least-squares fit of a degree-2 polynomial; coefficients come back
# highest power first: (x^2 term, x term, constant).
ahat, bhat, chat = np.polyfit(xs, ys, deg=2)

# Fitted parabola in orange over the samples.
plt.plot(xs, ahat * xs * xs + bhat * xs + chat, color='orange')
plt.scatter(xs, ys)

<matplotlib.collections.PathCollection at 0x10ea6b260>

# Mean squared error of the degree-2 fit (predictions first, observations second).
MSE(ahat * xs * xs + bhat * xs + chat, ys)

np.float64(15.630775125359312)

# Rebind ys to a noisy sine wave: amplitude 4, offset -5, standard-normal noise.
ys = 4 * np.sin(xs) - 5 + np.random.randn(*xs.shape)
plt.scatter(xs, ys)

<matplotlib.collections.PathCollection at 0x10eb2e780>

# Import optimize module
from scipy import optimize

def sinelike(x, a, b):
    """
    Sine model with amplitude a and vertical offset b: a * sin(x) + b.
    x may be a scalar or a numpy array (np.sin is applied element-wise).
    """
    scaled_sine = a * np.sin(x)
    return b + scaled_sine

# Fit the sine-like model a * sin(x) + b to the data with Levenberg-Marquardt.
# curve_fit returns (optimal parameters, covariance); the covariance is unused.
solution, _ = optimize.curve_fit(f=sinelike, xdata=xs, ydata=ys, method='lm')

print("The estimated solution is: ", solution)

The estimated solution is:  [ 3.93896208 -4.86172707]

plt.scatter(xs, ys)
# Evaluate the fitted model on every x at once: sinelike is already vectorized
# through np.sin, so no per-element Python loop is needed. yhat is an ndarray.
yhat = sinelike(xs, *solution)

plt.plot(xs, yhat, color='orange')

[<matplotlib.lines.Line2D at 0x1112cfd10>]

# Mean squared error of the sine fit (predictions first, observations second).
MSE(np.asarray(yhat), np.asarray(ys))

np.float64(0.728924909680623)

# R^2 of the sine fit. R_squared(estimated_y, correct_y) is not symmetric:
# the baseline variance must come from the observations, so the predictions
# (yhat) go first and the observed ys second.
R_squared(np.array(yhat), np.array(ys))

np.float64(0.9012331468531657)

Application: Regression problems