import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import scipy.stats as st
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/donkeys.csv")
x = df["Weight"]
N = np.size(x)

Mean

(1,) + (3,) + (5,)
(1, 3, 5)
def bootstrap(T, data, R = 1_000, conflevel = 0.95, **kwargs):
    shp = np.shape(T(data, **kwargs))
    rng = np.random.default_rng()
    Ts = np.zeros((R,) + shp)
    N = np.shape(data)[0]
    a = (1 - conflevel) / 2
    for r in range(R):
        idx = rng.integers(N, size = N)   
        Ts[r] = T(data[idx], **kwargs)
    return np.quantile(Ts, [a, 1 - a], axis = 0)    
bootstrap(np.mean, x)
array([149.80675551, 154.35491728])
bootstrap(np.median, x)
array([152., 158.])
bootstrap(np.std, x)
array([24.52093543, 28.42205867])
bootstrap(lambda z: np.sqrt(np.var(z)), x, conflevel = 0.98)
array([24.20485749, 28.84863963])

Linear regression prediction

def ll_regression(theta, data):
    x = data["x"]
    y = data["y"]
    b0 = theta[0]
    b1 = theta[1]
    return np.sum( (y - (b0 + b1 * x)) ** 2)
def ll_grad(theta, data):
    x = data["x"]
    y = data["y"]
    b0 = theta[0]
    b1 = theta[1]
    lm = b0 + b1 * x
    db0 = np.sum(y - lm)
    db1 = np.sum( x * (y - lm) )
    return -2 * np.array([db0, db1])
rng = np.random.default_rng()
init = rng.normal(size = 2)
d = {
    "x": df["Height"],
    "y": df["Weight"]
}
o = minimize(ll_regression, init, jac = ll_grad, args = (d,))
o.x
array([-309.30049353,    4.55262576])
plt.scatter(df["Height"], df["Weight"])
y0 = np.sum(o.x * np.array([1, 50]))
y1 = np.sum(o.x * np.array([1, 120]))
plt.plot(np.array([50, 120]), np.array([y0, y1]), color = "black");

# work in progress; right idea, but needed new
# 1. function T
# 2. data structure
R = 1_000
def bootstrap2(T, data, R = 1_000, conflevel = 0.95):
    ws = np.zeros(R)
    rng = np.random.default_rng()
    for r in range(R):
        init = rng.normal(size = 2)
        idx = rng.integers(N, size = N)
        d = {
            "x": df["Height"][idx],
            "y": df["Weight"][idx]
        }
        o = minimize(ll_regression, init, jac = ll_grad, args = (d,))
        ws[r] = np.sum(o.x * np.array([1, 80]))
    return np.quantile(ws, [0.025, 0.975])
np.sum(o.x * np.array([1, 80]))
54.909567021646296
# leaving this here so we can reproduce it
plt.scatter(df["Height"], df["Weight"])
for r in range(R):
    y0 = np.sum(coefs[r] * np.array([1, 50]))
    y1 = np.sum(coefs[r] * np.array([1, 120]))
    plt.plot(np.array([50, 120]), np.array([y0, y1]), color = "black", alpha = 0.01);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 4
      2 plt.scatter(df["Height"], df["Weight"])
      3 for r in range(R):
----> 4     y0 = np.sum(coefs[r] * np.array([1, 50]))
      5     y1 = np.sum(coefs[r] * np.array([1, 120]))
      6     plt.plot(np.array([50, 120]), np.array([y0, y1]), color = "black", alpha = 0.01);

NameError: name 'coefs' is not defined

a = np.array([[1,6], [2, 7], [3, 8], [4, 9], [5, 10]])
a
a[[2, 1]]
a[1: , 1]
a[2:4, 1]
np.sum(a, axis = 0) # collapse 0th axis, left with 2 columns
np.sum(a, axis = 1) # collapse 1st axis, left with 5 rows
N = np.shape(df)[0]
data = np.c_[df["Weight"], np.ones(N), df["Height"]]
def ll_regression(theta, data):
    X = data[:, 1:]
    y = data[:, 0]
    lm = np.sum(X * theta, axis = 1)
    return np.sum( (y - lm) ** 2 )
    
def ll_grad(theta, data):
    X = data[:, 1:]
    height = X[:, 1]
    y = data[:, 0]
    lm = np.sum(X * theta, axis = 1)
    db0 = np.sum(y - lm)
    db1 = np.sum( height * (y - lm) )
    return -2 * np.array([db0, db1])
def ll_regression_prediction(data, **kwargs):
    rng = np.random.default_rng()
    init = rng.normal(size = np.shape(data[:, 1:])[1])
    o = minimize(ll_regression, init, jac = ll_grad, args = (data,))
    return np.sum(o.x * np.array(kwargs["xnew"]))
bootstrap(ll_regression_prediction, data, xnew = [1, 80])
array([46.64412514, 63.03909406])