In [3]:
import numpy as np
import pandas as pd
from scipy.stats import norm as normal
from scipy.optimize import minimize
import bplot as bp
import patsy

Simple Linear Regression

In [4]:
def ll_normal(beta, yX):
    y = yX[:, 0]
    X = yX[:, 1:]
    N = X.shape[0]
    mu = np.full(N, np.nan)
    for n in range(N):
        mu[n] = np.sum(X[n, :] * beta)
    d = y - mu
    return np.sum(d * d)

def optim(data, initval = None):
    k = data.shape[1] - 1
    if not initval:
        initval = np.random.normal(size = k)
    return minimize(ll_normal, initval, args=(data), method="BFGS")['x']

def bootstrap(data, R, fun):
    N, k = data.shape
    k -= 1
    thetas = np.full((R, k), np.nan)
    for r in range(R):
        idx = np.random.choice(N, N, replace=True)
        thetas[r, :] = fun(data[idx, :])
    return thetas

R = 1001
In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/master/elmhurst.csv")
In [39]:
# Explanitory variable (x-axis) is family income, and (y-axis) is the resulting gift aid.

# We are making predictions on gift aid given family income.
In [7]:
X = patsy.dmatrix("~ family_income", data=df)
yX = np.c_[df['gift_aid'], X]
In [8]:
betahat = minimize(ll_normal, np.random.normal(size=2), args=yX, method="BFGS")['x']
In [9]:
bp.point(df["family_income"], df["gift_aid"])
bp.labels(x = "Familly Income ($100)", y = "Gift aid ($1000)", size = 18)
# betahat[1] * df['family_income']
bp.line(df['family_income'], betahat[0] + betahat[1] * df['family_income'], color = 'red')
Out[9]:
[<matplotlib.lines.Line2D at 0x11d1adc10>]
In [43]:
# For every $1000 increase in family income, we expect gift aid to decrease by ~$40
betahat #24.32 + -0.04(Xn)
Out[43]:
array([24.31933199, -0.04307168])
In [53]:
u = 24.32 + -0.04 * (100) # the units are $1000, so really we are inputing $100,000

# How Edward did it
np.sum(betahat * np.asarray([1, 100]))
Out[53]:
20.012163905185982
In [10]:
betas = bootstrap(yX, R, optim)
/Users/ez/venvs/py3/lib/python3.7/site-packages/scipy/optimize/optimize.py:1048: RuntimeWarning: divide by zero encountered in double_scalars
  rhok = 1.0 / (numpy.dot(yk, sk))
/Users/ez/venvs/py3/lib/python3.7/site-packages/scipy/optimize/optimize.py:1048: RuntimeWarning: divide by zero encountered in double_scalars
  rhok = 1.0 / (numpy.dot(yk, sk))
/Users/ez/venvs/py3/lib/python3.7/site-packages/scipy/optimize/optimize.py:1048: RuntimeWarning: divide by zero encountered in double_scalars
  rhok = 1.0 / (numpy.dot(yk, sk))
/Users/ez/venvs/py3/lib/python3.7/site-packages/scipy/optimize/optimize.py:1048: RuntimeWarning: divide by zero encountered in double_scalars
  rhok = 1.0 / (numpy.dot(yk, sk))
In [11]:
"""
If a family aint' got no money, we are 80% confident the gift aid will be between 22.82 and 25.98 thousands
of dollars

For every $1000 dollar increase in income, we are 80% confident that there is decrease by between 0.029 to 0.06
in $1000 dollars of gift aid
"""
np.percentile(betas, [10, 90], axis=0).T
Out[11]:
array([[22.73844549, 25.9926342 ],
       [-0.05997412, -0.02882947]])
In [12]:
giftaidhat = np.full(R, np.nan)
for r in range(R):
    giftaidhat[r] = np.sum(np.asarray([1,100]) * betas[r, :])
In [15]:
# vectorized
gitfaidhat = np.sum(np.asarray([1, 100] * betas), axis=1)
In [16]:
bp.density(giftaidhat)
Out[16]:
[<matplotlib.lines.Line2D at 0x11d050d90>]
In [17]:
"""
we are 80% confident that as the family income reaches $100,000 the gift aid will be between
19.17 and 20.81 $1000s of dollars

"""

np.percentile(giftaidhat, [10, 90])
Out[17]:
array([19.05822484, 20.80087528])
In [18]:
giftaidhat = np.full(R, np.nan)
for r in range(R):
    bp.line(df['family_income'], betas[r, 0] + betas[r, 1] * df['family_income'],
           color="orange", alpha=0.01)
    giftaidhat[r] = np.sum(np.asarray([1,100]) * betas[r, :])

bp.point(df["family_income"], df["gift_aid"])
bp.labels(x = "Familly Income ($1000)", y = "Gift aid ($1000)", size = 18)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f44de90>