October 28th and 30th

In [5]:
import numpy as np
import pandas as pd
import bplot as bp
from scipy.stats import binom
from scipy.optimize import minimize
from scipy.special import loggamma
import patsy
gdf = Grouped Data Frame
df = Data Frame

___

Import Data
In [6]:
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/master/possum.csv")
df.shape
Out[6]:
(104, 8)
In [26]:
df.head
Out[26]:
<bound method NDFrame.head of      site    pop sex  age  headL  skullW  totalL  tailL
0       1    vic   m  8.0   94.1    60.4    89.0   36.0
1       1    vic   f  6.0   92.5    57.6    91.5   36.5
2       1    vic   f  6.0   94.0    60.0    95.5   39.0
3       1    vic   f  6.0   93.2    57.1    92.0   38.0
4       1    vic   f  2.0   91.5    56.3    85.5   36.0
..    ...    ...  ..  ...    ...     ...     ...    ...
99      7  other   m  1.0   89.5    56.0    81.5   36.5
100     7  other   m  1.0   88.6    54.7    82.5   39.0
101     7  other   f  6.0   92.4    55.0    89.0   38.0
102     7  other   m  4.0   91.5    55.2    82.5   36.5
103     7  other   f  3.0   93.6    59.9    89.0   40.0

[104 rows x 8 columns]>
In [25]:
df.groupby('pop').agg("mean")
Out[25]:
site age headL skullW totalL tailL
pop
other 5.482759 3.689655 92.606897 57.065517 86.787931 37.862069
vic 1.282609 4.022727 92.597826 56.654348 87.467391 35.934783
In [59]:
def ll_normal(beta, yX):
    y = yX[:, 0]
    X = yX[:, 1:]
    N = X.shape[0]
    mu = np.full(N, np.nan)
    for n in range(N):
        mu[n] = np.sum(X[n, :] * beta)
    d = y - mu
    return np.sum(d * d)
In [7]:
X = patsy.dmatrix(" ~C(pop)", data = df)
yX = np.c_[df['totalL'], X]
print(X.shape)
#X[:-1, :]
(104, 2)
In [61]:
betahat = minimize(ll_normal, np.random.normal(size=2),
                   args=(yX), method = "BFGS")['x']
print(np.round(betahat.sum(), 2))
print(np.round(betahat.sum))
Out[61]:
87.47

Generally

$$ Y_n \sim N(\mu_n, \sigma^2) $$$$ \mu = \beta_0 + \beta_1 * x_n $$

Specifically

$$ totalL_n \sim N(\mu_n, \sigma^2) $$$$ \mu_n = \beta_0 + \beta_1 * vic_n $$
In [77]:
for i, (name, gdf) in enumerate(df.groupby('pop')):
    x = np.repeat(i, gdf['totalL'].size)
    y = gdf['totalL']
    bp.jitter(x, y, jitter_y = 0, label=name, color=bp.color[i])
    bp.legend()
In [ ]:
 
In [ ]:
for idx, n in enumerate(range(14, 20)):
    print(idx, n)
In [ ]:
for idx, (name, gdf) in enumerate(df.groupby('pop')):
    print(idx)
    print(name)
    print(gdf)

"I'm gonna call these well-labelled plots" -Edward Roualdes

Notice the labeling of the axis and legend
In [13]:
bp.clear()
for i, (name, gdf) in enumerate(df.groupby('pop')):
    x = np.repeat(i, gdf['totalL'].size)
    y = gdf['totalL']
    bp.jitter(x, y, jitter_y = 0, label = name, color = bp.color[i])
    
bp.xticks([0, 1], np.unique(df['pop']))
bp.title("Well-Labelled Plot")
bp.labels(x = "Population", y = "Total Length(cm)")
bp.legend()
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x26d33179438>
In [62]:
df.groupby('pop').agg("mean")
Out[62]:
site age headL skullW totalL tailL
pop
other 5.482759 3.689655 92.606897 57.065517 86.787931 37.862069
vic 1.282609 4.022727 92.597826 56.654348 87.467391 35.934783