import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pdAssuming you have data \(X_1, \ldots, X_N \sim D\) from which you want to calculate a confidence interval of the expectation of \(X\), \(\mathbb{E}[X]\) where \(X \sim D\), there’s two options: t-distribution or bootstrap.
N = 101
rng = np.random.default_rng()
x = rng.binomial(10, 0.5, size = N)t-Distribution CI
xbar = np.mean(x)
s = np.std(x)
se = s / np.sqrt(N)
t = st.t(df = N - 1).ppf([0.025, 0.975]) # 95%
xbar + t * searray([4.91784634, 5.51779723])Bootstrap CI
R = 1001
bms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    bms[r] = np.mean(x[idx])
np.quantile(bms, [0.025, 0.975])array([4.91089109, 5.5049505 ])df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv")df.head()| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 | 
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 | 
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 | 
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 | 
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 | 
x = df["bill_length_mm"]xbar = np.mean(x)
N = np.sum(~x.isna()) # np.size(x) # NaNs
se = np.sqrt(np.var(x) / N)
t = st.t(df = N - 1).ppf([0.025, 0.975])
xbar + t * searray([43.34209692, 44.50176273])R = 1_000
rng = np.random.default_rng()
cms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    cms[r] = np.mean(x[idx])
np.quantile(cms, [0.025, 0.975])array([43.30323207, 44.41865553])rng = np.random.default_rng()
idx = rng.integers(N, size = N)
x[idx]15     36.6
69     41.8
165    48.4
240    47.5
193    49.6
       ... 
241    52.1
166    45.8
265    51.5
94     36.2
118    35.7
Name: bill_length_mm, Length: 342, dtype: float64