import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd

Assuming you have data \(X_1, \ldots, X_N \sim D\) from which you want to calculate a confidence interval of the expectation of \(X\), \(\mathbb{E}[X]\) where \(X \sim D\), there’s two options: t-distribution or bootstrap.

N = 101
rng = np.random.default_rng()
x = rng.binomial(10, 0.5, size = N)

t-Distribution CI

xbar = np.mean(x)
s = np.std(x)
se = s / np.sqrt(N)
t = st.t(df = N - 1).ppf([0.025, 0.975]) # 95%
xbar + t * se
array([4.91784634, 5.51779723])

Bootstrap CI

R = 1001
bms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    bms[r] = np.mean(x[idx])
np.quantile(bms, [0.025, 0.975])
array([4.91089109, 5.5049505 ])
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv")
df.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
3 Adelie Torgersen NaN NaN NaN NaN NaN 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007
x = df["bill_length_mm"]
xbar = np.mean(x)
N = np.sum(~x.isna()) # np.size(x) # NaNs
se = np.sqrt(np.var(x) / N)
t = st.t(df = N - 1).ppf([0.025, 0.975])
xbar + t * se
array([43.34209692, 44.50176273])
R = 1_000
rng = np.random.default_rng()
cms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    cms[r] = np.mean(x[idx])
np.quantile(cms, [0.025, 0.975])
array([43.30323207, 44.41865553])
rng = np.random.default_rng()
idx = rng.integers(N, size = N)
x[idx]
15     36.6
69     41.8
165    48.4
240    47.5
193    49.6
       ... 
241    52.1
166    45.8
265    51.5
94     36.2
118    35.7
Name: bill_length_mm, Length: 342, dtype: float64