import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd

Assuming you have data \(X_1, \ldots, X_N \sim D\) from which you want to calculate a confidence interval of the expectation of \(X\), \(\mathbb{E}[X]\) where \(X \sim D\), there’s two options: t-distribution or bootstrap.

N = 101
rng = np.random.default_rng()
x = rng.binomial(10, 0.5, size = N)

t-Distribution CI

xbar = np.mean(x)
s = np.std(x)
se = s / np.sqrt(N)
t = st.t(df = N - 1).ppf([0.025, 0.975]) # 95%
xbar + t * se

array([4.91784634, 5.51779723])

Bootstrap CI

R = 1001
bms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    bms[r] = np.mean(x[idx])
np.quantile(bms, [0.025, 0.975])

array([4.91089109, 5.5049505 ])

df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv")

df.head()

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	year
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	male	2007
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	female	2007
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	female	2007
3	Adelie	Torgersen	NaN	NaN	NaN	NaN	NaN	2007
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	female	2007

x = df["bill_length_mm"]

xbar = np.mean(x)
N = np.sum(~x.isna()) # np.size(x) # NaNs
se = np.sqrt(np.var(x) / N)
t = st.t(df = N - 1).ppf([0.025, 0.975])
xbar + t * se

array([43.34209692, 44.50176273])

R = 1_000
rng = np.random.default_rng()
cms = np.zeros(R)
for r in range(R):
    idx = rng.integers(N, size = N)
    cms[r] = np.mean(x[idx])
np.quantile(cms, [0.025, 0.975])

array([43.30323207, 44.41865553])

rng = np.random.default_rng()
idx = rng.integers(N, size = N)
x[idx]

15     36.6
69     41.8
165    48.4
240    47.5
193    49.6
       ... 
241    52.1
166    45.8
265    51.5
94     36.2
118    35.7
Name: bill_length_mm, Length: 342, dtype: float64

Other Formats

t-Distribution CI

Bootstrap CI