import pandas as pd
import numpy as np
import scipy.stats as spicyGrouping DataFrames
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv")df.head()| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
xbar = np.mean(df["bill_length_mm"])
s = np.std(df["bill_length_mm"])
N = np.sum(~df["bill_length_mm"].isna())
t = spicy.t(df = N - 1).ppf([0.025, 0.975]) # ppf(0.975)
xbar + t * s / np.sqrt(N)np.shape(df)(344, 8)
df.columnsIndex(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
dtype='str')
Ngdf = df.groupby(["sex", "island"], as_index = False)gdf<pandas.api.typing.DataFrameGroupBy object at 0x1438c8380>
for name, grouped_df in gdf:
print(name)
print(np.mean(grouped_df["bill_length_mm"]))('female', 'Biscoe')
43.30749999999999
('female', 'Dream')
42.29672131147541
('female', 'Torgersen')
37.55416666666667
('male', 'Biscoe')
47.11927710843374
('male', 'Dream')
46.116129032258065
('male', 'Torgersen')
40.58695652173913
sdf = gdf.aggregate(mn = ("bill_length_mm", np.mean),
sd = ("bill_length_mm", np.std))
# creates a new grouped dataframe
# based on given parameters, where
# parameters follow this pattern
# name_var = (col_name, function)sdf| sex | island | mn | sd | |
|---|---|---|---|---|
| 0 | female | Biscoe | 43.307500 | 4.151439 |
| 1 | female | Dream | 42.296721 | 5.488288 |
| 2 | female | Torgersen | 37.554167 | 2.161400 |
| 3 | male | Biscoe | 47.119277 | 4.662655 |
| 4 | male | Dream | 46.116129 | 5.720512 |
| 5 | male | Torgersen | 40.586957 | 2.960949 |
Indexing DataFrames
idx = np.array([0, 2, 3, 5]) # np.arange(4)
sdf["mn"][idx] # not really my recommended strategy0 43.307500
2 37.554167
3 47.119277
5 40.586957
Name: mn, dtype: float64
sdf["mn"]0 43.307500
1 42.296721
2 37.554167
3 47.119277
4 46.116129
5 40.586957
Name: mn, dtype: float64
df.head()| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
bdx = df["bill_length_mm"] > 40 # create a boolean array
df.loc[bdx] # use method loc to index df by boolean arrayFalse
sdx = df["sex"] == "male"
idx = df["island"] == "Dream"
ndx = ~df["body_mass_g"].isna()
bbdx = df["body_mass_g"] > 4000
# find the male penguins from the island dream
# with not missing data in the column body mass
# whose weight is greater than 4000
df.loc[sdx & idx & ndx & bbdx]| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 35 | Adelie | Dream | 39.2 | 21.1 | 196.0 | 4150.0 | male | 2007 |
| 39 | Adelie | Dream | 39.8 | 19.1 | 184.0 | 4650.0 | male | 2007 |
| 43 | Adelie | Dream | 44.1 | 19.7 | 196.0 | 4400.0 | male | 2007 |
| 45 | Adelie | Dream | 39.6 | 18.8 | 190.0 | 4600.0 | male | 2007 |
| 49 | Adelie | Dream | 42.3 | 21.2 | 191.0 | 4150.0 | male | 2007 |
| 91 | Adelie | Dream | 41.1 | 18.1 | 205.0 | 4300.0 | male | 2008 |
| 93 | Adelie | Dream | 39.6 | 18.1 | 186.0 | 4450.0 | male | 2008 |
| 95 | Adelie | Dream | 40.8 | 18.9 | 208.0 | 4300.0 | male | 2008 |
| 97 | Adelie | Dream | 40.3 | 18.5 | 196.0 | 4350.0 | male | 2008 |
| 99 | Adelie | Dream | 43.2 | 18.5 | 192.0 | 4100.0 | male | 2008 |
| 133 | Adelie | Dream | 37.5 | 18.5 | 199.0 | 4475.0 | male | 2009 |
| 139 | Adelie | Dream | 39.7 | 17.9 | 193.0 | 4250.0 | male | 2009 |
| 146 | Adelie | Dream | 39.2 | 18.6 | 190.0 | 4250.0 | male | 2009 |
| 289 | Chinstrap | Dream | 52.0 | 18.1 | 201.0 | 4050.0 | male | 2007 |
| 291 | Chinstrap | Dream | 50.5 | 19.6 | 201.0 | 4050.0 | male | 2007 |
| 295 | Chinstrap | Dream | 49.2 | 18.2 | 195.0 | 4400.0 | male | 2007 |
| 301 | Chinstrap | Dream | 52.0 | 19.0 | 197.0 | 4150.0 | male | 2007 |
| 305 | Chinstrap | Dream | 52.8 | 20.0 | 205.0 | 4550.0 | male | 2008 |
| 307 | Chinstrap | Dream | 54.2 | 20.8 | 201.0 | 4300.0 | male | 2008 |
| 309 | Chinstrap | Dream | 51.0 | 18.8 | 203.0 | 4100.0 | male | 2008 |
| 313 | Chinstrap | Dream | 52.0 | 20.7 | 210.0 | 4800.0 | male | 2008 |
| 315 | Chinstrap | Dream | 53.5 | 19.9 | 205.0 | 4500.0 | male | 2008 |
| 321 | Chinstrap | Dream | 50.8 | 18.5 | 201.0 | 4450.0 | male | 2009 |
| 323 | Chinstrap | Dream | 49.0 | 19.6 | 212.0 | 4300.0 | male | 2009 |
| 329 | Chinstrap | Dream | 50.7 | 19.7 | 203.0 | 4050.0 | male | 2009 |
| 333 | Chinstrap | Dream | 49.3 | 19.9 | 203.0 | 4050.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
jdx = (df["body_mass_g"] > 40) | (df["body_mass_g"] < 35)
df.loc[jdx & idx]| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 30 | Adelie | Dream | 39.5 | 16.7 | 178.0 | 3250.0 | female | 2007 |
| 31 | Adelie | Dream | 37.2 | 18.1 | 178.0 | 3900.0 | male | 2007 |
| 32 | Adelie | Dream | 39.5 | 17.8 | 188.0 | 3300.0 | female | 2007 |
| 33 | Adelie | Dream | 40.9 | 18.9 | 184.0 | 3900.0 | male | 2007 |
| 34 | Adelie | Dream | 36.4 | 17.0 | 195.0 | 3325.0 | female | 2007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | male | 2009 |
| 340 | Chinstrap | Dream | 43.5 | 18.1 | 202.0 | 3400.0 | female | 2009 |
| 341 | Chinstrap | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
| 343 | Chinstrap | Dream | 50.2 | 18.7 | 198.0 | 3775.0 | female | 2009 |
124 rows × 8 columns
bdf = df.sort_values("bill_length_mm") # sort an entire dataframe by a column
bdf.head()| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 142 | Adelie | Dream | 32.1 | 15.5 | 188.0 | 3050.0 | female | 2009 |
| 98 | Adelie | Dream | 33.1 | 16.1 | 178.0 | 2900.0 | female | 2008 |
| 70 | Adelie | Torgersen | 33.5 | 19.0 | 190.0 | 3600.0 | female | 2008 |
| 92 | Adelie | Dream | 34.0 | 17.1 | 185.0 | 3400.0 | female | 2008 |
| 8 | Adelie | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | NaN | 2007 |
# index by integer location (iloc), not label location (loc)
bdf.iloc[4] # == bdf.loc[8]species Adelie
island Torgersen
bill_length_mm 34.1
bill_depth_mm 18.1
flipper_length_mm 193.0
body_mass_g 3475.0
sex NaN
year 2007
Name: 8, dtype: object
bdf.iloc[0] # == bdf.loc[142]species Adelie
island Dream
bill_length_mm 32.1
bill_depth_mm 15.5
flipper_length_mm 188.0
body_mass_g 3050.0
sex female
year 2009
Name: 142, dtype: object
# index both rows (by boolean array bbdx) and columns by name
# using loc
# if columns are specified with a list => dataframe
# if column is specified with a str => Series
df.loc[bbdx, ["island", "bill_depth_mm", "year"]]| island | bill_depth_mm | year | |
|---|---|---|---|
| 7 | Torgersen | 19.6 | 2007 |
| 9 | Torgersen | 20.2 | 2007 |
| 14 | Torgersen | 21.1 | 2007 |
| 17 | Torgersen | 20.7 | 2007 |
| 19 | Torgersen | 21.5 | 2007 |
| ... | ... | ... | ... |
| 321 | Dream | 18.5 | 2009 |
| 323 | Dream | 19.6 | 2009 |
| 329 | Dream | 19.7 | 2009 |
| 333 | Dream | 19.9 | 2009 |
| 342 | Dream | 19.0 | 2009 |
172 rows × 3 columns
bdf.loc[np.arange(10)]| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | male | 2007 |
| 6 | Adelie | Torgersen | 38.9 | 17.8 | 181.0 | 3625.0 | female | 2007 |
| 7 | Adelie | Torgersen | 39.2 | 19.6 | 195.0 | 4675.0 | male | 2007 |
| 8 | Adelie | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | NaN | 2007 |
| 9 | Adelie | Torgersen | 42.0 | 20.2 | 190.0 | 4250.0 | NaN | 2007 |
bdf.iloc[1:10, df.columns.get_indexer(["island", "bill_depth_mm", "year"])]| island | bill_depth_mm | year | |
|---|---|---|---|
| 98 | Dream | 16.1 | 2008 |
| 70 | Torgersen | 19.0 | 2008 |
| 92 | Dream | 17.1 | 2008 |
| 8 | Torgersen | 18.1 | 2007 |
| 18 | Torgersen | 18.4 | 2007 |
| 54 | Biscoe | 18.1 | 2008 |
| 80 | Torgersen | 17.2 | 2008 |
| 14 | Torgersen | 21.1 | 2007 |
| 52 | Biscoe | 17.9 | 2008 |
# recommendations for indexing DataFrames
# by boolean array, use loc, whether or not you want columns by name
# by integer array, use iloc
# if integer array, and columns by name use df.columns.get_indexerdf.loc[3, "bill_length_mm"] = 5df.loc[3, "body_mass_g"] = 1342343df.loc[3, "bill_depth_mm"] = 3np.size(df["bill_depth_mm"])
#df["bill_depth_mm"] = np.arange(344)344
df.loc[3, ["bill_depth_mm", "body_mass_g", "bill_length_mm"]] = [3, 4, 6]df.loc[3, ["bill_depth_mm", "body_mass_g", "bill_length_mm"]]bill_depth_mm 3.0
body_mass_g 4.0
bill_length_mm 6.0
Name: 3, dtype: float64