Grouping DataFrames

import pandas as pd
import numpy as np
import scipy.stats as spicy
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/penguins.csv")
df.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
3 Adelie Torgersen NaN NaN NaN NaN NaN 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007
xbar = np.mean(df["bill_length_mm"])
s = np.std(df["bill_length_mm"])
N = np.sum(~df["bill_length_mm"].isna())
t = spicy.t(df = N - 1).ppf([0.025, 0.975]) # ppf(0.975)
xbar + t * s / np.sqrt(N)
np.shape(df)
(344, 8)
df.columns
Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='str')
N
gdf = df.groupby(["sex", "island"], as_index = False)
gdf
<pandas.api.typing.DataFrameGroupBy object at 0x1438c8380>
for name, grouped_df in gdf:
    print(name)
    print(np.mean(grouped_df["bill_length_mm"]))
('female', 'Biscoe')
43.30749999999999
('female', 'Dream')
42.29672131147541
('female', 'Torgersen')
37.55416666666667
('male', 'Biscoe')
47.11927710843374
('male', 'Dream')
46.116129032258065
('male', 'Torgersen')
40.58695652173913
sdf = gdf.aggregate(mn = ("bill_length_mm", np.mean),
                   sd = ("bill_length_mm", np.std))
                   # creates a new grouped dataframe 
                   # based on given parameters, where
                   # parameters follow this pattern
                   # name_var = (col_name, function)
sdf
sex island mn sd
0 female Biscoe 43.307500 4.151439
1 female Dream 42.296721 5.488288
2 female Torgersen 37.554167 2.161400
3 male Biscoe 47.119277 4.662655
4 male Dream 46.116129 5.720512
5 male Torgersen 40.586957 2.960949

Indexing DataFrames

idx = np.array([0, 2, 3, 5]) # np.arange(4)
sdf["mn"][idx] # not really my recommended strategy
0    43.307500
2    37.554167
3    47.119277
5    40.586957
Name: mn, dtype: float64
sdf["mn"]
0    43.307500
1    42.296721
2    37.554167
3    47.119277
4    46.116129
5    40.586957
Name: mn, dtype: float64
df.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
3 Adelie Torgersen NaN NaN NaN NaN NaN 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007
bdx = df["bill_length_mm"] > 40 # create a boolean array
df.loc[bdx] # use method loc to index df by boolean array
False
sdx = df["sex"] == "male"
idx = df["island"] == "Dream"
ndx = ~df["body_mass_g"].isna()
bbdx = df["body_mass_g"] > 4000
# find the male penguins from the island dream
# with not missing data in the column body mass
# whose weight is greater than 4000
df.loc[sdx & idx & ndx & bbdx]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
35 Adelie Dream 39.2 21.1 196.0 4150.0 male 2007
39 Adelie Dream 39.8 19.1 184.0 4650.0 male 2007
43 Adelie Dream 44.1 19.7 196.0 4400.0 male 2007
45 Adelie Dream 39.6 18.8 190.0 4600.0 male 2007
49 Adelie Dream 42.3 21.2 191.0 4150.0 male 2007
91 Adelie Dream 41.1 18.1 205.0 4300.0 male 2008
93 Adelie Dream 39.6 18.1 186.0 4450.0 male 2008
95 Adelie Dream 40.8 18.9 208.0 4300.0 male 2008
97 Adelie Dream 40.3 18.5 196.0 4350.0 male 2008
99 Adelie Dream 43.2 18.5 192.0 4100.0 male 2008
133 Adelie Dream 37.5 18.5 199.0 4475.0 male 2009
139 Adelie Dream 39.7 17.9 193.0 4250.0 male 2009
146 Adelie Dream 39.2 18.6 190.0 4250.0 male 2009
289 Chinstrap Dream 52.0 18.1 201.0 4050.0 male 2007
291 Chinstrap Dream 50.5 19.6 201.0 4050.0 male 2007
295 Chinstrap Dream 49.2 18.2 195.0 4400.0 male 2007
301 Chinstrap Dream 52.0 19.0 197.0 4150.0 male 2007
305 Chinstrap Dream 52.8 20.0 205.0 4550.0 male 2008
307 Chinstrap Dream 54.2 20.8 201.0 4300.0 male 2008
309 Chinstrap Dream 51.0 18.8 203.0 4100.0 male 2008
313 Chinstrap Dream 52.0 20.7 210.0 4800.0 male 2008
315 Chinstrap Dream 53.5 19.9 205.0 4500.0 male 2008
321 Chinstrap Dream 50.8 18.5 201.0 4450.0 male 2009
323 Chinstrap Dream 49.0 19.6 212.0 4300.0 male 2009
329 Chinstrap Dream 50.7 19.7 203.0 4050.0 male 2009
333 Chinstrap Dream 49.3 19.9 203.0 4050.0 male 2009
342 Chinstrap Dream 50.8 19.0 210.0 4100.0 male 2009
jdx = (df["body_mass_g"] > 40) | (df["body_mass_g"] < 35)
df.loc[jdx & idx]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
30 Adelie Dream 39.5 16.7 178.0 3250.0 female 2007
31 Adelie Dream 37.2 18.1 178.0 3900.0 male 2007
32 Adelie Dream 39.5 17.8 188.0 3300.0 female 2007
33 Adelie Dream 40.9 18.9 184.0 3900.0 male 2007
34 Adelie Dream 36.4 17.0 195.0 3325.0 female 2007
... ... ... ... ... ... ... ... ...
339 Chinstrap Dream 55.8 19.8 207.0 4000.0 male 2009
340 Chinstrap Dream 43.5 18.1 202.0 3400.0 female 2009
341 Chinstrap Dream 49.6 18.2 193.0 3775.0 male 2009
342 Chinstrap Dream 50.8 19.0 210.0 4100.0 male 2009
343 Chinstrap Dream 50.2 18.7 198.0 3775.0 female 2009

124 rows × 8 columns

bdf = df.sort_values("bill_length_mm") # sort an entire dataframe by a column
bdf.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
142 Adelie Dream 32.1 15.5 188.0 3050.0 female 2009
98 Adelie Dream 33.1 16.1 178.0 2900.0 female 2008
70 Adelie Torgersen 33.5 19.0 190.0 3600.0 female 2008
92 Adelie Dream 34.0 17.1 185.0 3400.0 female 2008
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN 2007
# index by integer location (iloc), not label location (loc)
bdf.iloc[4] # == bdf.loc[8]
species                 Adelie
island               Torgersen
bill_length_mm            34.1
bill_depth_mm             18.1
flipper_length_mm        193.0
body_mass_g             3475.0
sex                        NaN
year                      2007
Name: 8, dtype: object
bdf.iloc[0] # == bdf.loc[142]
species              Adelie
island                Dream
bill_length_mm         32.1
bill_depth_mm          15.5
flipper_length_mm     188.0
body_mass_g          3050.0
sex                  female
year                   2009
Name: 142, dtype: object
# index both rows (by boolean array bbdx) and columns by name
# using loc
# if columns are specified with a list => dataframe
# if column is specified with a str => Series
df.loc[bbdx, ["island", "bill_depth_mm", "year"]]
island bill_depth_mm year
7 Torgersen 19.6 2007
9 Torgersen 20.2 2007
14 Torgersen 21.1 2007
17 Torgersen 20.7 2007
19 Torgersen 21.5 2007
... ... ... ...
321 Dream 18.5 2009
323 Dream 19.6 2009
329 Dream 19.7 2009
333 Dream 19.9 2009
342 Dream 19.0 2009

172 rows × 3 columns

bdf.loc[np.arange(10)]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 male 2007
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 female 2007
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 female 2007
3 Adelie Torgersen NaN NaN NaN NaN NaN 2007
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 female 2007
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 male 2007
6 Adelie Torgersen 38.9 17.8 181.0 3625.0 female 2007
7 Adelie Torgersen 39.2 19.6 195.0 4675.0 male 2007
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN 2007
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN 2007
bdf.iloc[1:10, df.columns.get_indexer(["island", "bill_depth_mm", "year"])]
island bill_depth_mm year
98 Dream 16.1 2008
70 Torgersen 19.0 2008
92 Dream 17.1 2008
8 Torgersen 18.1 2007
18 Torgersen 18.4 2007
54 Biscoe 18.1 2008
80 Torgersen 17.2 2008
14 Torgersen 21.1 2007
52 Biscoe 17.9 2008
# recommendations for indexing DataFrames
# by boolean array, use loc, whether or not you want columns by name
# by integer array, use iloc
# if integer array, and columns by name use  df.columns.get_indexer
df.loc[3, "bill_length_mm"] = 5
df.loc[3, "body_mass_g"] = 1342343
df.loc[3, "bill_depth_mm"] = 3
np.size(df["bill_depth_mm"])
#df["bill_depth_mm"] = np.arange(344)
344
df.loc[3, ["bill_depth_mm", "body_mass_g", "bill_length_mm"]] = [3, 4, 6]
df.loc[3, ["bill_depth_mm", "body_mass_g", "bill_length_mm"]]
bill_depth_mm     3.0
body_mass_g       4.0
bill_length_mm    6.0
Name: 3, dtype: float64