6 Modify, Group, Summarize
6.1 Modify data
# Columns whose values should be converted to lower case.
cols_to_mod <- c("smoking", "asthma", "skin_cancer")

# all_of() selects the columns named in the character vector; the result is
# only printed here, not assigned back to df.
df |> mutate(across(all_of(cols_to_mod), stringr::str_to_lower))
# A tibble: 319,795 × 18
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 yes No No 3
2 No 20.3 no No Yes 0
3 No 26.6 yes No No 20
4 No 24.2 no No No 0
5 No 23.7 no No No 28
6 Yes 28.9 yes No No 6
7 No 21.6 no No No 15
8 No 31.6 yes No No 5
9 No 26.4 no No No 0
10 No 40.7 no No No 0
# … with 319,785 more rows, and 12 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>
# Add a bmi_class column. case_when() uses the first matching condition per
# row; the final TRUE branch catches anything unmatched (e.g. missing bmi).
df <- df |> mutate(bmi_class = case_when(
  bmi < 18.5 ~ "underweight",
  bmi >= 18.5 & bmi < 25 ~ "normal weight",
  bmi >= 25 & bmi < 30 ~ "overweight",
  bmi >= 30 ~ "obese",
  TRUE ~ NA_character_
))
# Columns whose values should be converted to lower case.
cols_to_mod = ["smoking", "asthma", "skin_cancer"]

# applymap applies str.lower to every cell of the selected columns and returns
# a new frame; df itself is left unchanged.
# NOTE: pandas >= 2.1 deprecates applymap in favour of DataFrame.map.
df.loc[:, cols_to_mod].applymap(str.lower)
smoking asthma skin_cancer
0 yes yes yes
1 no no no
2 yes yes no
3 no no yes
4 no no no
... ... ... ...
319790 yes yes no
319791 yes yes no
319792 no no no
319793 no no no
319794 no no no
[319795 rows x 3 columns]
# Bin BMI into ordered weight classes. right=False makes every bin closed on
# the left ([0, 18.5), [18.5, 25), [25, 30), [30, 1000)), matching the rules
# bmi < 18.5, 18.5 <= bmi < 25, 25 <= bmi < 30 and bmi >= 30. With the default
# right-closed bins a BMI of exactly 18.5, 25 or 30 would fall one class lower.
df = df.assign(bmi_class=pd.cut(df["bmi"],
                                bins=[0, 18.5, 25, 30, 1000],
                                labels=["underweight", "normal weight", "overweight", "obese"],
                                right=False,
                                ordered=True))

# Equivalent direct column assignment:
# df["bmi_class"] = pd.cut(df["bmi"],
#                          bins=[0, 18.5, 25, 30, 1000],
#                          labels=["underweight", "normal weight", "overweight", "obese"],
#                          right=False,
#                          ordered=True)
6.2 Count values
# Number of rows for each combination of smoking and asthma.
df |> count(smoking, asthma)
# A tibble: 4 × 3
smoking asthma n
<chr> <chr> <int>
1 No No 163994
2 No Yes 23893
3 Yes No 112929
4 Yes Yes 18979
# Number of rows for each combination of smoking and asthma,
# sorted by descending count.
df.value_counts(["smoking", "asthma"])
smoking asthma
No No 163994
Yes No 112929
No Yes 23893
Yes Yes 18979
dtype: int64
6.3 Cross-tabulation
# Contingency table: smoking in rows, asthma in columns.
df |> select(smoking, asthma) |> table()
asthma
smoking No Yes
No 163994 23893
Yes 112929 18979
# Contingency table: smoking in rows, asthma in columns.
pd.crosstab(df["smoking"], df["asthma"])
asthma No Yes
smoking
No 163994 23893
Yes 112929 18979
6.4 Grouping
# Mark the rows as grouped by heart_disease; the data itself is unchanged and
# only later verbs (e.g. summarize) act per group.
df |> group_by(heart_disease)
# A tibble: 319,795 × 19
# Groups: heart_disease [2]
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 Yes No No 3
2 No 20.3 No No Yes 0
3 No 26.6 Yes No No 20
4 No 24.2 No No No 0
5 No 23.7 No No No 28
6 Yes 28.9 Yes No No 6
7 No 21.6 No No No 15
8 No 31.6 Yes No No 5
9 No 26.4 No No No 0
10 No 40.7 No No No 0
# … with 319,785 more rows, and 13 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>,
# bmi_class <chr>
# Returns a lazy DataFrameGroupBy object; nothing is computed until an
# aggregation is applied to it.
df.groupby("heart_disease")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5935a4bfd0>
6.5 Summarize data
# Minimum, mean and maximum BMI within each heart_disease group.
df |>
  group_by(heart_disease) |>
  summarize(min = min(bmi), mean = mean(bmi), max = max(bmi))
# A tibble: 2 × 4
heart_disease min mean max
<chr> <dbl> <dbl> <dbl>
1 No 12.0 28.2 94.8
2 Yes 12.2 29.4 83.3
# Minimum, mean and maximum BMI within each heart_disease group.
(
    df
    .groupby("heart_disease")
    .agg({'bmi': ['min', 'mean', 'max']})
)
bmi
min mean max
heart_disease
No 12.02 28.224658 94.85
Yes 12.21 29.401592 83.33