6 Modify, Group, Summarize
6.1 Modify data
cols_to_mod <- c("smoking", "asthma", "skin_cancer")
df |> mutate(across(all_of(cols_to_mod), stringr::str_to_lower))# A tibble: 319,795 × 18
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 yes No No 3
2 No 20.3 no No Yes 0
3 No 26.6 yes No No 20
4 No 24.2 no No No 0
5 No 23.7 no No No 28
6 Yes 28.9 yes No No 6
7 No 21.6 no No No 15
8 No 31.6 yes No No 5
9 No 26.4 no No No 0
10 No 40.7 no No No 0
# … with 319,785 more rows, and 12 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>
df <- df |> mutate(bmi_class = case_when(
bmi < 18.5 ~ "underweight",
bmi >= 18.5 & bmi < 25 ~ "normal weight",
bmi >= 25 & bmi < 30 ~ "overweight",
bmi >= 30 ~ "obese",
TRUE ~ NA_character_
))cols_to_mod = ["smoking", "asthma", "skin_cancer"]
df.loc[:,cols_to_mod].applymap(str.lower) smoking asthma skin_cancer
0 yes yes yes
1 no no no
2 yes yes no
3 no no yes
4 no no no
... ... ... ...
319790 yes yes no
319791 yes yes no
319792 no no no
319793 no no no
319794 no no no
[319795 rows x 3 columns]
df = df.assign(bmi_class=pd.cut(df["bmi"],
bins=[0, 18.5, 25, 30, 1000],
labels=["underweight", "normal weight", "overweight", "obese"],
ordered=True))
# df["bmi_class"] = pd.cut(df["bmi"],
# bins=[0, 18.5, 25, 30, 1000],
# labels=["underweight", "normal weight", "overweight", "obese"],
# ordered=True)6.2 Count values
df |> count(smoking, asthma)# A tibble: 4 × 3
smoking asthma n
<chr> <chr> <int>
1 No No 163994
2 No Yes 23893
3 Yes No 112929
4 Yes Yes 18979
df.value_counts(["smoking", "asthma"])smoking asthma
No No 163994
Yes No 112929
No Yes 23893
Yes Yes 18979
dtype: int64
6.3 Cross-tabulation
df |> select(smoking, asthma) |> table() asthma
smoking No Yes
No 163994 23893
Yes 112929 18979
pd.crosstab(df.smoking, df.asthma)asthma No Yes
smoking
No 163994 23893
Yes 112929 18979
6.4 Grouping
df |> group_by(heart_disease)# A tibble: 319,795 × 19
# Groups: heart_disease [2]
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 Yes No No 3
2 No 20.3 No No Yes 0
3 No 26.6 Yes No No 20
4 No 24.2 No No No 0
5 No 23.7 No No No 28
6 Yes 28.9 Yes No No 6
7 No 21.6 No No No 15
8 No 31.6 Yes No No 5
9 No 26.4 No No No 0
10 No 40.7 No No No 0
# … with 319,785 more rows, and 13 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>,
# bmi_class <chr>
df.groupby("heart_disease")<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5935a4bfd0>
6.5 Summarize data
df |>
group_by(heart_disease) |>
summarize(min = min(bmi), mean = mean(bmi), max = max(bmi))# A tibble: 2 × 4
heart_disease min mean max
<chr> <dbl> <dbl> <dbl>
1 No 12.0 28.2 94.8
2 Yes 12.2 29.4 83.3
(
df
.groupby("heart_disease")
.agg({'bmi': ['min', 'mean', 'max']})
) bmi
min mean max
heart_disease
No 12.02 28.224658 94.85
Yes 12.21 29.401592 83.33