6  Modify, Group, Summarize

6.1 Modify data

cols_to_mod <- c("smoking", "asthma", "skin_cancer")
df |> mutate(across(all_of(cols_to_mod), stringr::str_to_lower))
# A tibble: 319,795 × 18
   heart_disease   bmi smoking alcohol_drinking stroke physical_health
   <chr>         <dbl> <chr>   <chr>            <chr>            <dbl>
 1 No             16.6 yes     No               No                   3
 2 No             20.3 no      No               Yes                  0
 3 No             26.6 yes     No               No                  20
 4 No             24.2 no      No               No                   0
 5 No             23.7 no      No               No                  28
 6 Yes            28.9 yes     No               No                   6
 7 No             21.6 no      No               No                  15
 8 No             31.6 yes     No               No                   5
 9 No             26.4 no      No               No                   0
10 No             40.7 no      No               No                   0
# … with 319,785 more rows, and 12 more variables: mental_health <dbl>,
#   diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
#   diabetic <chr>, physical_activity <chr>, gen_health <chr>,
#   sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>
df <- df |> mutate(bmi_class = case_when(
    bmi < 18.5 ~ "underweight",
    bmi >= 18.5 & bmi < 25 ~ "normal weight",
    bmi >= 25 & bmi < 30 ~ "overweight",
    bmi >= 30 ~ "obese",
    TRUE ~ NA_character_
))
cols_to_mod = ["smoking", "asthma", "skin_cancer"]
df.loc[:,cols_to_mod].applymap(str.lower)
       smoking asthma skin_cancer
0          yes    yes         yes
1           no     no          no
2          yes    yes          no
3           no     no         yes
4           no     no          no
...        ...    ...         ...
319790     yes    yes          no
319791     yes    yes          no
319792      no     no          no
319793      no     no          no
319794      no     no          no

[319795 rows x 3 columns]
df = df.assign(bmi_class=pd.cut(df["bmi"], 
    bins=[0, 18.5, 25, 30, 1000], 
    labels=["underweight", "normal weight", "overweight", "obese"],
    ordered=True))

# df["bmi_class"] = pd.cut(df["bmi"], 
#     bins=[0, 18.5, 25, 30, 1000], 
#     labels=["underweight", "normal weight", "overweight", "obese"],
#     ordered=True)

6.2 Count values

df |> count(smoking, asthma)
# A tibble: 4 × 3
  smoking asthma      n
  <chr>   <chr>   <int>
1 No      No     163994
2 No      Yes     23893
3 Yes     No     112929
4 Yes     Yes     18979
df.value_counts(["smoking", "asthma"])
smoking  asthma
No       No        163994
Yes      No        112929
No       Yes        23893
Yes      Yes        18979
dtype: int64

6.3 Cross-tabulation

df |> select(smoking, asthma) |> table()
       asthma
smoking     No    Yes
    No  163994  23893
    Yes 112929  18979
pd.crosstab(df.smoking, df.asthma)
asthma       No    Yes
smoking               
No       163994  23893
Yes      112929  18979

6.4 Grouping

df |> group_by(heart_disease)
# A tibble: 319,795 × 19
# Groups:   heart_disease [2]
   heart_disease   bmi smoking alcohol_drinking stroke physical_health
   <chr>         <dbl> <chr>   <chr>            <chr>            <dbl>
 1 No             16.6 Yes     No               No                   3
 2 No             20.3 No      No               Yes                  0
 3 No             26.6 Yes     No               No                  20
 4 No             24.2 No      No               No                   0
 5 No             23.7 No      No               No                  28
 6 Yes            28.9 Yes     No               No                   6
 7 No             21.6 No      No               No                  15
 8 No             31.6 Yes     No               No                   5
 9 No             26.4 No      No               No                   0
10 No             40.7 No      No               No                   0
# … with 319,785 more rows, and 13 more variables: mental_health <dbl>,
#   diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
#   diabetic <chr>, physical_activity <chr>, gen_health <chr>,
#   sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>,
#   bmi_class <chr>
df.groupby("heart_disease")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5935a4bfd0>

6.5 Summarize data

df |> 
    group_by(heart_disease) |> 
    summarize(min = min(bmi), mean = mean(bmi), max = max(bmi))
# A tibble: 2 × 4
  heart_disease   min  mean   max
  <chr>         <dbl> <dbl> <dbl>
1 No             12.0  28.2  94.8
2 Yes            12.2  29.4  83.3
(
    df
    .groupby("heart_disease")
    .agg({'bmi': ['min', 'mean', 'max']})
)
                 bmi                  
                 min       mean    max
heart_disease                         
No             12.02  28.224658  94.85
Yes            12.21  29.401592  83.33