7 Pivoting and Joining
After importing the data as in ?sec-import_local, we must load the necessary packages:
7.1 Pivot
First, we select the first ten rows of our dataframe and the columns heart_disease
, stroke
, smoking
and bmi
.
<- df[1:10, ] |>
df3 select(heart_disease, stroke, smoking, bmi) |>
mutate(patient_id = stringr::str_c("P", 1:10), .before = 1)
df3
# A tibble: 10 × 5
patient_id heart_disease stroke smoking bmi
<chr> <chr> <chr> <chr> <dbl>
1 P1 No No Yes 16.6
2 P2 No Yes No 20.3
3 P3 No No Yes 26.6
4 P4 No No No 24.2
5 P5 No No No 23.7
6 P6 Yes No Yes 28.9
7 P7 No No No 21.6
8 P8 No No Yes 31.6
9 P9 No No No 26.4
10 P10 No No No 40.7
= df.loc[0:9, ["heart_disease", "stroke", "smoking", "bmi"]]
df3 0, "patient_id", ["P" + str(i) for i in range(1, 11)])
df3.insert( df3
patient_id heart_disease stroke smoking bmi
0 P1 No No Yes 16.60
1 P2 No Yes No 20.34
2 P3 No No Yes 26.58
3 P4 No No No 24.21
4 P5 No No No 23.71
5 P6 Yes No Yes 28.87
6 P7 No No No 21.63
7 P8 No No Yes 31.64
8 P9 No No No 26.45
9 P10 No No No 40.69
7.1.1 Long format
<- df3 |>
df4 pivot_longer(-patient_id, names_to = "variable", values_to = "value",
values_transform = as.character)
df4
# A tibble: 40 × 3
patient_id variable value
<chr> <chr> <chr>
1 P1 heart_disease No
2 P1 stroke No
3 P1 smoking Yes
4 P1 bmi 16.6
5 P2 heart_disease No
6 P2 stroke Yes
7 P2 smoking No
8 P2 bmi 20.34
9 P3 heart_disease No
10 P3 stroke No
# … with 30 more rows
= df3.melt(id_vars="patient_id", var_name="variable", value_name="value")
df4 df4
patient_id variable value
0 P1 heart_disease No
1 P2 heart_disease No
2 P3 heart_disease No
3 P4 heart_disease No
4 P5 heart_disease No
5 P6 heart_disease Yes
6 P7 heart_disease No
7 P8 heart_disease No
8 P9 heart_disease No
9 P10 heart_disease No
10 P1 stroke No
11 P2 stroke Yes
12 P3 stroke No
13 P4 stroke No
14 P5 stroke No
15 P6 stroke No
16 P7 stroke No
17 P8 stroke No
18 P9 stroke No
19 P10 stroke No
20 P1 smoking Yes
21 P2 smoking No
22 P3 smoking Yes
23 P4 smoking No
24 P5 smoking No
25 P6 smoking Yes
26 P7 smoking No
27 P8 smoking Yes
28 P9 smoking No
29 P10 smoking No
30 P1 bmi 16.6
31 P2 bmi 20.34
32 P3 bmi 26.58
33 P4 bmi 24.21
34 P5 bmi 23.71
35 P6 bmi 28.87
36 P7 bmi 21.63
37 P8 bmi 31.64
38 P9 bmi 26.45
39 P10 bmi 40.69
7.1.2 Wide format
|> pivot_wider(patient_id, names_from = variable, values_from = value) df4
# A tibble: 10 × 5
patient_id heart_disease stroke smoking bmi
<chr> <chr> <chr> <chr> <chr>
1 P1 No No Yes 16.6
2 P2 No Yes No 20.34
3 P3 No No Yes 26.58
4 P4 No No No 24.21
5 P5 No No No 23.71
6 P6 Yes No Yes 28.87
7 P7 No No No 21.63
8 P8 No No Yes 31.64
9 P9 No No No 26.45
10 P10 No No No 40.69
="patient_id", columns="variable", values="value") df4.pivot(index
variable bmi heart_disease smoking stroke
patient_id
P1 16.6 No Yes No
P10 40.69 No No No
P2 20.34 No No Yes
P3 26.58 No Yes No
P4 24.21 No No No
P5 23.71 No No No
P6 28.87 Yes Yes No
P7 21.63 No No No
P8 31.64 No Yes No
P9 26.45 No No No
7.2 Merge
set.seed(123)
<- tibble(
df2 gen_health = c("Poor", "Fair", "Good", "Very good", "Excellent"),
random_biomarker = rnorm(5)
) df2
# A tibble: 5 × 2
gen_health random_biomarker
<chr> <dbl>
1 Poor -0.560
2 Fair -0.230
3 Good 1.56
4 Very good 0.0705
5 Excellent 0.129
import numpy as np
= np.random.RandomState(123)
rng = pd.DataFrame({
df2 "gen_health": ["Poor", "Fair", "Good", "Very good", "Excellent"],
"random_biomarker": rng.randn(5)
}) df2
gen_health random_biomarker
0 Poor -1.085631
1 Fair 0.997345
2 Good 0.282978
3 Very good -1.506295
4 Excellent -0.578600
7.2.1 Left-join
|> left_join(df2, by = "gen_health") df
# A tibble: 319,795 × 19
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 Yes No No 3
2 No 20.3 No No Yes 0
3 No 26.6 Yes No No 20
4 No 24.2 No No No 0
5 No 23.7 No No No 28
6 Yes 28.9 Yes No No 6
7 No 21.6 No No No 15
8 No 31.6 Yes No No 5
9 No 26.4 No No No 0
10 No 40.7 No No No 0
# … with 319,785 more rows, and 13 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>,
# random_biomarker <dbl>
"gen_health"), on='gen_health', how="left") df.join(df2.set_index(
heart_disease bmi ... skin_cancer random_biomarker
0 No 16.60 ... Yes -1.506295
1 No 20.34 ... No -1.506295
2 No 26.58 ... No 0.997345
3 No 24.21 ... Yes 0.282978
4 No 23.71 ... No -1.506295
... ... ... ... ... ...
319790 Yes 27.41 ... No 0.997345
319791 No 29.84 ... No -1.506295
319792 No 24.24 ... No 0.282978
319793 No 32.81 ... No 0.282978
319794 No 46.56 ... No 0.282978
[319795 rows x 19 columns]
7.2.2 Right-join
|> right_join(df2, by = "gen_health") df
# A tibble: 319,795 × 19
heart_disease bmi smoking alcohol_drinking stroke physical_health
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 No 16.6 Yes No No 3
2 No 20.3 No No Yes 0
3 No 26.6 Yes No No 20
4 No 24.2 No No No 0
5 No 23.7 No No No 28
6 Yes 28.9 Yes No No 6
7 No 21.6 No No No 15
8 No 31.6 Yes No No 5
9 No 26.4 No No No 0
10 No 40.7 No No No 0
# … with 319,785 more rows, and 13 more variables: mental_health <dbl>,
# diff_walking <chr>, sex <chr>, age_category <chr>, race <chr>,
# diabetic <chr>, physical_activity <chr>, gen_health <chr>,
# sleep_time <dbl>, asthma <chr>, kidney_disease <chr>, skin_cancer <chr>,
# random_biomarker <dbl>
"gen_health"), on='gen_health', how="right") df.join(df2.set_index(
heart_disease bmi ... skin_cancer random_biomarker
10 Yes 34.30 ... No -1.085631
25 No 29.18 ... No -1.085631
35 Yes 32.98 ... Yes -1.085631
45 Yes 20.37 ... No -1.085631
49 No 27.46 ... No -1.085631
... ... ... ... ... ...
319751 No 34.96 ... No -0.578600
319772 No 30.90 ... No -0.578600
319783 No 33.28 ... No -0.578600
319788 No 23.38 ... No -0.578600
319789 No 22.22 ... No -0.578600
[319795 rows x 19 columns]