11 Splitting Data

library(tidymodels)

df <- readr::read_csv("data/heart_2020_cleaned.csv", show_col_types = FALSE) |>
  janitor::clean_names()
df <- df[1:1000, ]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from janitor import clean_names

df = pd.read_csv("data/heart_2020_cleaned.csv").clean_names(case_type='snake')
df = df.iloc[0:1000, ]
11.1 Train-test split
To prevent data leakage, one always has to train the model(s) on the training set (i.e. fit coefficients, find structure and associations) and test them on the testing set (i.e. evaluate performance on never-seen data, which measures generalization). Hence, the first step in the data modeling process is to split your data into two separate sets.
set.seed(42)

splits <- initial_split(df, prop = 0.8, strata = heart_disease)
train_data <- training(splits)
test_data <- testing(splits)
from sklearn.model_selection import train_test_split

train_data, test_data, train_target, test_target = train_test_split(
    df.drop(columns="heart_disease"), df["heart_disease"],
    train_size=0.8,
    random_state=42
)
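Note that the R call stratifies the split on heart_disease while the scikit-learn call above does not. A minimal sketch of how stratification could be added on the Python side, using train_test_split's stratify parameter (same variable names as above):

from sklearn.model_selection import train_test_split

train_data, test_data, train_target, test_target = train_test_split(
    df.drop(columns="heart_disease"), df["heart_disease"],
    train_size=0.8,
    random_state=42,
    stratify=df["heart_disease"]  # preserve the class proportions in both sets
)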
Probability is concerned with studying uncertainty and randomness. Computers are not able to produce truly random numbers, but they can simulate randomness with (pseudo-)random number generators. In the following chapters, we will set the seeds of these random number generators in order to make the results reproducible (although some machine learning procedures involve randomness, you will be able to obtain the same results as the ones printed in this book).

We will use the same number (42, or The Answer to the Ultimate Question of Life, The Universe, and Everything) in both the R and Python scripts, but remember that this choice does not lead to the same random process in the two languages (even if the underlying generation algorithm is the same).
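As a minimal illustration (a hypothetical snippet, not used elsewhere in this chapter), seeding NumPy's generator makes its draws reproducible:

import numpy as np

rng = np.random.default_rng(42)       # seeded generator
print(rng.integers(0, 100, size=3))   # the same three numbers on every run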
11.2 Cross-validation
11.2.1 K-fold
set.seed(42)

cv <- vfold_cv(train_data, v = 10)
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, shuffle=True, random_state=42)
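A scikit-learn cross-validator only produces index arrays once split() is called. A minimal sketch with a toy feature matrix X (an assumption for illustration, not part of the example above):

import numpy as np

X = np.arange(20).reshape(10, 2)  # toy data: 10 rows, 2 columns

for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
    print(f"fold {fold}: {len(train_idx)} train rows, {len(test_idx)} test rows")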
Stratified K-fold
set.seed(42)

cv <- vfold_cv(train_data, v = 10, strata = heart_disease)
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
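Stratification keeps the class proportions roughly constant across folds. A hedged sketch with synthetic features X and a synthetic binary target y (both assumptions for illustration):

import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))      # toy features
y = rng.integers(0, 2, size=100)   # toy binary target

for train_idx, test_idx in cv.split(X, y):  # StratifiedKFold needs the target
    print(f"positive rate in test fold: {y[test_idx].mean():.2f}")  # close to y.mean()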
Repeated Stratified K-fold
set.seed(42)

cv <- vfold_cv(train_data, v = 10, strata = heart_disease, repeats = 10)
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
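Each repetition reshuffles the data before folding, so the total number of resamples is n_splits × n_repeats; get_n_splits() confirms it:

print(cv.get_n_splits())  # 100 = 10 folds x 10 repeats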
Grouped K-fold
cv <- group_vfold_cv(train_data, group = , v = 10)  # supply your grouping column
from sklearn.model_selection import GroupKFold

cv = GroupKFold(n_splits=10)  # the groups are specified later, when calling split()
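In scikit-learn, the groups are passed at split time rather than at construction time. A minimal sketch, assuming a hypothetical patient_id array marking which rows belong together:

import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))               # toy features
patient_id = rng.integers(0, 20, size=100)  # hypothetical grouping variable

for train_idx, test_idx in cv.split(X, groups=patient_id):
    # no group ends up in both the train and test indices of the same fold
    assert set(patient_id[train_idx]).isdisjoint(patient_id[test_idx])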
11.2.2 Leave-one-out
cv <- loo_cv(train_data)
from sklearn.model_selection import LeaveOneOut

cv = LeaveOneOut()
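Leave-one-out is K-fold with K equal to the number of rows, so the number of model fits grows linearly with the data (the roughly 800 training rows above would already mean roughly 800 fits). A small sketch with a toy X (an assumption):

import numpy as np

X = np.arange(10).reshape(5, 2)  # toy data: 5 rows
print(cv.get_n_splits(X))        # 5: one split per row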
11.2.3 Monte Carlo
set.seed(42)

cv <- mc_cv(train_data, prop = 3/4, times = 10)
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, train_size=3/4, random_state=42)
Stratified Monte Carlo
set.seed(42)

cv <- mc_cv(train_data, prop = 3/4, times = 10, strata = heart_disease)
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=10, train_size=3/4, random_state=42)
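Unlike K-fold, Monte Carlo resampling draws each test set independently, so test sets may overlap from one resample to the next. A hedged sketch with toy data (X and y are assumptions):

import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))      # toy features
y = rng.integers(0, 2, size=100)   # toy binary target

test_sets = [set(test_idx) for _, test_idx in cv.split(X, y)]
print(len(test_sets[0] & test_sets[1]))  # typically > 0: the test sets overlap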
11.2.4 Time Series split
cv <- rolling_origin(train_data,
                     initial = (nrow(train_data) %% 11) + (nrow(train_data) %/% 11),
                     assess = nrow(train_data) %/% 11,
                     lag = 0)
from sklearn.model_selection import TimeSeriesSplit

cv = TimeSeriesSplit(n_splits=10, gap=0)
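Both resampling schemes respect the temporal order: every training window ends before its assessment window begins, and the training window grows as the origin rolls forward. A minimal sketch on a toy time-ordered series (an assumption for illustration):

import numpy as np

X = np.arange(22).reshape(11, 2)  # toy series: 11 time-ordered rows

for train_idx, test_idx in cv.split(X):
    print(train_idx.max(), "<", test_idx.min())  # training always precedes testing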