library(tidymodels)
# Load the heart-disease data and normalise the column names.
df <- readr::read_csv("data/heart_2020_cleaned.csv", show_col_types = FALSE) |>
  janitor::clean_names()
# Keep only the first 1000 rows.
df <- df[1:1000, ]
11 Splitting Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from janitor import clean_names
# Load the heart-disease data and convert the column names to snake_case.
df = pd.read_csv("data/heart_2020_cleaned.csv").clean_names(case_type='snake')
# Keep only the first 1000 rows.
df = df.iloc[0:1000,]
11.1 Train-test split
To prevent data leakage, one always has to train the model(s) (i.e. fit coefficients, find structure and associations) on the training set and test them on the testing set (i.e. evaluate performance on never-seen data to assess generalization). Hence, the first step in the data modeling process is to split your data into two separate sets.
# Fix the RNG seed so the split is reproducible.
set.seed(42)
# 80/20 train-test split, stratified on the outcome column.
splits <- initial_split(df, prop = 0.8, strata = heart_disease)
train_data <- training(splits)
test_data <- testing(splits)
from sklearn.model_selection import train_test_split
# 80/20 train-test split; features and the heart_disease target are split together.
train_data, test_data, train_target, test_target = train_test_split(
    df.drop(columns="heart_disease"), df["heart_disease"],
    train_size=0.8,
    random_state=42
)
11.2 Cross-validation
11.2.1 K-fold
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(42)
# 10-fold cross-validation on the training set.
cv <- vfold_cv(train_data, v = 10)
from sklearn.model_selection import KFold
# 10-fold cross-validation with shuffling and a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
Stratified K-fold
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(42)
# 10-fold cross-validation, stratified on the outcome column.
cv <- vfold_cv(train_data, v = 10, strata = heart_disease)
from sklearn.model_selection import StratifiedKFold
# Stratified 10-fold cross-validation with shuffling and a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
Repeated Stratified K-fold
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(42)
# Stratified 10-fold cross-validation, repeated 10 times.
cv <- vfold_cv(train_data, v = 10, strata = heart_disease, repeats = 10)
from sklearn.model_selection import RepeatedStratifiedKFold
# Stratified 10-fold cross-validation, repeated 10 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
Grouped K-fold
# Grouped 10-fold cross-validation.
# NOTE(review): `group` is left blank as a placeholder in the original text;
# supply the name of the grouping column before running this call.
cv <- group_vfold_cv(train_data, group = , v = 10)
from sklearn.model_selection import GroupKFold
# Grouped 10-fold cross-validation.
cv = GroupKFold(n_splits=10)  # group is specified later
11.2.2 Leave-one-out
# Leave-one-out cross-validation on the training set.
cv <- loo_cv(train_data)
from sklearn.model_selection import LeaveOneOut
# Leave-one-out cross-validation.
cv = LeaveOneOut()
11.2.3 Monte Carlo
# Fix the RNG seed so the resamples are reproducible.
set.seed(42)
# Monte Carlo cross-validation: 10 random splits, 3/4 of the rows for analysis.
cv <- mc_cv(train_data, prop = 3/4, times = 10)
from sklearn.model_selection import ShuffleSplit
# Monte Carlo cross-validation: 10 random splits, 3/4 of the rows for training.
cv = ShuffleSplit(n_splits=10, train_size=3/4, random_state=42)
Stratified Monte Carlo
# Fix the RNG seed so the resamples are reproducible.
set.seed(42)
# Monte Carlo cross-validation (10 splits, 3/4 for analysis), stratified on
# the outcome column.
cv <- mc_cv(train_data, prop = 3/4, times = 10, strata = heart_disease)
from sklearn.model_selection import StratifiedShuffleSplit
# Stratified Monte Carlo cross-validation: 10 random splits, 3/4 for training.
cv = StratifiedShuffleSplit(n_splits=10, train_size=3/4, random_state=42)
11.2.4 Time Series split
# Time-series resampling. The assessment window is one eleventh of the rows
# (integer division); the initial analysis window is that size plus the
# remainder.
# NOTE(review): with rolling_origin()'s default `skip = 0` the origin advances
# one row at a time, so this yields many more than 10 resamples. Confirm
# whether `skip = (nrow(train_data) %/% 11) - 1` is wanted to mirror
# TimeSeriesSplit(n_splits=10).
cv <- rolling_origin(
  train_data,
  initial = (nrow(train_data) %% 11) + (nrow(train_data) %/% 11),
  assess = nrow(train_data) %/% 11,
  lag = 0
)
from sklearn.model_selection import TimeSeriesSplit
# Time-series cross-validation with 10 splits and no gap between train and test.
cv = TimeSeriesSplit(n_splits=10, gap=0)