# Fix random numbers by setting the seed # Enables analysis to be reproducible when random numbers are used set.seed(1116)# Put 80% of the data into the training set email_split <- initial_split(email, prop = 0.80)# Create data frames for the two sets:train_data <- training(email_split)test_data <- testing(email_split)
mutate()
train_data %>% mutate( date = lubridate::date(time), dow = wday(time), month = month(time) ) %>% select(time, date, dow, month) %>% sample_n(size = 5) # shuffle to show a variety
## # A tibble: 5 × 4## time date dow month## <dttm> <date> <dbl> <dbl>## 1 2012-03-15 14:51:35 2012-03-15 5 3## 2 2012-03-03 09:24:02 2012-03-03 7 3## 3 2012-01-18 11:55:23 2012-01-18 4 1## 4 2012-02-24 23:08:59 2012-02-24 6 2## 5 2012-01-11 08:18:51 2012-01-11 4 1
email_rec <- recipe( spam ~ ., # formula data = train_data # data to use for cataloguing names and types of variables )summary(email_rec)
## # A tibble: 21 × 4## variable type role source ## <chr> <chr> <chr> <chr> ## 1 to_multiple nominal predictor original## 2 from nominal predictor original## 3 cc numeric predictor original## 4 sent_email nominal predictor original## 5 time date predictor original## 6 image numeric predictor original## 7 attach numeric predictor original## 8 dollar numeric predictor original## 9 winner nominal predictor original## 10 inherit numeric predictor original## 11 viagra numeric predictor original## 12 password numeric predictor original## 13 num_char numeric predictor original## 14 line_breaks numeric predictor original## 15 format nominal predictor original## 16 re_subj nominal predictor original## 17 exclaim_subj numeric predictor original## 18 urgent_subj nominal predictor original## 19 exclaim_mess numeric predictor original## 20 number nominal predictor original## 21 spam nominal outcome original
email_rec <- email_rec %>% step_rm(from, sent_email)
## Recipe## ## Inputs:## ## role #variables## outcome 1## predictor 20## ## Operations:## ## Variables removed from, sent_email
email_rec <- email_rec %>% step_date(time, features = c("dow", "month")) %>% step_rm(time)
## Recipe## ## Inputs:## ## role #variables## outcome 1## predictor 20## ## Operations:## ## Variables removed from, sent_email## Date features from time## Variables removed time
email_rec <- email_rec %>% step_cut(cc, attach, dollar, breaks = c(0, 1)) %>% step_cut(inherit, password, breaks = c(0, 1, 5, 10, 20))
## Recipe## ## Inputs:## ## role #variables## outcome 1## predictor 20## ## Operations:## ## Variables removed from, sent_email## Date features from time## Variables removed time## Cut numeric for cc, attach, dollar## Cut numeric for inherit, password
email_rec <- email_rec %>% step_dummy(all_nominal(), -all_outcomes())
## Recipe## ## Inputs:## ## role #variables## outcome 1## predictor 20## ## Operations:## ## Variables removed from, sent_email## Date features from time## Variables removed time## Cut numeric for cc, attach, dollar## Cut numeric for inherit, password## Dummy variables from all_nominal(), -all_outcomes()
Variables that contain only a single value
email_rec <- email_rec %>% step_zv(all_predictors())
## Recipe## ## Inputs:## ## role #variables## outcome 1## predictor 20## ## Operations:## ## Variables removed from, sent_email## Date features from time## Variables removed time## Cut numeric for cc, attach, dollar## Cut numeric for inherit, password## Dummy variables from all_nominal(), -all_outcomes()## Zero variance filter on all_predictors()
email_rec <- recipe(spam ~ ., data = email) %>% step_rm(from, sent_email) %>% step_date(time, features = c("dow", "month")) %>% step_rm(time) %>% step_cut(cc, attach, dollar, breaks = c(0, 1)) %>% step_cut(inherit, password, breaks = c(0, 1, 5, 10, 20)) %>% step_dummy(all_nominal(), -all_outcomes()) %>% step_zv(all_predictors())
email_mod <- logistic_reg() %>% set_engine("glm")email_mod
## Logistic Regression Model Specification (classification)## ## Computational engine: glm
Workflows bring together models and recipes so that they can be easily applied to both the training and test data.
email_wflow <- workflow() %>% add_model(email_mod) %>% add_recipe(email_rec)
## ══ Workflow ════════════════════════════════════════════════════════════════════════════════════════## Preprocessor: Recipe## Model: logistic_reg()## ## ── Preprocessor ────────────────────────────────────────────────────────────────────────────────────## 7 Recipe Steps## ## • step_rm()## • step_date()## • step_rm()## • step_cut()## • step_cut()## • step_dummy()## • step_zv()## ## ── Model ───────────────────────────────────────────────────────────────────────────────────────────## Logistic Regression Model Specification (classification)## ## Computational engine: glm
email_fit <- email_wflow %>% fit(data = train_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
tidy(email_fit) %>% print(n = 31)
## # A tibble: 31 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) -0.892 0.249 -3.58 3.37e- 4## 2 image -1.65 0.934 -1.76 7.77e- 2## 3 viagra 2.28 182. 0.0125 9.90e- 1## 4 num_char 0.0470 0.0244 1.93 5.36e- 2## 5 line_breaks -0.00510 0.00138 -3.69 2.28e- 4## 6 exclaim_subj -0.204 0.277 -0.736 4.62e- 1## 7 exclaim_mess 0.00885 0.00186 4.75 1.99e- 6## 8 to_multiple_X1 -2.60 0.354 -7.35 2.06e-13## 9 cc_X.1.68. -0.312 0.490 -0.638 5.24e- 1## 10 attach_X.1.21. 2.05 0.368 5.58 2.45e- 8## 11 dollar_X.1.64. 0.214 0.217 0.988 3.23e- 1## 12 winner_yes 2.18 0.428 5.08 3.68e- 7## 13 inherit_X.1.5. -9.21 765. -0.0120 9.90e- 1## 14 inherit_X.5.10. 2.51 1.44 1.74 8.12e- 2## 15 password_X.1.5. -1.71 0.748 -2.28 2.24e- 2## 16 password_X.5.10. -12.5 475. -0.0263 9.79e- 1## 17 password_X.10.20. -13.7 814. -0.0168 9.87e- 1## 18 password_X.20.22. -13.9 1029. -0.0135 9.89e- 1## 19 format_X1 -0.916 0.159 -5.77 7.79e- 9## 20 re_subj_X1 -2.90 0.437 -6.65 2.95e-11## 21 urgent_subj_X1 3.52 1.08 3.25 1.15e- 3## 22 number_small -0.895 0.167 -5.35 8.75e- 8## 23 number_big -0.199 0.250 -0.797 4.25e- 1## 24 time_dow_Mon 0.0441 0.296 0.149 8.82e- 1## 25 time_dow_Tue 0.371 0.267 1.39 1.64e- 1## 26 time_dow_Wed -0.133 0.272 -0.488 6.26e- 1## 27 time_dow_Thu 0.0392 0.277 0.141 8.88e- 1## 28 time_dow_Fri 0.0488 0.280 0.174 8.62e- 1## 29 time_dow_Sat 0.253 0.298 0.849 3.96e- 1## 30 time_month_Feb 0.784 0.180 4.35 1.37e- 5## 31 time_month_Mar 0.541 0.181 2.99 2.79e- 3
email_pred <- predict(email_fit, test_data, type = "prob") %>% bind_cols(test_data)
## Warning: There are new levels in a factor: NA
email_pred
## # A tibble: 785 × 23## .pred_0 .pred_1 spam to_multiple from cc sent_email## <dbl> <dbl> <fct> <fct> <fct> <int> <fct> ## 1 0.995 0.00470 0 1 1 0 1 ## 2 0.999 0.00134 0 0 1 1 1 ## 3 0.967 0.0328 0 0 1 0 0 ## 4 0.999 0.000776 0 0 1 1 0 ## 5 0.994 0.00642 0 0 1 4 0 ## 6 0.860 0.140 0 0 1 0 0 ## # … with 779 more rows, and 16 more variables: time <dttm>,## # image <dbl>, attach <dbl>, dollar <dbl>, winner <fct>,## # inherit <dbl>, viagra <dbl>, password <dbl>, num_char <dbl>,## # line_breaks <int>, format <fct>, re_subj <fct>,## # exclaim_subj <dbl>, urgent_subj <fct>, exclaim_mess <dbl>,## # number <fct>
email_pred %>% roc_curve( truth = spam, .pred_1, event_level = "second" ) %>% autoplot()
email_pred %>% roc_auc( truth = spam, .pred_1, event_level = "second" )
## # A tibble: 1 × 3## .metric .estimator .estimate## <chr> <chr> <dbl>## 1 roc_auc binary 0.857
Suppose we decide to label an email as spam if the model predicts the probability of spam to be more than 0.5.
Email is not spam | Email is spam | |
---|---|---|
Email labelled not spam | 710 | 60 |
Email labelled spam | 6 | 8 |
NA | 1 | NA |
cutoff_prob <- 0.5email_pred %>% mutate( spam = if_else(spam == 1, "Email is spam", "Email is not spam"), spam_pred = if_else(.pred_1 > cutoff_prob, "Email labelled spam", "Email labelled not spam") ) %>% count(spam_pred, spam) %>% pivot_wider(names_from = spam, values_from = n) %>% kable(col.names = c("", "Email is not spam", "Email is spam"))
Suppose we decide to label an email as spam if the model predicts the probability of spam to be more than 0.25.
Email is not spam | Email is spam | |
---|---|---|
Email labelled not spam | 665 | 34 |
Email labelled spam | 51 | 34 |
NA | 1 | NA |
cutoff_prob <- 0.25email_pred %>% mutate( spam = if_else(spam == 1, "Email is spam", "Email is not spam"), spam_pred = if_else(.pred_1 > cutoff_prob, "Email labelled spam", "Email labelled not spam") ) %>% count(spam_pred, spam) %>% pivot_wider(names_from = spam, values_from = n) %>% kable(col.names = c("", "Email is not spam", "Email is spam"))
Suppose we decide to label an email as spam if the model predicts the probability of spam to be more than 0.75.
Email is not spam | Email is spam | |
---|---|---|
Email labelled not spam | 714 | 65 |
Email labelled spam | 2 | 3 |
NA | 1 | NA |
cutoff_prob <- 0.75email_pred %>% mutate( spam = if_else(spam == 1, "Email is spam", "Email is not spam"), spam_pred = if_else(.pred_1 > cutoff_prob, "Email labelled spam", "Email labelled not spam") ) %>% count(spam_pred, spam) %>% pivot_wider(names_from = spam, values_from = n) %>% kable(col.names = c("", "Email is not spam", "Email is spam"))
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |