pp <- read_csv( "data/paris-paintings.csv", na = c("n/a", "", "NA")) %>% mutate(log_price = log(price))
log_price
pp_fit <- linear_reg() %>% set_engine("lm") %>% fit(log_price ~ Width_in + Height_in, data = pp)tidy(pp_fit)
## # A tibble: 3 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.77 0.0579 82.4 0 ## 2 Width_in 0.0269 0.00373 7.22 6.58e-13## 3 Height_in -0.0133 0.00395 -3.36 7.93e- 4
## # A tibble: 3 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.77 0.0579 82.4 0 ## 2 Width_in 0.0269 0.00373 7.22 6.58e-13## 3 Height_in -0.0133 0.00395 -3.36 7.93e- 4
ˆlog_price=4.77+0.0269×width−0.0133×height
p <- plot_ly(pp, x = ~Width_in, y = ~Height_in, z = ~log_price, marker = list(size = 3, color = "lightgray", alpha = 0.5, line = list(color = "gray", width = 2))) %>% add_markers() %>% plotly::layout(scene = list( xaxis = list(title = "Width (in)"), yaxis = list(title = "Height (in)"), zaxis = list(title = "log_price") )) %>% config(displayModeBar = FALSE)frameWidget(p)
pp <- pp %>% mutate(artistliving = if_else(artistliving == 0, "Deceased", "Living"))pp %>% count(artistliving)
## # A tibble: 2 × 2## artistliving n## <chr> <int>## 1 Deceased 2937## 2 Living 456
Typical surface area appears to be less than 1000 square inches (~ 80cm x 80cm). There are very few paintings that have surface area above 5000 square inches.
ggplot(data = pp, aes(x = Surface, fill = artistliving)) + geom_histogram(binwidth = 500) + facet_grid(artistliving ~ .) + scale_fill_manual(values = c("#E48957", "#071381")) + guides(fill = "none") + labs(x = "Surface area", y = NULL) + geom_vline(xintercept = 1000) + geom_vline(xintercept = 5000, linetype = "dashed", color = "gray")
## Warning: Removed 176 rows containing non-finite values## (stat_bin).
For simplicity let's focus on the paintings with Surface < 5000
:
pp_Surf_lt_5000 <- pp %>% filter(Surface < 5000)ggplot(data = pp_Surf_lt_5000, aes(y = log_price, x = Surface, color = artistliving, shape = artistliving)) + geom_point(alpha = 0.5) + labs(color = "Artist", shape = "Artist") + scale_color_manual(values = c("#E48957", "#071381"))
ggplot(data = pp_Surf_lt_5000, aes(y = log_price, x = Surface, color = artistliving, shape = artistliving)) + geom_point(alpha = 0.5) + facet_wrap(~artistliving) + scale_color_manual(values = c("#E48957", "#071381")) + labs(color = "Artist", shape = "Artist")
log_price
Surface
area and artistliving
pp_main_fit <- linear_reg() %>% set_engine("lm") %>% fit(log_price ~ Surface + artistliving, data = pp_Surf_lt_5000)tidy(pp_main_fit)
## # A tibble: 3 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.88 0.0424 115. 0 ## 2 Surface 0.000265 0.0000415 6.39 1.85e-10## 3 artistlivingLiving 0.137 0.0970 1.41 1.57e- 1
log_price
Surface
area and artistliving
pp_main_fit <- linear_reg() %>% set_engine("lm") %>% fit(log_price ~ Surface + artistliving, data = pp_Surf_lt_5000)tidy(pp_main_fit)
## # A tibble: 3 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.88 0.0424 115. 0 ## 2 Surface 0.000265 0.0000415 6.39 1.85e-10## 3 artistlivingLiving 0.137 0.0970 1.41 1.57e- 1
ˆlog_price=4.88+0.000265×surface+0.137×artistliving
artistliving
ˆlog_price=4.88+0.000265×surface+0.137×0
=4.88+0.000265×surface
artistliving
ˆlog_price=4.88+0.000265×surface+0.137×0
=4.88+0.000265×surface
artistliving
ˆlog_price=4.88+0.000265×surface+0.137×1
=5.017+0.000265×surface
tidy(pp_main_fit) %>% mutate(exp_estimate = exp(estimate)) %>% select(term, estimate, exp_estimate)
## # A tibble: 3 × 3## term estimate exp_estimate## <chr> <dbl> <dbl>## 1 (Intercept) 4.88 132. ## 2 Surface 0.000265 1.00## 3 artistlivingLiving 0.137 1.15
artistliving
affect the intercept.What seems more appropriate in this case?
Surface
area, artistliving
, and their interactionpp_int_fit <- linear_reg() %>% set_engine("lm") %>% fit(log_price ~ Surface * artistliving, data = pp_Surf_lt_5000)tidy(pp_int_fit)
## # A tibble: 4 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.91e+0 0.0432 114. 0 ## 2 Surface 2.06e-4 0.0000442 4.65 3.37e-6## 3 artistlivingLiving -1.26e-1 0.119 -1.06 2.89e-1## 4 Surface:artistlivingLiving 4.79e-4 0.000126 3.81 1.39e-4
## # A tibble: 4 × 5## term estimate std.error statistic p.value## <chr> <dbl> <dbl> <dbl> <dbl>## 1 (Intercept) 4.91e+0 0.0432 114. 0 ## 2 Surface 2.06e-4 0.0000442 4.65 3.37e-6## 3 artistlivingLiving -1.26e-1 0.119 -1.06 2.89e-1## 4 Surface:artistlivingLiving 4.79e-4 0.000126 3.81 1.39e-4
ˆlog_price=4.91+0.00021×surface−0.126×artistliving + 0.00048×surface∗artistliving
It appears that adding the interaction actually increased adjusted R2, so we should indeed use the model with the interactions.
glance(pp_main_fit)$adj.r.squared
## [1] 0.01258977
glance(pp_int_fit)$adj.r.squared
## [1] 0.01676753
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |