## # A tibble: 218 × 6## title date location abstract text url ## <chr> <date> <chr> <chr> <chr> <chr>## 1 Coronavirus (COVID-1… 2021-04-20 St Andr… Stateme… "Goo… http…## 2 Coronavirus (COVID-1… 2021-04-13 St Andr… Stateme… "Tha… http…## 3 Coronavirus (COVID-1… 2021-04-06 St Andr… Stateme… "Goo… http…## 4 Coronavirus (COVID-1… 2021-03-30 St Andr… Stateme… "Tha… http…## 5 Coronavirus (COVID-1… 2021-03-24 Scottis… Stateme… "Tha… http…## 6 Coronavirus (Covid-1… 2021-03-23 The Sco… Stateme… "Pre… http…## 7 Coronavirus (COVID-1… 2021-03-18 Scottis… Stateme… "Tha… http…## 8 Coronavirus (COVID-1… 2021-03-17 St Andr… Stateme… "\nG… http…## 9 Coronavirus (COVID-1… 2021-03-16 Scottis… Stateme… "Pre… http…## 10 Coronavirus (COVID-1… 2021-03-15 St Andr… Stateme… "\nG… http…## 11 Coronavirus (COVID-1… 2021-03-11 Scottis… Stateme… "I c… http…## 12 Coronavirus (COVID-1… 2021-03-09 Scottis… Stateme… "Pre… http…## 13 Coronavirus (COVID-1… 2021-03-05 Scottis… Parliam… "Hel… http…## 14 Coronavirus (COVID-1… 2021-03-04 Scottis… Parliam… "I w… http…## 15 Coronavirus (COVID-1… 2021-03-02 Scottis… Stateme… "Pre… http…## # … with 203 more rows
Scrape title
, date
, location
, abstract
, and text
from a few COVID-19 speech pages to develop the code
Write a function that scrapes title
, date
, location
, abstract
, and text
from COVID-19 speech pages
Scrape the url
s of COVID-19 speeches from the main page
Use this function to scrape from each individual COVID-19 speech from these url
s and create a data frame with the columns title
, date
, location
, abstract
, text
, and url
# Read the HTML of the 26 October speech page and print the parsed document.
url <- "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-26-october/"
speech_page <- read_html(url)
speech_page
## {html_document}## <html dir="ltr" lang="en">## [1] <head>\n<meta http-equiv="Content-Type" content="text/html ...## [2] <body class="fontawesome site-header__container">\n\n\n\n\ ...
# Extract the speech title from the article header, then print it.
title <- speech_page %>%
  html_node(".article-header__title") %>%
  html_text()
title
## [1] "Coronavirus (COVID-19) update: First Minister's speech 26 October"
library(lubridate)

# Inspect the raw date string before parsing it.
speech_page %>%
  html_node(".content-data__list:nth-child(1) strong") %>%
  html_text()
## [1] "26 Oct 2020"
# Extract the date string and parse it as day-month-year with dmy().
date <- speech_page %>%
  html_node(".content-data__list:nth-child(1) strong") %>%
  html_text() %>%
  dmy()
date
## [1] "2020-10-26"
# Extract the location from the second content-data list entry.
location <- speech_page %>%
  html_node(".content-data__list+ .content-data__list strong") %>%
  html_text()
location
## [1] "St Andrew's House, Edinburgh"
# Extract the abstract from the leading paragraph.
abstract <- speech_page %>%
  html_node(".leader--first-para p") %>%
  html_text()
abstract
## [1] "Statement given by First Minister Nicola Sturgeon at a media briefing in St Andrew's House on Monday 26 October 2020."
# Extract every paragraph of the speech body. Wrapping the character vector
# in list() lets it be stored as a single list-column cell later on.
text <- speech_page %>%
  html_nodes("#preamble p") %>%
  html_text() %>%
  list()
text
## [[1]]## [1] "\nGood afternoon, and thanks for joining us. I want to start with the usual daily report on the COVID statistics." ## [2] "The total number of positive cases reported yesterday was 1,122." ## [3] "This represents 7.1% of the total number of tests carried out. 428 of the new cases were in Greater Glasgow and Clyde, 274 in Lanarkshire, 105 in Lothian and 97 in Ayrshire and Arran. " ## [4] "The remaining cases were spread across the mainland health board regions. " ## [5] "The total number of confirmed cases is now 57,874." ## [6] "I can also confirm that 1,152 people are in hospital – that is an increase of 36 from yesterday" ## [7] "90 people are in intensive care, which is four more than yesterday." ## [8] "And I regret to say that in the last 24 hours, one further death has been registered of a patient who first tested positive over the previous 28 days. It is important though to remember that registration offices tend not to be open as normal over the weekend so the Sunday and Monday figures are often lower." ## [9] "We also reported 11 deaths on Saturday, and one yesterday. So since the last briefing on Friday, 13 additional deaths have been registered. That takes the total number of deaths, under this measurement, to 2,701." ## [10] "That reminds us again of how dangerous this virus can be and I want to send my condolences to everyone who has lost someone." ...
# Combine the scraped pieces into a one-row tibble for the 26 October speech.
oct_26_speech <- tibble(
  title    = title,
  date     = date,
  location = location,
  abstract = abstract,
  text     = text,
  url      = url
)
oct_26_speech
## # A tibble: 1 × 6## title date location abstract text url ## <chr> <date> <chr> <chr> <lis> <chr>## 1 Coronavirus (COVID-19… 2020-10-26 St Andr… Stateme… <chr> http…
# Read the HTML of the 23 October speech page and print the parsed document.
url <- "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-23-october/"
speech_page <- read_html(url)
speech_page
## {html_document}## <html dir="ltr" lang="en">## [1] <head>\n<meta http-equiv="Content-Type" content="text/html ...## [2] <body class="fontawesome site-header__container">\n\n\n\n\ ...
# Same scraping steps as for the 26 October page, applied to the page
# currently held in `speech_page`.
title <- speech_page %>%
  html_node(".article-header__title") %>%
  html_text()

date <- speech_page %>%
  html_node(".content-data__list:nth-child(1) strong") %>%
  html_text() %>%
  dmy()

location <- speech_page %>%
  html_node(".content-data__list+ .content-data__list strong") %>%
  html_text()

abstract <- speech_page %>%
  html_node(".leader--first-para p") %>%
  html_text()

text <- speech_page %>%
  html_nodes("#preamble p") %>%
  html_text() %>%
  list()

# Combine the scraped pieces into a one-row tibble for the 23 October speech.
oct_23_speech <- tibble(
  title    = title,
  date     = date,
  location = location,
  abstract = abstract,
  text     = text,
  url      = url
)
oct_23_speech
## # A tibble: 1 × 6## title date location abstract text url ## <chr> <date> <chr> <chr> <lis> <chr>## 1 Coronavirus (COVID-19… 2020-10-23 St Andr… Stateme… <chr> http…
When you’ve copied and pasted a block of code more than twice.
How many times will we need to copy and paste the code we developed to scrape data on all of the First Minister's COVID-19 speeches?
Automate common tasks in a more powerful and general way than copy-and-pasting:
Down the line: Improve your reach as a data scientist by writing functions (and packages!) that others use
Assuming that the page structure is the same for each speech page, how many "things" do you need to know for each speech page to scrape the data we want from it?
# The URL is the only page-specific input; everything below is derived from it.
url_23_oct <- "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-23-october/"

speech_page <- read_html(url_23_oct)

title <- speech_page %>%
  html_node(".article-header__title") %>%
  html_text()

date <- speech_page %>%
  html_node(".content-data__list:nth-child(1) strong") %>%
  html_text() %>%
  dmy()

location <- speech_page %>%
  html_node(".content-data__list+ .content-data__list strong") %>%
  html_text()

abstract <- speech_page %>%
  html_node(".leader--first-para p") %>%
  html_text()

text <- speech_page %>%
  html_nodes("#preamble p") %>%
  html_text() %>%
  list()

tibble(
  title    = title,
  date     = date,
  location = location,
  abstract = abstract,
  text     = text,
  # Use the URL defined in this snippet rather than whatever global `url`
  # happens to hold from earlier code.
  url      = url_23_oct
)
scrape_speech <-
function
. If we had more the call would look like function(x, y, z)
.scrape_speech <- function(x){}
function
. If we had more the call would look like function(x, y, z)
.{
block that immediately follows function(...)
.
scrape_speech <- function(url){
  # code we developed earlier to scrape info
  # on a single speech page goes here
}
scrape_speech()
#' Scrape a single COVID-19 speech page
#'
#' Reads the page at `url` and extracts the title, date, location, abstract
#' and body text using the CSS selectors developed on the individual speech
#' pages.
#'
#' @param url URL of a speech page, passed to `read_html()`.
#' @return A one-row tibble with columns `title`, `date`, `location`,
#'   `abstract`, `text` (a list-column holding the character vector of
#'   paragraphs) and `url`.
scrape_speech <- function(url) {
  speech_page <- read_html(url)

  title <- speech_page %>%
    html_node(".article-header__title") %>%
    html_text()

  # The date is parsed as day-month-year by dmy().
  date <- speech_page %>%
    html_node(".content-data__list:nth-child(1) strong") %>%
    html_text() %>%
    dmy()

  location <- speech_page %>%
    html_node(".content-data__list+ .content-data__list strong") %>%
    html_text()

  abstract <- speech_page %>%
    html_node(".leader--first-para p") %>%
    html_text()

  # Wrap the paragraphs in list() so they occupy a single list-column cell.
  text <- speech_page %>%
    html_nodes("#preamble p") %>%
    html_text() %>%
    list()

  tibble(
    title    = title,
    date     = date,
    location = location,
    abstract = abstract,
    text     = text,
    url      = url
  )
}
# Scrape the 26 October speech page and glimpse the resulting tibble.
scrape_speech(url = "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-26-october/") %>% glimpse()
## Rows: 1## Columns: 6## $ title <chr> NA## $ date <date> NA## $ location <chr> NA## $ abstract <chr> NA## $ text <list> <"\nGood afternoon, and thanks for joining us.…## $ url <chr> "https://www.gov.scot/publications/coronaviru…
# Scrape the 23 October speech page and glimpse the resulting tibble.
scrape_speech(url = "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-23-october/") %>% glimpse()
## Rows: 1## Columns: 6## $ title <chr> NA## $ date <date> NA## $ location <chr> NA## $ abstract <chr> NA## $ text <list> <"\nGood afternoon everyone. Thank you very mu…## $ url <chr> "https://www.gov.scot/publications/coronaviru…
# Scrape the 22 October speech page and glimpse the resulting tibble.
scrape_speech(url = "https://www.gov.scot/publications/coronavirus-covid-19-update-first-ministers-speech-22-october/") %>% glimpse()
## Rows: 1## Columns: 6## $ title <chr> NA## $ date <date> NA## $ location <chr> NA## $ abstract <chr> NA## $ text <list> <"\nGood afternoon, let me start as usual with…## $ url <chr> "https://www.gov.scot/publications/coronaviru…
function([inputs separated by commas]){ # what to do with those inputs}
# Template showing the shape of a scraping function: take an input, work on
# it, and make the tibble the last expression so that it is the return value.
scrape_page <- function(x) {
  # do bunch of stuff with the input...

  # return a tibble
  # NOTE(review): `tibble(...)` looks like a placeholder for the real
  # columns; fill it in before running.
  tibble(...)
}
What is going on here?
# Slide example ("What is going on here?"): a function returns the value of
# its LAST expression. The sum below is evaluated but then discarded, so the
# function always returns 1000 regardless of `x`.
add_2 <- function(x) {
  x + 2
  1000
}
add_2(3)
## [1] 1000
add_2(10)
## [1] 1000
"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
snake_case
as opposed to camelCase
)snake_case
as opposed to camelCase
)scrape_page()
, scrape_speech()
OR str_remove()
, str_replace()
etc.)snake_case
as opposed to camelCase
)scrape_page()
, scrape_speech()
OR str_remove()
, str_replace()
etc.)
# JUST DON'T
mean <- function(x){ x * 3 }
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |