class: center, middle, inverse, title-slide .title[ # Scraping top 250 movies on IMDB ] .subtitle[ ##
Data Science in a Box ] .author[ ###
datasciencebox.org
] --- layout: true <div class="my-footer"> <span> <a href="https://datasciencebox.org" target="_blank">datasciencebox.org</a> </span> </div> --- class: middle # Top 250 movies on IMDB --- ## Top 250 movies on IMDB Take a look at the source code and look for the `table` tag: <br> http://www.imdb.com/chart/top .pull-left[ <img src="img/imdb-top-250.png" width="100%" style="display: block; margin: auto;" /> ] .pull-right[ <img src="img/imdb-top-250-source.png" width="94%" style="display: block; margin: auto;" /> ] --- ## First check if you're allowed! ```r library(robotstxt) paths_allowed("http://www.imdb.com") ``` ``` ## [1] TRUE ``` vs. e.g. ```r paths_allowed("http://www.facebook.com") ``` ``` ## [1] FALSE ``` --- ## Plan <img src="img/plan.png" width="90%" style="display: block; margin: auto;" /> --- ## Plan 1. Read the whole page 2. Scrape movie titles and save as `titles` 3. Scrape years movies were made in and save as `years` 4. Scrape IMDB ratings and save as `ratings` 5. Create a data frame called `imdb_top_250` with variables `title`, `year`, and `rating` --- class: middle # Step 1. Read the whole page --- ## Read the whole page ```r page <- read_html("https://www.imdb.com/chart/top/") page ``` ``` ## {html_document} ## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml"> ## [1] <head>\n<meta http-equiv="Content-Type" content="text/html ... ## [2] <body id="styleguide-v2" class="fixed">\n <img ... ``` --- ## A webpage in R - Result is a list with 2 elements ```r typeof(page) ``` ``` ## [1] "list" ``` -- - that we need to convert to something more familiar, like a data frame.... ```r class(page) ``` ``` ## [1] "xml_document" "xml_node" ``` --- class: middle # Step 2. 
Scrape movie titles and save as `titles` --- ## Scrape movie titles <img src="img/titles.png" width="70%" style="display: block; margin: auto;" /> --- ## Scrape the nodes .pull-left[ ```r page %>% html_nodes(".titleColumn a") ``` ``` ## {xml_nodeset (250)} ## [1] <a href="/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [2] <a href="/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [3] <a href="/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [4] <a href="/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [5] <a href="/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [6] <a href="/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [7] <a href="/title/tt0167260/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [8] <a href="/title/tt0110912/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [9] <a href="/title/tt0120737/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [10] <a href="/title/tt0060196/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [11] <a href="/title/tt0109830/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [12] <a href="/title/tt0137523/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [13] <a href="/title/tt1375666/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [14] <a href="/title/tt0167261/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [15] <a href="/title/tt0080684/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ## [16] <a href="/title/tt0133093/?pf_rd_m=A2FGELUUNOQJNL&pf_ ... ... 
``` ] .pull-right[ <img src="img/titles.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Extract the text from the nodes .pull-left[ ```r page %>% html_nodes(".titleColumn a") %>% html_text() ``` ``` ## [1] "The Shawshank Redemption" ## [2] "The Godfather" ## [3] "The Dark Knight" ## [4] "The Godfather Part II" ## [5] "12 Angry Men" ## [6] "Schindler's List" ## [7] "The Lord of the Rings: The Return of the King" ## [8] "Pulp Fiction" ## [9] "The Lord of the Rings: The Fellowship of the Ring" ## [10] "The Good, the Bad and the Ugly" ## [11] "Forrest Gump" ## [12] "Fight Club" ## [13] "Inception" ## [14] "The Lord of the Rings: The Two Towers" ## [15] "Star Wars: Episode V - The Empire Strikes Back" ## [16] "The Matrix" ... ``` ] .pull-right[ <img src="img/titles.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Save as `titles` .pull-left[ ```r titles <- page %>% html_nodes(".titleColumn a") %>% html_text() titles ``` ``` ## [1] "The Shawshank Redemption" ## [2] "The Godfather" ## [3] "The Dark Knight" ## [4] "The Godfather Part II" ## [5] "12 Angry Men" ## [6] "Schindler's List" ## [7] "The Lord of the Rings: The Return of the King" ## [8] "Pulp Fiction" ## [9] "The Lord of the Rings: The Fellowship of the Ring" ## [10] "The Good, the Bad and the Ugly" ## [11] "Forrest Gump" ## [12] "Fight Club" ## [13] "Inception" ## [14] "The Lord of the Rings: The Two Towers" ... ``` ] .pull-right[ <img src="img/titles.png" width="100%" style="display: block; margin: auto;" /> ] --- class: middle # Step 3. 
Scrape years movies were made in and save as `years` --- ## Scrape years movies were made in <img src="img/years.png" width="70%" style="display: block; margin: auto;" /> --- ## Scrape the nodes .pull-left[ ```r page %>% html_nodes(".secondaryInfo") ``` ``` ## {xml_nodeset (250)} ## [1] <span class="secondaryInfo">(1994)</span> ## [2] <span class="secondaryInfo">(1972)</span> ## [3] <span class="secondaryInfo">(2008)</span> ## [4] <span class="secondaryInfo">(1974)</span> ## [5] <span class="secondaryInfo">(1957)</span> ## [6] <span class="secondaryInfo">(1993)</span> ## [7] <span class="secondaryInfo">(2003)</span> ## [8] <span class="secondaryInfo">(1994)</span> ## [9] <span class="secondaryInfo">(2001)</span> ## [10] <span class="secondaryInfo">(1966)</span> ## [11] <span class="secondaryInfo">(1994)</span> ## [12] <span class="secondaryInfo">(1999)</span> ## [13] <span class="secondaryInfo">(2010)</span> ## [14] <span class="secondaryInfo">(2002)</span> ## [15] <span class="secondaryInfo">(1980)</span> ## [16] <span class="secondaryInfo">(1999)</span> ... 
``` ] .pull-right[ <img src="img/years.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Extract the text from the nodes .pull-left[ ```r page %>% html_nodes(".secondaryInfo") %>% html_text() ``` ``` ## [1] "(1994)" "(1972)" "(2008)" "(1974)" "(1957)" "(1993)" ## [7] "(2003)" "(1994)" "(2001)" "(1966)" "(1994)" "(1999)" ## [13] "(2010)" "(2002)" "(1980)" "(1999)" "(1990)" "(1975)" ## [19] "(1995)" "(1954)" "(1946)" "(1991)" "(2002)" "(1998)" ## [25] "(1997)" "(1999)" "(2014)" "(1977)" "(1991)" "(1985)" ## [31] "(2001)" "(1960)" "(2002)" "(1994)" "(2019)" "(1994)" ## [37] "(2000)" "(1998)" "(1995)" "(2006)" "(2006)" "(1942)" ## [43] "(2022)" "(2014)" "(2011)" "(1936)" "(1962)" "(1968)" ## [49] "(1988)" "(1954)" "(1979)" "(1931)" "(1988)" "(2000)" ## [55] "(1979)" "(1981)" "(2012)" "(2008)" "(2006)" "(1950)" ## [61] "(1957)" "(1980)" "(1940)" "(1957)" "(2018)" "(1986)" ## [67] "(1999)" "(1964)" "(2012)" "(2018)" "(2019)" "(2003)" ## [73] "(1995)" "(1984)" "(1995)" "(2017)" "(1981)" "(2009)" ## [79] "(1997)" "(2019)" "(1984)" "(1997)" "(2000)" "(2010)" ## [85] "(2016)" "(1952)" "(2009)" "(1983)" "(1968)" "(2004)" ## [91] "(1992)" "(1963)" "(2018)" "(1941)" "(1962)" "(2012)" ... ``` ] .pull-right[ <img src="img/years.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Clean up the text We need to go from `"(1994)"` to `1994`: - Remove `(` and `)`: string manipulation - Convert to numeric: `as.numeric()` --- ## stringr .pull-left-wide[ - **stringr** provides a cohesive set of functions designed to make working with strings as easy as possible - Functions in stringr start with `str_*()`, e.g. 
- `str_remove()` to remove a pattern from a string ```r str_remove(string = "jello", pattern = "el") ``` ``` ## [1] "jlo" ``` - `str_replace()` to replace a pattern with another ```r str_replace(string = "jello", pattern = "j", replacement = "h") ``` ``` ## [1] "hello" ``` ] .pull-right-narrow[ <img src="img/stringr.png" width="100%" style="display: block; margin: auto auto auto 0;" /> ] --- ## Clean up the text ```r page %>% html_nodes(".secondaryInfo") %>% html_text() %>% str_remove("\\(") # remove ( ``` ``` ## [1] "1994)" "1972)" "2008)" "1974)" "1957)" "1993)" "2003)" ## [8] "1994)" "2001)" "1966)" "1994)" "1999)" "2010)" "2002)" ## [15] "1980)" "1999)" "1990)" "1975)" "1995)" "1954)" "1946)" ## [22] "1991)" "2002)" "1998)" "1997)" "1999)" "2014)" "1977)" ## [29] "1991)" "1985)" "2001)" "1960)" "2002)" "1994)" "2019)" ## [36] "1994)" "2000)" "1998)" "1995)" "2006)" "2006)" "1942)" ## [43] "2022)" "2014)" "2011)" "1936)" "1962)" "1968)" "1988)" ## [50] "1954)" "1979)" "1931)" "1988)" "2000)" "1979)" "1981)" ## [57] "2012)" "2008)" "2006)" "1950)" "1957)" "1980)" "1940)" ## [64] "1957)" "2018)" "1986)" "1999)" "1964)" "2012)" "2018)" ## [71] "2019)" "2003)" "1995)" "1984)" "1995)" "2017)" "1981)" ## [78] "2009)" "1997)" "2019)" "1984)" "1997)" "2000)" "2010)" ## [85] "2016)" "1952)" "2009)" "1983)" "1968)" "2004)" "1992)" ## [92] "1963)" "2018)" "1941)" "1962)" "2012)" "1959)" "1931)" ## [99] "1958)" "2001)" "1971)" "1985)" "1987)" "1944)" "1960)" ... 
``` --- ## Clean up the text ```r page %>% html_nodes(".secondaryInfo") %>% html_text() %>% str_remove("\\(") %>% # remove ( str_remove("\\)") # remove ) ``` ``` ## [1] "1994" "1972" "2008" "1974" "1957" "1993" "2003" "1994" ## [9] "2001" "1966" "1994" "1999" "2010" "2002" "1980" "1999" ## [17] "1990" "1975" "1995" "1954" "1946" "1991" "2002" "1998" ## [25] "1997" "1999" "2014" "1977" "1991" "1985" "2001" "1960" ## [33] "2002" "1994" "2019" "1994" "2000" "1998" "1995" "2006" ## [41] "2006" "1942" "2022" "2014" "2011" "1936" "1962" "1968" ## [49] "1988" "1954" "1979" "1931" "1988" "2000" "1979" "1981" ## [57] "2012" "2008" "2006" "1950" "1957" "1980" "1940" "1957" ## [65] "2018" "1986" "1999" "1964" "2012" "2018" "2019" "2003" ## [73] "1995" "1984" "1995" "2017" "1981" "2009" "1997" "2019" ## [81] "1984" "1997" "2000" "2010" "2016" "1952" "2009" "1983" ## [89] "1968" "2004" "1992" "1963" "2018" "1941" "1962" "2012" ## [97] "1959" "1931" "1958" "2001" "1971" "1985" "1987" "1944" ## [105] "1960" "1983" "1952" "1973" "1962" "1976" "1997" "2009" ... 
``` --- ## Convert to numeric ```r page %>% html_nodes(".secondaryInfo") %>% html_text() %>% str_remove("\\(") %>% # remove ( str_remove("\\)") %>% # remove ) as.numeric() ``` ``` ## [1] 1994 1972 2008 1974 1957 1993 2003 1994 2001 1966 1994 1999 ## [13] 2010 2002 1980 1999 1990 1975 1995 1954 1946 1991 2002 1998 ## [25] 1997 1999 2014 1977 1991 1985 2001 1960 2002 1994 2019 1994 ## [37] 2000 1998 1995 2006 2006 1942 2022 2014 2011 1936 1962 1968 ## [49] 1988 1954 1979 1931 1988 2000 1979 1981 2012 2008 2006 1950 ## [61] 1957 1980 1940 1957 2018 1986 1999 1964 2012 2018 2019 2003 ## [73] 1995 1984 1995 2017 1981 2009 1997 2019 1984 1997 2000 2010 ## [85] 2016 1952 2009 1983 1968 2004 1992 1963 2018 1941 1962 2012 ## [97] 1959 1931 1958 2001 1971 1985 1987 1944 1960 1983 1952 1973 ## [109] 1962 1976 1997 2009 1995 2020 1927 2011 2000 1988 2010 1989 ## [121] 1948 2021 2019 2007 2004 1965 2005 2016 1921 1959 2022 2020 ## [133] 1950 2018 2013 1961 1992 1995 1985 2006 2007 1999 2001 1975 ## [145] 1998 1961 1948 2010 1950 1963 1993 2003 2007 2003 1980 1980 ... 
``` --- ## Save as `years` .pull-left[ ```r years <- page %>% html_nodes(".secondaryInfo") %>% html_text() %>% str_remove("\\(") %>% # remove ( str_remove("\\)") %>% # remove ) as.numeric() years ``` ``` ## [1] 1994 1972 2008 1974 1957 1993 2003 1994 2001 1966 1994 1999 ## [13] 2010 2002 1980 1999 1990 1975 1995 1954 1946 1991 2002 1998 ## [25] 1997 1999 2014 1977 1991 1985 2001 1960 2002 1994 2019 1994 ## [37] 2000 1998 1995 2006 2006 1942 2022 2014 2011 1936 1962 1968 ## [49] 1988 1954 1979 1931 1988 2000 1979 1981 2012 2008 2006 1950 ## [61] 1957 1980 1940 1957 2018 1986 1999 1964 2012 2018 2019 2003 ## [73] 1995 1984 1995 2017 1981 2009 1997 2019 1984 1997 2000 2010 ## [85] 2016 1952 2009 1983 1968 2004 1992 1963 2018 1941 1962 2012 ## [97] 1959 1931 1958 2001 1971 1985 1987 1944 1960 1983 1952 1973 ## [109] 1962 1976 1997 2009 1995 2020 1927 2011 2000 1988 2010 1989 ## [121] 1948 2021 2019 2007 2004 1965 2005 2016 1921 1959 2022 2020 ... ``` ] .pull-right[ <img src="img/years.png" width="100%" style="display: block; margin: auto;" /> ] --- class: middle # Step 4. Scrape IMDB ratings and save as `ratings` --- ## Scrape IMDB ratings <img src="img/ratings.png" width="70%" style="display: block; margin: auto;" /> --- ## Scrape the nodes .pull-left[ ```r page %>% html_nodes("strong") ``` ``` ## {xml_nodeset (250)} ## [1] <strong title="9.2 based on 2,598,663 user ratings">9.2</ ... ## [2] <strong title="9.2 based on 1,794,051 user ratings">9.2</ ... ## [3] <strong title="9.0 based on 2,569,907 user ratings">9.0</ ... ## [4] <strong title="9.0 based on 1,236,627 user ratings">9.0</ ... ## [5] <strong title="8.9 based on 767,804 user ratings">8.9</st ... ## [6] <strong title="8.9 based on 1,321,819 user ratings">8.9</ ... ## [7] <strong title="8.9 based on 1,784,964 user ratings">8.9</ ... ## [8] <strong title="8.9 based on 1,992,218 user ratings">8.9</ ... ## [9] <strong title="8.8 based on 1,806,022 user ratings">8.8</ ... 
## [10] <strong title="8.8 based on 745,449 user ratings">8.8</st ... ## [11] <strong title="8.8 based on 2,007,365 user ratings">8.8</ ... ## [12] <strong title="8.8 based on 2,046,596 user ratings">8.8</ ... ## [13] <strong title="8.7 based on 2,280,290 user ratings">8.7</ ... ## [14] <strong title="8.7 based on 1,612,066 user ratings">8.7</ ... ## [15] <strong title="8.7 based on 1,257,435 user ratings">8.7</ ... ## [16] <strong title="8.7 based on 1,866,072 user ratings">8.7</ ... ... ``` ] .pull-right[ <img src="img/ratings.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Extract the text from the nodes .pull-left[ ```r page %>% html_nodes("strong") %>% html_text() ``` ``` ## [1] "9.2" "9.2" "9.0" "9.0" "8.9" "8.9" "8.9" "8.9" "8.8" "8.8" ## [11] "8.8" "8.8" "8.7" "8.7" "8.7" "8.7" "8.7" "8.6" "8.6" "8.6" ## [21] "8.6" "8.6" "8.6" "8.6" "8.6" "8.6" "8.6" "8.6" "8.5" "8.5" ## [31] "8.5" "8.5" "8.5" "8.5" "8.5" "8.5" "8.5" "8.5" "8.5" "8.5" ## [41] "8.5" "8.5" "8.5" "8.5" "8.5" "8.4" "8.4" "8.4" "8.4" "8.4" ## [51] "8.4" "8.4" "8.4" "8.4" "8.4" "8.4" "8.4" "8.4" "8.4" "8.4" ## [61] "8.4" "8.4" "8.4" "8.4" "8.4" "8.3" "8.3" "8.3" "8.3" "8.3" ## [71] "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" ## [81] "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" ## [91] "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.3" "8.2" "8.2" ## [101] "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" ## [111] "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" ## [121] "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" ## [131] "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" "8.2" ## [141] "8.2" "8.2" "8.2" "8.2" "8.2" "8.1" "8.1" "8.1" "8.1" "8.1" ## [151] "8.1" "8.1" "8.1" "8.1" "8.1" "8.1" "8.1" "8.1" "8.1" "8.1" ... 
``` ] .pull-right[ <img src="img/ratings.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Convert to numeric .pull-left[ ```r page %>% html_nodes("strong") %>% html_text() %>% as.numeric() ``` ``` ## [1] 9.2 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7 ## [16] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5 ## [31] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 ## [46] 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 ## [61] 8.4 8.4 8.4 8.4 8.4 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 ## [76] 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 ## [91] 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [106] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [121] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [136] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.1 8.1 8.1 8.1 8.1 ## [151] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ## [166] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ## [181] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ## [196] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ## [211] 8.1 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 8.0 ... 
``` ] .pull-right[ <img src="img/ratings.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Save as `ratings` .pull-left[ ```r ratings <- page %>% html_nodes("strong") %>% html_text() %>% as.numeric() ratings ``` ``` ## [1] 9.2 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7 ## [16] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5 ## [31] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 ## [46] 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 8.4 ## [61] 8.4 8.4 8.4 8.4 8.4 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 ## [76] 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 ## [91] 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.3 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [106] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [121] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 ## [136] 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.2 8.1 8.1 8.1 8.1 8.1 ## [151] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ## [166] 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ... ``` ] .pull-right[ <img src="img/ratings.png" width="100%" style="display: block; margin: auto;" /> ] --- class: middle # Step 5. Create a data frame called `imdb_top_250` --- ## Create a data frame: `imdb_top_250` ```r imdb_top_250 <- tibble( title = titles, year = years, rating = ratings ) imdb_top_250 ``` ``` ## # A tibble: 250 × 3 ## title year rating ## <chr> <dbl> <dbl> ## 1 The Shawshank Redemption 1994 9.2 ## 2 The Godfather 1972 9.2 ## 3 The Dark Knight 2008 9 ## 4 The Godfather Part II 1974 9 ## 5 12 Angry Men 1957 8.9 ## 6 Schindler's List 1993 8.9 ## # … with 244 more rows ``` ---
--- ## Clean up / enhance May or may not be a lot of work depending on how messy the data are - See if you like what you got: ```r glimpse(imdb_top_250) ``` ``` ## Rows: 250 ## Columns: 3 ## $ title <chr> "The Shawshank Redemption", "The Godfather", "Th… ## $ year <dbl> 1994, 1972, 2008, 1974, 1957, 1993, 2003, 1994, … ## $ rating <dbl> 9.2, 9.2, 9.0, 9.0, 8.9, 8.9, 8.9, 8.9, 8.8, 8.8… ``` - Add a variable for rank ```r imdb_top_250 <- imdb_top_250 %>% mutate(rank = 1:nrow(imdb_top_250)) %>% relocate(rank) ``` --- ``` ## # A tibble: 250 × 4 ## rank title year rating ## <int> <chr> <dbl> <dbl> ## 1 1 The Shawshank Redemption 1994 9.2 ## 2 2 The Godfather 1972 9.2 ## 3 3 The Dark Knight 2008 9 ## 4 4 The Godfather Part II 1974 9 ## 5 5 12 Angry Men 1957 8.9 ## 6 6 Schindler's List 1993 8.9 ## 7 7 The Lord of the Rings: The Return of the K… 2003 8.9 ## 8 8 Pulp Fiction 1994 8.9 ## 9 9 The Lord of the Rings: The Fellowship of t… 2001 8.8 ## 10 10 The Good, the Bad and the Ugly 1966 8.8 ## 11 11 Forrest Gump 1994 8.8 ## 12 12 Fight Club 1999 8.8 ## 13 13 Inception 2010 8.7 ## 14 14 The Lord of the Rings: The Two Towers 2002 8.7 ## 15 15 Star Wars: Episode V - The Empire Strikes … 1980 8.7 ## 16 16 The Matrix 1999 8.7 ## 17 17 Goodfellas 1990 8.7 ## 18 18 One Flew Over the Cuckoo's Nest 1975 8.6 ## 19 19 Se7en 1995 8.6 ## 20 20 Seven Samurai 1954 8.6 ## # … with 230 more rows ``` --- class: middle # What next? --- .question[ Which years have the most movies on the list? ] -- ```r imdb_top_250 %>% count(year, sort = TRUE) ``` ``` ## # A tibble: 86 × 2 ## year n ## <dbl> <int> ## 1 1995 8 ## 2 2004 7 ## 3 1957 6 ## 4 2003 6 ## 5 2009 6 ## 6 2019 6 ## # … with 80 more rows ``` --- .question[ Which 1995 movies made the list? 
] -- ```r imdb_top_250 %>% filter(year == 1995) %>% print(n = 8) ``` ``` ## # A tibble: 8 × 4 ## rank title year rating ## <int> <chr> <dbl> <dbl> ## 1 19 Se7en 1995 8.6 ## 2 39 The Usual Suspects 1995 8.5 ## 3 73 Braveheart 1995 8.3 ## 4 75 Toy Story 1995 8.3 ## 5 113 Heat 1995 8.2 ## 6 138 Casino 1995 8.2 ## 7 186 Before Sunrise 1995 8.1 ## 8 238 La Haine 1995 8 ``` --- .question[ Visualize the average yearly rating for movies that made it on the top 250 list over time. ] -- .panelset[ .panel[.panel-name[Plot] <img src="u2-d19-top-250-imdb_files/figure-html/unnamed-chunk-46-1.png" width="58%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r imdb_top_250 %>% group_by(year) %>% summarise(avg_score = mean(rating)) %>% ggplot(aes(y = avg_score, x = year)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(x = "Year", y = "Average score") ``` ] ]