-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_data.R
85 lines (54 loc) · 2.03 KB
/
scrape_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Load packages ----
pacman::p_load(
rvest,
dplyr,
stringr,
purrr,
here,
magrittr
)
# Is it legal to scrape data from the website? ----
# robots.txt rules apply per host, so the check targets the same host the
# state URLs below are built from (gasprices.aaa.com, without "www.").
# The script stops here instead of silently continuing when scraping is not
# allowed.
if (!isTRUE(all(robotstxt::paths_allowed("https://gasprices.aaa.com/")))) {
  stop(
    "robots.txt does not allow scraping https://gasprices.aaa.com/",
    call. = FALSE
  )
}
# Build the master table: one row per state abbreviation (the 50 states in
# `state.abb` plus "DC"), sorted, together with the URL of that state's page
master <- tibble(state_abbs = sort(c(state.abb, "DC"))) %>%
  mutate(
    state_urls = paste0("https://gasprices.aaa.com/?state=", state_abbs)
  )
# Create HTML folder for current gas prices (current means today) (if needed)
# The date is read once so the folder, the downloads and the recorded paths
# all agree even if the script happens to run across midnight.
html_dir <- here("html", Sys.Date())
if (!dir.exists(html_dir)) {
  # recursive = TRUE also creates the parent "html" folder on a first run
  dir.create(path = html_dir, recursive = TRUE)
}
# Add file paths to the master dataset
# Paths are built from the state abbreviations rather than read back with
# list.files(), so each row is tied to its own file regardless of what else is
# in the folder or how the file system sorts names.
master %<>%
  mutate(state_paths = file.path(html_dir, paste0(state_abbs, ".html")))
# Download HTML files into today's folder
walk2(
  master$state_urls,
  master$state_paths,
  ~ download.file(url = .x, destfile = .y)
)
# Function for scraping city data
#
# Reads one state page and returns its city-level gas prices in long format.
#
# Args:
#   url: Path or URL of a state page; passed unchanged to read_html().
#
# Returns:
#   A data frame with the columns Date, City, Fuel_Type, Average and Price,
#   one row per city / row of the city's price table / fuel column.
#   Date is the day the function runs (Sys.Date()), not a value read from the
#   page. Price is numeric with the leading "$" removed.
#
# Errors when no city tables are found or when the number of <h3> headings
# does not match the number of city tables.
scrape_city_data <- function(url) {
  page_html <- read_html(url)

  # City names are taken from every <h3> heading on the page
  city <- page_html %>%
    html_nodes("h3") %>%
    html_text()

  # Every table with class "table-mob" except the first one
  # NOTE(review): the first table is presumably the state-wide summary;
  # confirm against the page layout
  all_tables <- page_html %>%
    html_nodes("[class='table-mob']") %>%
    html_table()
  city_tables <- all_tables[-1]

  if (length(city_tables) == 0) {
    stop("No city price tables found in ", url, call. = FALSE)
  }
  if (length(city) != length(city_tables)) {
    stop(
      "Found ", length(city), " city headings but ", length(city_tables),
      " city tables in ", url,
      call. = FALSE
    )
  }

  prices <- do.call(rbind, city_tables)
  colnames(prices)[1] <- "Average"

  # Repeat each city name once per row of its own table instead of assuming
  # a fixed number of rows per table
  prices$City <- rep(city, times = vapply(city_tables, nrow, integer(1)))

  prices %>%
    mutate(
      across(
        Regular:Diesel,
        ~ as.numeric(
          stringr::str_replace(string = .x, pattern = "\\$", replacement = "")
        )
      )
    ) %>%
    mutate(Date = Sys.Date()) %>%
    tidyr::pivot_longer(
      cols = Regular:Diesel,
      names_to = "Fuel_Type",
      values_to = "Price"
    ) %>%
    select(Date, City, Fuel_Type, Average, Price)
}
# Wrap the scraper so that an error on one state page returns NA instead of
# aborting the whole run
scrape_city_data_possibly <- possibly(scrape_city_data, otherwise = NA)
# Scrape every downloaded state page, expand the per-state results into rows,
# and keep a tidy set of lower-case columns
master <- master %>%
  mutate(data = lapply(state_paths, scrape_city_data_possibly)) %>%
  tidyr::unnest(data) %>%
  janitor::clean_names() %>%
  rename(state = state_abbs, url = state_urls) %>%
  select(date, city, state, url, fuel_type, average, price)
# Save data to disk
# Create data folder for current gas prices (current means today) (if needed)
# write_csv() does not create missing folders, so make sure "data" exists first
if (!dir.exists(here("data"))) {
  dir.create(path = here("data"), recursive = TRUE)
}
readr::write_csv(master, here("data", paste0(Sys.Date(), ".csv")))