# web-scraper.py
#
# Identify the foods served during the day on the HUDS menu and
# export them to a CSV file for further use.
import time
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup
from cs50 import SQL
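

# Note: the refresh loop at the bottom runs forever, so a single network
# hiccup inside requests.get would crash the scraper. The helper below is a
# minimal, optional sketch of a retrying GET (the name safe_get and its
# defaults are placeholders, not part of the original script); the
# requests.get calls further down could be routed through it.
def safe_get(url, retries=3, delay=5):
    # Try the request a few times, pausing between attempts.
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=30)
        except requests.RequestException:
            # Re-raise on the final attempt so failures stay visible.
            if attempt == retries - 1:
                raise
            time.sleep(delay)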


def pull_menu():
    # Locate the URL of the HUDS website.
    URL = "http://www.foodpro.huds.harvard.edu/foodpro/menu_items.asp?type=30&meal=1"
    # Retrieve the page using GET.
    page = requests.get(URL)
    # Testing Purposes: Print out the page's text.
    # print(page.text)
    # Set up a new HTML parser on the page's content.
    soup = BeautifulSoup(page.content, "html.parser")
    # Find the element whose ID is "content".
    results = soup.find(id="content")
    # Testing Purposes: Print out the content that matches the find above.
    # print(results.prettify())
    # Find all the links inside #content.
    food_items = results.find_all("a")
    # With each link in #content, decide whether it's a valid food or not:
    # only links between "Cancel" and "Create Nutrition Report" are foods.
    valid_food = False
    menu = []
    ingredients = []
    allergens = []
    # Iterate through each <a> element from the results.
    for food_item in food_items:
        # Testing Purposes
        # print(food_item, end="\n"*2)
        # All elements past "Create Nutrition Report" are not food items.
        if food_item.text.strip() == "Create Nutrition Report":
            valid_food = False
        # If the element is a valid food item, record its text.
        if valid_food:
            # Testing Purposes
            # print(food_item.text.strip())
            # Add each element to the menu.
            menu.append(food_item.text.strip())
            # Get the URL for each food item; the href may be relative,
            # so resolve it against the menu page's URL.
            food_url = urljoin(URL, food_item["href"])
            # Retrieve the page using GET.
            food_page = requests.get(food_url)
            # Set up a new HTML parser on the page's content.
            food_soup = BeautifulSoup(food_page.content, "html.parser")
            # Find the element whose ID is "content".
            food_results = food_soup.find(id="content")
            # Find all the paragraphs inside #content.
            food_ps = food_results.find_all("p")
            # Iterate for ingredients.
            ing_found = False
            for food_p in food_ps:
                # Search for the ingredients within the text.
                if "Ingredients" in food_p.text.strip() and "Consumer Responsibility" not in food_p.text.strip():
                    # Add the ingredients to the list.
                    ingredients.append(food_p.text.strip())
                    # Testing Purposes:
                    # print(food_p.text.strip())
                    ing_found = True
                    break
            if not ing_found:
                ingredients.append("Ingredients: N/A")
            # Iterate for allergens.
            all_found = False
            for food_p in food_ps:
                # Search for the allergens within the text.
                if "Allergens" in food_p.text.strip():
                    # Add the allergens to the list.
                    allergens.append(food_p.text.strip())
                    # Testing Purposes:
                    # print(food_p.text.strip())
                    all_found = True
                    break
            if not all_found:
                allergens.append("Allergens: N/A")
        # Elements past "Cancel" are valid food items.
        if food_item.text.strip() == "Cancel":
            valid_food = True
    # Testing Purposes
    # print("Menu: " + str(len(menu)))
    # print("Ingredients: " + str(len(ingredients)))
    # print("Allergens: " + str(len(allergens)))
    # Collect the lists into a dictionary.
    menu_data = {"food": menu, "ingredients": ingredients, "allergens": allergens}
    # Send the dictionary to a pandas DataFrame.
    df = pd.DataFrame(menu_data)
    # Write the DataFrame to a CSV for processing into SQL.
    df.to_csv("menu_items.csv")


# Auto-refresh the CSV.
if __name__ == "__main__":
    while True:
        pull_menu()
        # Number of minutes to wait between refreshes.
        time_wait = 10
        # Sleep for that many minutes.
        time.sleep(time_wait * 60)