scrape.py
#!/usr/bin/python3
"""
Scraping script to gather data from BoardGameGeek.com and the Wikipedia
list of tabletop role-playing games.
"""
import re

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv('../../.env')
class TTGSpider:
    urls = {
        "board": 'https://boardgamegeek.com/browse/boardgame/page/',
        "rpg": "https://en.wikipedia.org/wiki/List_of_tabletop_role-playing_games",
    }

    def parse_rpg(self):
        """Parse the Wikipedia RPG list and insert each game into the database through the API."""
        html = requests.get(self.urls["rpg"])
        soup = BeautifulSoup(html.content, 'html.parser')
        results = soup.find_all("div", class_="div-col")
        for result in results:
            game_year = result.find("span", class_="mw-headline").text
            games_list = result.find_all("li")  # for each game title within this year, get the other info
            for title in games_list:
                desc_html = title.find("a", href=True)  # link to the game's own page
                go_next = requests.get('https://en.wikipedia.org' + desc_html.get('href'))
                dsoup = BeautifulSoup(go_next.content, 'html.parser')  # follow the description rabbit, Alice
                wrapper = dsoup.find("div", class_="mw-parser-output")
                description = wrapper.find("p", class_=None).text  # first plain <p> inside div.mw-parser-output
                title = title.text.replace("'", "''")  # escape single quotes
                description = description.replace("'", "''").strip()  # escape single quotes
                description = re.sub(r"\[\d+\]", '', description)  # remove [n] reference markers
                data = {"name": title, "year": game_year, "desc": description, "type": 'RPG'}
                url = 'https://tabletopgames.herokuapp.com/v1/insert'
                response = requests.post(url=url, data=data)
                if response.status_code > 299:
                    print(f"Got {response.status_code} response from server on {title}")
"""
First attempt at parsing with BeautifulSoup. Generated SQL to insert manually with a client.
"""
def parse_board(self, i):
html = requests.get(self.urls["board"] + str(i))
soup = BeautifulSoup(html.content, 'html.parser')
results = soup.find_all(id="row_")
for result in results:
game_name = result.find("a", class_="primary")
game_desc = result.find("p", class_="smallefont")
game_year = result.find("span", class_="smallerfont")
if game_year:
game_yr = game_year.text.strip()
year = re.search(r'\d{1,4}', game_yr).group(0)
else:
year = ""
if game_desc:
desc = game_desc.text.replace("'", "''")
else:
desc = ""
name = game_name.text.replace("'", "''")
try:
with open('inserts.txt', 'a', encoding="utf-8") as f:
print(f"insert into games (name, year, description, type) values ('{name.strip()}', '{year}', '{desc.strip()}', 'board_game');", file=f)
except Exception as e:
print(f"Error inserting: {e}")


def main():
    # Set up the spider
    spider_go = TTGSpider()
    # Go, spider, go!
    for i in range(4, 100):
        print("Starting on page", i)
        spider_go.parse_board(i)  # parse the board game pages
    spider_go.parse_rpg()  # parse the RPGs once, after the board game pages


if __name__ == "__main__":
    main()
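
# A minimal way to exercise a single scrape without the full page loop.
# The page number below is an illustrative assumption, not part of the
# original run configuration:
#
#   spider = TTGSpider()
#   spider.parse_board(1)  # append SQL inserts for BGG browse page 1 to inserts.txt
#   spider.parse_rpg()     # POST each Wikipedia RPG entry to the insert API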