Add alpha 'parser' module (archive page) (#6)
* chore: 🙈 Return ignoring 'dist' folder after deleting old files
* chore: Add blank files for future code structure
* chore: ➕ Add requests, beautifulsoup4, lxml for parsing HTML (requests 2.26.0, beautifulsoup4 4.9.3, lxml 4.6.3)
* chore: ➕ Add 'requests-file' to dev deps for future unit tests
* chore: ➕ Add requests-mock 1.9.3 to dev dependencies
* feat(parser): ✨ Add function to parse archive page; decouple separate modules (config, parser, downloader) from lep module
* test(parser): ✅ Add test checking mocked response of archive page (sketched below); add test file with pre-saved archive HTML page
* chore: Add test HTML files for several episodes to mock them in tests
* refactor(parser): 🚧 Update two functions for parsing all links and getting link text by href
* chore: 🔧 Add mapping dict for 4 links and their text
* test: 🚧 Add PoC test for mocking several episode pages
* chore: 🔧 Update mypy settings in pyproject.toml: decrease 'fail_under' down to 85 (during active development stage); use new section syntax for ignoring imports for several packages
* chore: 🔧 Rename tuple with irrelevant links
* chore: ♻️ Improve typings and function names; add raised exceptions (try/except) to the function that gets HTML text
* test(parser): ✅ Add several general tests for parsing functions
* ci: 🔧 Add installation of 'requests_mock' into 'tests' Nox session
* ci: 🔧 Exclude HTML files from 'pre-commit' hooks
* style: 🎨 Commit changes made by 'pre-commit' hooks
* style: 🎨 Fix flake8 errors
* chore: ➕ Add flake8 plugins to dev deps: flake8-black (0.2.3), flake8-import-order (0.18.1)
* chore: 🔧 Change flake8 config: max-line-length = 120 and ignore long lines (B950)
* ci: 🔧 Add installation of 'requests_mock' into 'typeguard' Nox session
* chore: ➖ Remove unused 'requests-file' from dev deps
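The mocked-archive tests mentioned above follow the standard requests-mock pattern: the Mocker intercepts HTTP calls so the parser can be exercised against a pre-saved page instead of the live site. A hypothetical sketch (not the actual test code from this commit; the fake HTML and assertion are illustrative only):

import requests
import requests_mock

from lep_downloader import config as conf


def test_mocked_archive_page() -> None:
    """Requests to the archive URL are answered locally, without network."""
    fake_html = "<html><div class='entry-content'></div></html>"
    with requests_mock.Mocker() as mock:
        # Register a canned response for the archive URL.
        mock.get(conf.ARCHIVE_URL, text=fake_html)
        response = requests.get(conf.ARCHIVE_URL)
        assert response.text == fake_html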
Showing 31 changed files with 22,066 additions and 36 deletions.
@@ -0,0 +1 @@
"""CLI main click group."""
@@ -0,0 +1 @@
"""Package with CLI commands."""
@@ -0,0 +1 @@
"""Download command."""
@@ -0,0 +1 @@
"""Convert command."""
@@ -0,0 +1,31 @@
"""App configuration module."""

ARCHIVE_URL = "https://hotenov.com"

LOCAL_ARCHIVE_HTML = "2021-08-10_lep-archive-page-content-pretty.html"

SHORT_LINKS_MAPPING_DICT = {
    "http://wp.me/p4IuUx-7PL": "https://teacherluke.co.uk/2017/06/20/460-catching-up-with-amber-paul-6-feat-sarah-donnelly/",
    "http://wp.me/p4IuUx-7C6": "https://teacherluke.co.uk/2017/04/25/444-the-rick-thompson-report-snap-general-election-2017/",
    "http://wp.me/p4IuUx-7C4": "https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/",
    "http://wp.me/p4IuUx-7BQ": "https://teacherluke.co.uk/2017/04/21/442-the-trip-to-japan-part-1/",
    "http://wp.me/p4IuUx-7BO": "https://teacherluke.co.uk/2017/04/18/441-andy-johnson-at-the-iatefl-conference/",
    "http://wp.me/p4IuUx-7Av": "https://teacherluke.co.uk/2017/03/28/436-the-return-of-the-lying-game-with-amber-paul-video/",
    "http://wp.me/p4IuUx-7zK": "https://teacherluke.co.uk/2017/03/26/i-was-interviewed-on-my-fluent-podcast-with-daniel-goodson/",
    "http://wp.me/p4IuUx-7sg": "https://teacherluke.co.uk/2017/01/10/415-with-the-family-part-3-more-encounters-with-famous-people/",
    "https://wp.me/p4IuUx-29": "https://teacherluke.co.uk/2011/10/11/notting-hill-carnival-video-frustration-out-takes/",
}

# MISSPELLED_LTD = ".co.ukm"

IRRELEVANT_LINKS = ("https://wp.me/P4IuUx-82H",)

EPISODE_LINK_RE = r"https?://((?P<short>wp\.me/p4IuUx-[\w-]+)|(teacherluke\.(co\.uk|wordpress\.com)/(?P<date>\d{4}/\d{2}/\d{2})/))"

LINK_TEXTS_MAPPING = {
    "https://teacherluke.co.uk/2018/04/18/522-learning-english-at-summer-school-in-the-uk-a-rambling-chat-with-raphael-miller/": "522. Learning English at Summer School in the UK (A Rambling Chat with Raphael Miller)",
    "https://teacherluke.co.uk/2017/08/14/website-content-lukes-criminal-past-zep-episode-185/": "[Website content] Luke’s Criminal Past (ZEP Episode 185)",
    "https://teacherluke.co.uk/2017/05/26/i-was-invited-onto-the-english-across-the-pond-podcast/": "[Website content] I was invited onto the “English Across The Pond” Podcast",
    "https://teacherluke.co.uk/2016/03/20/i-was-invited-onto-craig-wealands-weekly-blab-and-we-talked-about-comedy-video/": "[VIDEO] I was invited onto Craig Wealand’s weekly Blab, and we talked about comedy",
}
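For orientation (an illustrative sketch, not part of the commit): EPISODE_LINK_RE captures short wp.me links in the named group 'short' and dated teacherluke.co.uk / wordpress.com post URLs in the named group 'date'. Using URLs taken from the mappings above:

import re

from lep_downloader import config as conf

pattern = re.compile(conf.EPISODE_LINK_RE, re.IGNORECASE)

# A short link is matched via the 'short' named group.
match = pattern.match("http://wp.me/p4IuUx-7C4")
assert match and match.group("short") == "wp.me/p4IuUx-7C4"

# A dated episode URL is matched via the 'date' named group.
match = pattern.match("https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/")
assert match and match.group("date") == "2017/04/21"

# Anything else is discarded by the parser's regex filter.
assert pattern.match("https://example.com/about/") is None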
@@ -0,0 +1 @@
"""LEP module for downloading logic."""
@@ -0,0 +1 @@
"""LEP module for general logic and classes."""
@@ -0,0 +1,150 @@
"""LEP module for parsing logic."""
import copy
import re
from typing import Any
from typing import List

import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

from lep_downloader import config as conf


deleted_links = []
regex = conf.EPISODE_LINK_RE
ep_pattern = re.compile(regex, re.IGNORECASE)
s = requests.Session()


def get_web_page_html_text(page_url: str, session: requests.Session) -> Any:
    """Return HTML text of LEP archive page."""
    # Note: the 'with' block closes the session on exit, so a session
    # passed here cannot be reused for further requests.
    with session:
        try:
            resp = session.get(page_url, timeout=(6, 33))
            if not resp.ok:
                resp.raise_for_status()
        except requests.exceptions.HTTPError:
            raise
        except requests.exceptions.Timeout:
            raise
        except requests.exceptions.ConnectionError:
            raise
        except Exception:
            raise
        else:
            resp.encoding = "utf-8"
            return resp.text


def get_all_links_from_soup(soup_obj: BeautifulSoup) -> List[str]:
    """Return list of links from HTML block."""
    all_links: List[str] = []
    all_tags_a = soup_obj("a")
    for tag_a in all_tags_a:
        all_links.append(tag_a["href"].strip())

    return all_links


def replace_misspelled_link(soup_obj: BeautifulSoup) -> BeautifulSoup:
    """Replace the link containing the misspelled '.ukm' TLD."""
    modified_soup = copy.copy(soup_obj)  # TODO: Is the copy really needed?
    misspelled_tag_a = modified_soup.find(
        "a", href="https://teacherluke.co.ukm/2012/08/06/london-olympics-2012/"
    )
    if misspelled_tag_a:
        misspelled_tag_a[
            "href"
        ] = "https://teacherluke.co.uk/2012/08/06/london-olympics-2012/"
    del misspelled_tag_a
    return modified_soup


def remove_irrelevant_links(links: List[str]) -> List[str]:
    """Return list of links without known irrelevant links."""
    # Build a new list instead of deleting by index while iterating,
    # which would skip elements after the first deletion.
    relevant: List[str] = []
    for link in links:
        if link in conf.IRRELEVANT_LINKS:
            deleted_links.append(link)
        else:
            relevant.append(link)
    return relevant


def remove_not_episode_links_by_regex_pattern(links: List[str]) -> List[str]:
    """Return list of accepted episode (post) links."""
    result: List[str] = []
    for link in links:
        match = ep_pattern.match(link)
        if match:
            result.append(link)
        else:
            deleted_links.append(link)
    return result


def get_links_text_by_href(
    soup_obj: BeautifulSoup,
    links: List[str],
) -> List[str]:
    """Return texts of <a></a> tags, looked up by their href attributes."""
    link_strings = []
    for url in links:
        a_tag = soup_obj.find("a", href=url)
        if url in conf.LINK_TEXTS_MAPPING:
            link_string = conf.LINK_TEXTS_MAPPING[url]
        else:
            link_string = " ".join(a_tag.stripped_strings)
        link_strings.append(link_string)

    return link_strings


def substitute_short_links(unique_links: List[str]) -> List[str]:
    """Return list of links with final location for short links."""
    final_links = copy.deepcopy(unique_links)

    for key, value in conf.SHORT_LINKS_MAPPING_DICT.items():
        try:
            short_link_index = unique_links.index(key)
            final_links[short_link_index] = value
        except ValueError:
            print(f"[WARNING]: Short link not found in archive: {key}")
    return final_links


def get_archive_parsing_results(archive_url: str) -> Any:
    """Return tuple of valid episode links, discarded links, and link texts."""
    html_page = get_web_page_html_text(archive_url, s)
    only_div_entry_content = SoupStrainer("div", class_="entry-content")
    soup_div = BeautifulSoup(html_page, "lxml", parse_only=only_div_entry_content)

    if len(soup_div) > 0:
        modified_soup_div = replace_misspelled_link(soup_div)
        all_links = get_all_links_from_soup(modified_soup_div)
        cleaned_links = remove_irrelevant_links(all_links)
        cleaned_links = remove_not_episode_links_by_regex_pattern(cleaned_links)

        # Get unique links with preserved order (dicts keep insertion
        # order in Python 3.7+)
        unique_links = list(dict.fromkeys(cleaned_links))

        # Get list of link labels
        link_strings = get_links_text_by_href(modified_soup_div, unique_links)

        final_list = substitute_short_links(unique_links)
        parsing_result = (final_list, deleted_links, link_strings)
        return parsing_result
    else:
        print("[ERROR] Can't parse this page: main <div> was not found")
        return None


def parse_single_page(url: str, session: requests.Session) -> Any:
    """Return result of parsing a single page (currently its title)."""
    req = session.get(url, timeout=(3.05, 27))
    req.encoding = "utf-8"
    html_text = req.text

    soup_obj = BeautifulSoup(html_text, "lxml")
    page_title = soup_obj.title.string
    result = page_title
    return result
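A minimal usage sketch of the new module (not part of the commit; it assumes network access to ARCHIVE_URL and the 'lep_downloader' package layout used in the imports above):

from lep_downloader import config as conf
from lep_downloader import parser

# Parse the archive page and unpack the three-part result.
result = parser.get_archive_parsing_results(conf.ARCHIVE_URL)
if result is not None:
    episode_links, discarded_links, link_texts = result
    print(f"Valid episode links: {len(episode_links)}")
    print(f"Discarded links: {len(discarded_links)}")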