Add alpha 'parser' module (archive page) (#6)
* chore: 🙈 Return ignoring 'dist' folder after deleting old files

* chore: Add blank files for future code structure

* chore: ➕ Add requests, beautifulsoup4, lxml for parsing HTML

2.26.0 for requests
4.9.3 for beautifulsoup4
4.6.3 for lxml

* chore: ➕ Add 'requests-file' to dev deps for future unit-tests

* chore: ➕ Add requests-mock 1.9.3 to dev dependencies

* feat(parser): ✨ Add function to parse archive page

Decouple separate modules: config, parser, downloader from the lep module

* test(parser): ✅ Add test for checking mocked response of archive page

Add test file with a pre-saved archive HTML page

* chore: Add test HTML files for several episodes

to mock them in tests

* refactor(parser): 🚧 Update two functions: parsing all links and getting link text by href

* chore: 🔧 Add mapping dict for 4 links and their text

* test: 🚧 Add PoC test for mocking several episode pages

* chore: 🔧 Update mypy settings in pyproject.toml

- Decrease 'fail_under' down to 85 (during active development stage)
- Use new syntax (section) for ignoring imports in mypy for several packages

* chore: 🔧 Rename Tuple with irrelevant links

* chore: ♻️ Improve typings and function names

Add raised exceptions (try/except) to the function that gets HTML text

* test(parser): ✅ Add several general tests for parsing functions

* ci: 🔧 Add installation of 'requests_mock' into 'tests' Nox session

* ci: 🔧 Exclude HTML files from 'pre-commit' hooks

* style: 🎨 Commit changes made by 'pre-commit' hooks

* style: 🎨 Fix flake8 errors

* chore: ➕ Add flake8 plugins to dev deps

flake8-black (0.2.3)
flake8-import-order (0.18.1)

* chore: 🔧 Change flake8 config: max-line-length = 120 and ignore long lines (B950)

* ci: 🔧 Add installation of 'requests_mock' into 'typeguard' Nox session

* chore: ➖ Remove unused 'requests-file' from dev-deps
hotenov authored Sep 20, 2021
1 parent 434c2e4 commit d1c44f6
Showing 31 changed files with 22,066 additions and 36 deletions.
4 changes: 2 additions & 2 deletions .flake8
@@ -1,7 +1,7 @@
[flake8]
select = B,B9,C,D,DAR,E,F,N,RST,S,W
ignore = E203,E501,RST201,RST203,RST301,W503
max-line-length = 80
ignore = E203,E501,RST201,RST203,RST301,W503,B950
max-line-length = 120
max-complexity = 10
docstring-convention = google
per-file-ignores = tests/*:S101
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -5,6 +5,7 @@ exclude: |
luke_english_podcast_downloader.py|
dist/2.0.5/LEP-downloader.py
)$
repos:
- repo: local
hooks:
@@ -34,6 +35,7 @@ repos:
language: system
types: [text]
stages: [commit, push, manual]
exclude_types: [html]
- id: flake8
name: flake8
entry: flake8
@@ -52,7 +54,9 @@ repos:
language: system
types: [text]
stages: [commit, push, manual]
exclude_types: [html]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.3.0
hooks:
- id: prettier
exclude_types: [html]
4 changes: 2 additions & 2 deletions noxfile.py
@@ -129,7 +129,7 @@ def mypy(session: Session) -> None:
def tests(session: Session) -> None:
"""Run the test suite."""
session.install(".")
session.install("coverage[toml]", "pytest", "pygments")
session.install("coverage[toml]", "pytest", "pygments", "requests_mock")
try:
session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
finally:
@@ -154,7 +154,7 @@ def coverage(session: Session) -> None:
def typeguard(session: Session) -> None:
"""Runtime type checking using Typeguard."""
session.install(".")
session.install("pytest", "typeguard", "pygments")
session.install("pytest", "typeguard", "pygments", "requests_mock")
session.run("pytest", f"--typeguard-packages={package}", *session.posargs)


197 changes: 173 additions & 24 deletions poetry.lock

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions pyproject.toml
@@ -20,6 +20,9 @@ Changelog = "https://github.com/hotenov/LEP-downloader/releases"
[tool.poetry.dependencies]
python = "^3.7.0"
click = "^8.0.1"
requests = "^2.26.0"
beautifulsoup4 = "^4.9.3"
lxml = "^4.6.3"

[tool.poetry.dev-dependencies]
pytest = "^6.2.4"
@@ -44,6 +47,9 @@ pre-commit-hooks = "^4.0.1"
sphinx-rtd-theme = "^0.5.2"
sphinx-click = "^3.0.1"
Pygments = "^2.9.0"
requests-mock = "^1.9.3"
flake8-black = "^0.2.3"
flake8-import-order = "^0.18.1"

[tool.poetry.scripts]
lep-downloader = "lep_downloader.__main__:main"
@@ -57,7 +63,7 @@ source = ["lep_downloader"]

[tool.coverage.report]
show_missing = true
fail_under = 100
fail_under = 85

[tool.mypy]
strict = true
@@ -66,13 +72,15 @@ show_column_numbers = true
show_error_codes = true
show_error_context = true

[mypy-desert]
ignore_missing_imports = true

[mypy-pytest]
ignore_missing_imports = true

[mypy-pytest_mock]
[[tool.mypy.overrides]]
module = [
'desert',
'pytest',
'pytest_mock.*',
'requests.*',
'bs4.*',
'requests_mock.*',
]
ignore_missing_imports = true

[build-system]
1 change: 1 addition & 0 deletions src/lep_downloader/cli.py
@@ -0,0 +1 @@
"""CLI main click group."""
1 change: 1 addition & 0 deletions src/lep_downloader/commands/__init__.py
@@ -0,0 +1 @@
"""Package with CLI commands."""
1 change: 1 addition & 0 deletions src/lep_downloader/commands/download.py
@@ -0,0 +1 @@
"""Download command."""
1 change: 1 addition & 0 deletions src/lep_downloader/commands/parse.py
@@ -0,0 +1 @@
"""Convert command."""
31 changes: 31 additions & 0 deletions src/lep_downloader/config.py
@@ -0,0 +1,31 @@
"""App configuration module."""


ARCHIVE_URL = "https://hotenov.com"

LOCAL_ARCHIVE_HTML = "2021-08-10_lep-archive-page-content-pretty.html"

SHORT_LINKS_MAPPING_DICT = {
"http://wp.me/p4IuUx-7PL": "https://teacherluke.co.uk/2017/06/20/460-catching-up-with-amber-paul-6-feat-sarah-donnelly/",
"http://wp.me/p4IuUx-7C6": "https://teacherluke.co.uk/2017/04/25/444-the-rick-thompson-report-snap-general-election-2017/",
"http://wp.me/p4IuUx-7C4": "https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/",
"http://wp.me/p4IuUx-7BQ": "https://teacherluke.co.uk/2017/04/21/442-the-trip-to-japan-part-1/",
"http://wp.me/p4IuUx-7BO": "https://teacherluke.co.uk/2017/04/18/441-andy-johnson-at-the-iatefl-conference/",
"http://wp.me/p4IuUx-7Av": "https://teacherluke.co.uk/2017/03/28/436-the-return-of-the-lying-game-with-amber-paul-video/",
"http://wp.me/p4IuUx-7zK": "https://teacherluke.co.uk/2017/03/26/i-was-interviewed-on-my-fluent-podcast-with-daniel-goodson/",
"http://wp.me/p4IuUx-7sg": "https://teacherluke.co.uk/2017/01/10/415-with-the-family-part-3-more-encounters-with-famous-people/",
"https://wp.me/p4IuUx-29": "https://teacherluke.co.uk/2011/10/11/notting-hill-carnival-video-frustration-out-takes/",
}

# MISSPELLED_LTD = ".co.ukm"

IRRELEVANT_LINKS = ("https://wp.me/P4IuUx-82H",)

EPISODE_LINK_RE = r"https?://((?P<short>wp\.me/p4IuUx-[\w-]+)|(teacherluke\.(co\.uk|wordpress\.com)/(?P<date>\d{4}/\d{2}/\d{2})/))"

LINK_TEXTS_MAPPING = {
"https://teacherluke.co.uk/2018/04/18/522-learning-english-at-summer-school-in-the-uk-a-rambling-chat-with-raphael-miller/": "522. Learning English at Summer School in the UK (A Rambling Chat with Raphael Miller)",
"https://teacherluke.co.uk/2017/08/14/website-content-lukes-criminal-past-zep-episode-185/": "[Website content] Luke’s Criminal Past (ZEP Episode 185)",
"https://teacherluke.co.uk/2017/05/26/i-was-invited-onto-the-english-across-the-pond-podcast/": "[Website content] I was invited onto the “English Across The Pond” Podcast",
"https://teacherluke.co.uk/2016/03/20/i-was-invited-onto-craig-wealands-weekly-blab-and-we-talked-about-comedy-video/": "[VIDEO] I was invited onto Craig Wealand’s weekly Blab, and we talked about comedy",
}
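
A minimal sketch of how EPISODE_LINK_RE and the mappings above are expected to behave (illustrative only; assumes the lep_downloader package is installed, and the example URLs are taken from the dictionaries above):

import re

from lep_downloader import config as conf

ep_pattern = re.compile(conf.EPISODE_LINK_RE, re.IGNORECASE)

# Dated post URLs match via the 'date' named group
assert ep_pattern.match("https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/")
# Short 'wp.me' URLs match via the 'short' named group ...
assert ep_pattern.match("http://wp.me/p4IuUx-7C4")
# ... and are later substituted with their final location
assert conf.SHORT_LINKS_MAPPING_DICT["http://wp.me/p4IuUx-7C4"].startswith("https://teacherluke.co.uk/")
# Anything else is discarded by the parser as a non-episode link
assert ep_pattern.match("https://example.com/some-page/") is None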
1 change: 1 addition & 0 deletions src/lep_downloader/downloader.py
@@ -0,0 +1 @@
"""LEP module for downloading logic."""
1 change: 1 addition & 0 deletions src/lep_downloader/lep.py
@@ -0,0 +1 @@
"""LEP module for general logic and classes."""
150 changes: 150 additions & 0 deletions src/lep_downloader/parser.py
@@ -0,0 +1,150 @@
"""LEP module for parsing logic."""
import copy
import re
from typing import Any
from typing import List

import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

from lep_downloader import config as conf


deleted_links = []
regex = conf.EPISODE_LINK_RE
ep_pattern = re.compile(regex, re.IGNORECASE)
s = requests.Session()


def get_web_page_html_text(page_url: str, session: requests.Session) -> Any:
"""Return HTML text of LEP archive page."""
with session:
try:
resp = session.get(page_url, timeout=(6, 33))
if not resp.ok:
resp.raise_for_status()
except requests.exceptions.HTTPError:
raise
except requests.exceptions.Timeout:
raise
except requests.exceptions.ConnectionError:
raise
except Exception:
raise
else:
resp.encoding = "utf-8"
return resp.text


def get_all_links_from_soup(soup_obj: BeautifulSoup) -> List[str]:
"""Return list of links from HTML block."""
all_links: List[str] = []
all_tags_a = soup_obj("a")
for tag_a in all_tags_a:
all_links.append(tag_a["href"].strip())

return all_links


def replace_misspelled_link(soup_obj: BeautifulSoup) -> BeautifulSoup:
"""Replace link with '.ukm' misspelled LTD."""
modified_soup = copy.copy(soup_obj)  # TODO: Is copying really needed?
misspelled_tag_a = modified_soup.find(
"a", href="https://teacherluke.co.ukm/2012/08/06/london-olympics-2012/"
)
if misspelled_tag_a:
misspelled_tag_a[
"href"
] = "https://teacherluke.co.uk/2012/08/06/london-olympics-2012/"
del misspelled_tag_a
return modified_soup


def remove_irrelevant_links(links: List[str]) -> List[str]:
"""Return list of links without known irrelevant links."""
for i, link in enumerate(links[:]):
if link in conf.IRRELEVANT_LINKS:
deleted_links.append(link)
del links[i]
return links


def remove_not_episode_links_by_regex_pattern(links: List[str]) -> List[str]:
"""Return list of adopted episode (post) links."""
result: List[str] = []
for link in links:
match = ep_pattern.match(link)
if match:
result.append(link)
else:
deleted_links.append(link)
return result


def get_links_text_by_href(
soup_obj: BeautifulSoup,
links: List[str],
) -> List[str]:
"""Return text of <a></a> tag by its href attribute."""
link_strings = []
for url in links:
a_tag = soup_obj.find("a", href=url)
if url in [*conf.LINK_TEXTS_MAPPING]:
link_string = conf.LINK_TEXTS_MAPPING[url]
else:
link_string = " ".join([text for text in a_tag.stripped_strings])
link_strings.append(link_string)

return link_strings


def substitute_short_links(unique_links: List[str]) -> List[str]:
"""Return list of links with final location for short links."""
final_links = copy.deepcopy(unique_links)

for key, value in conf.SHORT_LINKS_MAPPING_DICT.items():
try:
short_link_index = unique_links.index(key)
final_links[short_link_index] = value
except ValueError:
print(f"[WARNING]: No short links: {key}")
return final_links


def get_archive_parsing_results(archive_url: str) -> Any:
"""Return Tuple with valid episode links and discarded links."""
html_page = get_web_page_html_text(archive_url, s)
only_div_entry_content = SoupStrainer("div", class_="entry-content")
soup_div = BeautifulSoup(html_page, "lxml", parse_only=only_div_entry_content)

if len(soup_div) > 0:
modified_soup_div = replace_misspelled_link(soup_div)
all_links = get_all_links_from_soup(modified_soup_div)
cleaned_links = remove_irrelevant_links(all_links)
cleaned_links = remove_not_episode_links_by_regex_pattern(cleaned_links)

# Get unique links with preserved order for Python 3.7+
unique_links = list(dict.fromkeys(cleaned_links))

# Get list of 'link labels'
link_strings = get_links_text_by_href(modified_soup_div, unique_links)

final_list = substitute_short_links(unique_links)
parsing_result = (final_list, deleted_links, link_strings)
return parsing_result
else:
print("[ERROR] Can't parse this page: Main <div> is not found")
return None


def parse_single_page(url: str, session: requests.Session) -> Any:
"""Returns result of parsing of single page."""
req = session.get(url, timeout=(3.05, 27))
req.encoding = "utf-8"
html_text = req.text

soup_obj = BeautifulSoup(html_text, "lxml")
page_title = soup_obj.title.string
result = page_title
return result
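
A minimal usage sketch for the new parser module (assuming the package is installed and ARCHIVE_URL in config.py points at a page whose 'entry-content' div holds the episode links):

import requests

from lep_downloader import config as conf
from lep_downloader import parser

# Parse the archive page; on success a tuple of
# (episode_links, deleted_links, link_texts) is returned, otherwise None
result = parser.get_archive_parsing_results(conf.ARCHIVE_URL)
if result is not None:
    episode_links, deleted_links, link_texts = result
    print(f"Kept {len(episode_links)} episode links, discarded {len(deleted_links)}")
    # Links and their texts share the same order, so they can be paired up
    for url, text in zip(episode_links, link_texts):
        print(text, "->", url)
    # Fetch one episode page with a fresh session and print its <title>
    with requests.Session() as page_session:
        print(parser.parse_single_page(episode_links[0], page_session))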