Add alpha 'parser' module (archive page) (#6)
* chore: 🙈 Return ignoring 'dist' folder after deleting old files
* chore: Add blank files for future code structure
* chore: ➕ Add requests, beautifulsoup4, lxml for parsing HTML (requests 2.26.0, beautifulsoup4 4.9.3, lxml 4.6.3)
* chore: ➕ Add 'requests-file' to dev deps for future unit tests
* chore: ➕ Add requests-mock 1.9.3 to dev dependencies
* feat(parser): ✨ Add function to parse archive page; decouple separate modules (config, parser, downloader) from lep module
* test(parser): ✅ Add test checking mocked response of archive page (sketched below); add test file with pre-saved archive HTML page
* chore: Add test HTML files for several episodes to mock them in tests
* refactor(parser): 🚧 Update two functions for parsing all links and getting link text by href
* chore: 🔧 Add mapping dict for 4 links and their text
* test: 🚧 Add PoC test for mocking several episode pages
* chore: 🔧 Update mypy settings in pyproject.toml: decrease 'fail_under' down to 85 (during active development stage); use new section syntax for ignoring imports for several packages
* chore: 🔧 Rename tuple with irrelevant links
* chore: ♻️ Improve typings and function names; add raised exceptions (try/except) to the function that gets HTML text
* test(parser): ✅ Add several general tests for parsing functions
* ci: 🔧 Add installation of 'requests_mock' into 'tests' Nox session
* ci: 🔧 Exclude HTML files from 'pre-commit' hooks
* style: 🎨 Commit changes made by 'pre-commit' hooks
* style: 🎨 Fix flake8 errors
* chore: ➕ Add flake8 plugins to dev deps: flake8-black (0.2.3), flake8-import-order (0.18.1)
* chore: 🔧 Change flake8 config: max-line-length = 120 and ignore long lines (B950)
* ci: 🔧 Add installation of 'requests_mock' into 'typeguard' Nox session
* chore: ➖ Remove unused 'requests-file' from dev deps
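The mocked-archive tests mentioned above follow the standard requests-mock pattern: the Mocker intercepts HTTP calls so the parser can be exercised against a pre-saved page instead of the live site. A hypothetical sketch (not the actual test code from this commit; the fake HTML and assertion are illustrative only):

import requests
import requests_mock

from lep_downloader import config as conf


def test_mocked_archive_page() -> None:
    """Requests to the archive URL are answered locally, without network."""
    fake_html = "<html><div class='entry-content'></div></html>"
    with requests_mock.Mocker() as mock:
        # Register a canned response for the archive URL.
        mock.get(conf.ARCHIVE_URL, text=fake_html)
        response = requests.get(conf.ARCHIVE_URL)
        assert response.text == fake_html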
Showing 31 changed files with 22,066 additions and 36 deletions.
@@ -0,0 +1 @@
"""CLI main click group."""
@@ -0,0 +1 @@
"""Package with CLI commands."""
@@ -0,0 +1 @@
"""Download command."""
@@ -0,0 +1 @@
"""Convert command."""
@@ -0,0 +1,31 @@
"""App configuration module."""

ARCHIVE_URL = "https://hotenov.com"

LOCAL_ARCHIVE_HTML = "2021-08-10_lep-archive-page-content-pretty.html"

SHORT_LINKS_MAPPING_DICT = {
    "http://wp.me/p4IuUx-7PL": "https://teacherluke.co.uk/2017/06/20/460-catching-up-with-amber-paul-6-feat-sarah-donnelly/",
    "http://wp.me/p4IuUx-7C6": "https://teacherluke.co.uk/2017/04/25/444-the-rick-thompson-report-snap-general-election-2017/",
    "http://wp.me/p4IuUx-7C4": "https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/",
    "http://wp.me/p4IuUx-7BQ": "https://teacherluke.co.uk/2017/04/21/442-the-trip-to-japan-part-1/",
    "http://wp.me/p4IuUx-7BO": "https://teacherluke.co.uk/2017/04/18/441-andy-johnson-at-the-iatefl-conference/",
    "http://wp.me/p4IuUx-7Av": "https://teacherluke.co.uk/2017/03/28/436-the-return-of-the-lying-game-with-amber-paul-video/",
    "http://wp.me/p4IuUx-7zK": "https://teacherluke.co.uk/2017/03/26/i-was-interviewed-on-my-fluent-podcast-with-daniel-goodson/",
    "http://wp.me/p4IuUx-7sg": "https://teacherluke.co.uk/2017/01/10/415-with-the-family-part-3-more-encounters-with-famous-people/",
    "https://wp.me/p4IuUx-29": "https://teacherluke.co.uk/2011/10/11/notting-hill-carnival-video-frustration-out-takes/",
}

# MISSPELLED_LTD = ".co.ukm"

IRRELEVANT_LINKS = ("https://wp.me/P4IuUx-82H",)

EPISODE_LINK_RE = r"https?://((?P<short>wp\.me/p4IuUx-[\w-]+)|(teacherluke\.(co\.uk|wordpress\.com)/(?P<date>\d{4}/\d{2}/\d{2})/))"

LINK_TEXTS_MAPPING = {
    "https://teacherluke.co.uk/2018/04/18/522-learning-english-at-summer-school-in-the-uk-a-rambling-chat-with-raphael-miller/": "522. Learning English at Summer School in the UK (A Rambling Chat with Raphael Miller)",
    "https://teacherluke.co.uk/2017/08/14/website-content-lukes-criminal-past-zep-episode-185/": "[Website content] Luke’s Criminal Past (ZEP Episode 185)",
    "https://teacherluke.co.uk/2017/05/26/i-was-invited-onto-the-english-across-the-pond-podcast/": "[Website content] I was invited onto the “English Across The Pond” Podcast",
    "https://teacherluke.co.uk/2016/03/20/i-was-invited-onto-craig-wealands-weekly-blab-and-we-talked-about-comedy-video/": "[VIDEO] I was invited onto Craig Wealand’s weekly Blab, and we talked about comedy",
}
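For orientation (an illustrative sketch, not part of the commit): EPISODE_LINK_RE captures short wp.me links in the named group 'short' and dated teacherluke.co.uk / wordpress.com post URLs in the named group 'date'. Using URLs taken from the mappings above:

import re

from lep_downloader import config as conf

pattern = re.compile(conf.EPISODE_LINK_RE, re.IGNORECASE)

# A short link is matched via the 'short' named group.
match = pattern.match("http://wp.me/p4IuUx-7C4")
assert match and match.group("short") == "wp.me/p4IuUx-7C4"

# A dated episode URL is matched via the 'date' named group.
match = pattern.match("https://teacherluke.co.uk/2017/04/21/443-the-trip-to-japan-part-2/")
assert match and match.group("date") == "2017/04/21"

# Anything else is discarded by the parser's regex filter.
assert pattern.match("https://example.com/about/") is None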
@@ -0,0 +1 @@
"""LEP module for downloading logic."""
@@ -0,0 +1 @@
"""LEP module for general logic and classes."""
@@ -0,0 +1,150 @@
"""LEP module for parsing logic."""
import copy
import re
from typing import Any
from typing import List

import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

from lep_downloader import config as conf


deleted_links = []
regex = conf.EPISODE_LINK_RE
ep_pattern = re.compile(regex, re.IGNORECASE)
s = requests.Session()


def get_web_page_html_text(page_url: str, session: requests.Session) -> Any:
    """Return HTML text of LEP archive page."""
    # Note: the 'with' block closes the session on exit, so a session
    # passed here cannot be reused for further requests.
    with session:
        try:
            resp = session.get(page_url, timeout=(6, 33))
            if not resp.ok:
                resp.raise_for_status()
        except requests.exceptions.HTTPError:
            raise
        except requests.exceptions.Timeout:
            raise
        except requests.exceptions.ConnectionError:
            raise
        except Exception:
            raise
        else:
            resp.encoding = "utf-8"
            return resp.text


def get_all_links_from_soup(soup_obj: BeautifulSoup) -> List[str]:
    """Return list of links from HTML block."""
    all_links: List[str] = []
    all_tags_a = soup_obj("a")
    for tag_a in all_tags_a:
        all_links.append(tag_a["href"].strip())

    return all_links


def replace_misspelled_link(soup_obj: BeautifulSoup) -> BeautifulSoup:
    """Replace the link containing the misspelled '.ukm' TLD."""
    modified_soup = copy.copy(soup_obj)  # TODO: Is the copy really needed?
    misspelled_tag_a = modified_soup.find(
        "a", href="https://teacherluke.co.ukm/2012/08/06/london-olympics-2012/"
    )
    if misspelled_tag_a:
        misspelled_tag_a[
            "href"
        ] = "https://teacherluke.co.uk/2012/08/06/london-olympics-2012/"
    del misspelled_tag_a
    return modified_soup


def remove_irrelevant_links(links: List[str]) -> List[str]:
    """Return list of links without known irrelevant links."""
    # Build a new list instead of deleting by index while iterating,
    # which would skip elements after the first deletion.
    relevant: List[str] = []
    for link in links:
        if link in conf.IRRELEVANT_LINKS:
            deleted_links.append(link)
        else:
            relevant.append(link)
    return relevant


def remove_not_episode_links_by_regex_pattern(links: List[str]) -> List[str]:
    """Return list of accepted episode (post) links."""
    result: List[str] = []
    for link in links:
        match = ep_pattern.match(link)
        if match:
            result.append(link)
        else:
            deleted_links.append(link)
    return result


def get_links_text_by_href(
    soup_obj: BeautifulSoup,
    links: List[str],
) -> List[str]:
    """Return texts of <a></a> tags, looked up by their href attributes."""
    link_strings = []
    for url in links:
        a_tag = soup_obj.find("a", href=url)
        if url in conf.LINK_TEXTS_MAPPING:
            link_string = conf.LINK_TEXTS_MAPPING[url]
        else:
            link_string = " ".join(a_tag.stripped_strings)
        link_strings.append(link_string)

    return link_strings


def substitute_short_links(unique_links: List[str]) -> List[str]:
    """Return list of links with final location for short links."""
    final_links = copy.deepcopy(unique_links)

    for key, value in conf.SHORT_LINKS_MAPPING_DICT.items():
        try:
            short_link_index = unique_links.index(key)
            final_links[short_link_index] = value
        except ValueError:
            print(f"[WARNING]: Short link not found in archive: {key}")
    return final_links


def get_archive_parsing_results(archive_url: str) -> Any:
    """Return tuple of valid episode links, discarded links, and link texts."""
    html_page = get_web_page_html_text(archive_url, s)
    only_div_entry_content = SoupStrainer("div", class_="entry-content")
    soup_div = BeautifulSoup(html_page, "lxml", parse_only=only_div_entry_content)

    if len(soup_div) > 0:
        modified_soup_div = replace_misspelled_link(soup_div)
        all_links = get_all_links_from_soup(modified_soup_div)
        cleaned_links = remove_irrelevant_links(all_links)
        cleaned_links = remove_not_episode_links_by_regex_pattern(cleaned_links)

        # Get unique links with preserved order (dicts keep insertion
        # order in Python 3.7+)
        unique_links = list(dict.fromkeys(cleaned_links))

        # Get list of link labels
        link_strings = get_links_text_by_href(modified_soup_div, unique_links)

        final_list = substitute_short_links(unique_links)
        parsing_result = (final_list, deleted_links, link_strings)
        return parsing_result
    else:
        print("[ERROR] Can't parse this page: main <div> was not found")
        return None


def parse_single_page(url: str, session: requests.Session) -> Any:
    """Return result of parsing a single page (currently its title)."""
    req = session.get(url, timeout=(3.05, 27))
    req.encoding = "utf-8"
    html_text = req.text

    soup_obj = BeautifulSoup(html_text, "lxml")
    page_title = soup_obj.title.string
    result = page_title
    return result
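A minimal usage sketch of the new module (not part of the commit; it assumes network access to ARCHIVE_URL and the 'lep_downloader' package layout used in the imports above):

from lep_downloader import config as conf
from lep_downloader import parser

# Parse the archive page and unpack the three-part result.
result = parser.get_archive_parsing_results(conf.ARCHIVE_URL)
if result is not None:
    episode_links, discarded_links, link_texts = result
    print(f"Valid episode links: {len(episode_links)}")
    print(f"Discarded links: {len(discarded_links)}")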