Add writing and updating JSON database (#7)

New changes from #6: * fix(parser): Make links texts safe for windows path * test(parser): ✅ Update test with getting link text by href * chore: ➕ Add 'rope' package in dev deps for refactoring in VS Code * feat(parser): ✨ Parse episode page (date and episode number) Add LepEpisode class (with a couple of basic attrs) Add parsing functions for episode date and number Add replacing of invalid path characters for link title Add and update unit-tests to demonstrate that episode parsing (isolated) works * refactor(parser): ♻️ Add returning of URL final location during getting response Return error text instead of raising exceptions * test(parser): ✅ Add tests to check final location after redirects Update tests for different exceptions during getting response * feat(parser): ✨ Add index generating for post URL (for quick search / reference in the future) * test(parser): ✅ Add two tests to check index generation * feat: Add 'admin_note' attribute to LepEpisode class * feat(parser): Add logic for bad response of parsing page * test(parser): ✅ Update tests taking into account response status for parsing page Add test to check return value (None) for non-episode link Black formatting changes * style(parser): 🏷️ Fix 'mypy' and 'pre-commit' errors * feat(parser): ✨ Add 'parsing_utc' attribute for LepEpisode class Add storing parsing datetime in each episode * test(parser): ✅ Update test to check parsing all links from list Add non-episode HTML and link to check when parsed episode is empty Return 'fail_under' = 100 in pyproject.toml * feat(parser): ✨ Add function to parse links to episode audio (parts) Add attributes 'post_type' and 'audios' for LepEpisode class * test(parser): ✅ Add minimum sufficient tests (to satisfy Coverage) * refactor(parser): ♻️ Unify parsing part of archive page (tag <article> only) Rename soup objects * style: 🎨 Fix 'pre-commit' errors for imports order * refactor: ♻️ Change default value for 'audios' attribute to None because 'flake8-bugbear' 
error B006 was raised * perf(parser): ⚡ Change algorithm to extract episode links and their texts Remove mapping dict i.e. there are no duplicates now * test(parser): ✅ Update tests according to new archive parsing algorithm Remove tests for two deleted functions * test(parser): ✅ Add two tests to check parsing mp3 links for certain cases Exclude links to separate short audio No duplicates for 'audio' word in the URL * style: ✏️ Fix wrong writing of 'non-episode' word * feat(parser): ✨ Add function for descending sorting of parsed episodes Change returned type of episodes list to List[LepEpisode] * test(parser): ✅ Add test to check episodes sorting Modify existing tests to follow changes for returned type of parsing function * fix(parser): 🐛 Change secondary key sorting to 'index' because there could be episodes with the same date but without episode number Update unit-test * chore: 🔧 Add JSON_DB_URL configuration parameter * feat: 🏷️ Add 'LepJsonEncoder' class for json dump operations * feat(parser): ✨ Add rough implementation of 'main' method with parsing actions (including writing JSON file) * test(parser): ✅ Add tests for writing and updating JSON database Add JSON (pretty) fixture with test database * style: 🎨 Fix imports by pre-commit Add json files to exclude types in '.pre-commit-config.yaml'
hotenov · Oct 20, 2021 · 9b025af · 9b025af
1 parent d1c44f6
commit 9b025af
Show file tree

Hide file tree

Showing 9 changed files with 10,389 additions and 168 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -35,7 +35,7 @@ repos:
         language: system
         types: [text]
         stages: [commit, push, manual]
-        exclude_types: [html]
+        exclude_types: [html, json]
       - id: flake8
         name: flake8
         entry: flake8
@@ -54,9 +54,9 @@ repos:
         language: system
         types: [text]
         stages: [commit, push, manual]
-        exclude_types: [html]
+        exclude_types: [html, json]
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v2.3.0
     hooks:
       - id: prettier
-        exclude_types: [html]
+        exclude_types: [html, json]
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -50,6 +50,7 @@ Pygments = "^2.9.0"
 requests-mock = "^1.9.3"
 flake8-black = "^0.2.3"
 flake8-import-order = "^0.18.1"
+rope = "^0.20.1"
 
 [tool.poetry.scripts]
 lep-downloader = "lep_downloader.__main__:main"
@@ -63,7 +64,7 @@ source = ["lep_downloader"]
 
 [tool.coverage.report]
 show_missing = true
-fail_under = 85
+fail_under = 100
 
 [tool.mypy]
 strict = true

diff --git a/src/lep_downloader/config.py b/src/lep_downloader/config.py
@@ -3,6 +3,8 @@
 
 ARCHIVE_URL = "https://hotenov.com"
 
+JSON_DB_URL = "https://hotenov.com/some_json.json"
+
 LOCAL_ARCHIVE_HTML = "2021-08-10_lep-archive-page-content-pretty.html"
 
 SHORT_LINKS_MAPPING_DICT = {
@@ -23,9 +25,4 @@
 
 EPISODE_LINK_RE = r"https?://((?P<short>wp\.me/p4IuUx-[\w-]+)|(teacherluke\.(co\.uk|wordpress\.com)/(?P<date>\d{4}/\d{2}/\d{2})/))"
 
-LINK_TEXTS_MAPPING = {
-    "https://teacherluke.co.uk/2018/04/18/522-learning-english-at-summer-school-in-the-uk-a-rambling-chat-with-raphael-miller/": "522. Learning English at Summer School in the UK (A Rambling Chat with Raphael Miller)",
-    "https://teacherluke.co.uk/2017/08/14/website-content-lukes-criminal-past-zep-episode-185/": "[Website content] Luke’s Criminal Past (ZEP Episode 185)",
-    "https://teacherluke.co.uk/2017/05/26/i-was-invited-onto-the-english-across-the-pond-podcast/": "[Website content] I was invited onto the “English Across The Pond” Podcast",
-    "https://teacherluke.co.uk/2016/03/20/i-was-invited-onto-craig-wealands-weekly-blab-and-we-talked-about-comedy-video/": "[VIDEO] I was invited onto Craig Wealand’s weekly Blab, and we talked about comedy",
-}
+INVALID_PATH_CHARS_RE = r"[<>:\"/\\\\|?*]"
diff --git a/src/lep_downloader/lep.py b/src/lep_downloader/lep.py
@@ -1 +1,53 @@
 """LEP module for general logic and classes."""
+import json
+import typing as t
+
+
+class LepEpisode(object):
+    """Plain data holder for one parsed LEP post; every field has a default."""
+
+    def __init__(
+        self,
+        episode: int = 0,
+        date: str = "2000-01-01T00:00:00+00:00",
+        url: str = "",
+        post_title: str = "",
+        post_type: str = "",
+        parsing_utc: str = "",
+        index: int = 0,
+        audios: t.Optional[t.List[t.List[str]]] = None,
+        admin_note: str = "",
+    ) -> None:
+        """Store the given field values on the instance unchanged (no validation).
+
+        Args:
+            episode (int): Episode number (default 0).
+            date (str): Post datetime (default 2000-01-01T00:00:00+00:00).
+            url (str): Final location of post URL.
+            post_title (str): Post title, extracted from tag <a> and safe for windows path.
+            post_type (str): Post type ("AUDIO", "TEXT", etc.).
+            parsing_utc (str): Parsing datetime in UTC timezone (with microseconds).
+            index (int): Parsing index: concatenation of URL date and increment (for several posts).
+            audios (list): List of links lists (for multi-part episodes); default None.
+            admin_note (str): Note for administrator and storing error message (for bad response).
+        """
+        self.episode = episode
+        self.date = date
+        self.url = url
+        self.post_title = post_title
+        self.post_type = post_type
+        self.audios = audios
+        self.parsing_utc = parsing_utc
+        self.index = index
+        self.admin_note = admin_note
+
+
+class LepJsonEncoder(json.JSONEncoder):
+    """JSONEncoder that serializes LepEpisode objects as their attribute dict."""
+
+    def default(self, obj: t.Any) -> t.Any:
+        """Return a LepEpisode's '__dict__'; defer other types to the base class."""
+        if isinstance(obj, LepEpisode):
+            return obj.__dict__
+        # Not a LepEpisode: let the base class 'default' method raise the TypeError
+        return json.JSONEncoder.default(self, obj)