[wip] address linting errors in xml handlers

MITLibraries · Nov 21, 2023 · 618de0c · 618de0c
1 parent 1cc0119
commit 618de0c
Show file tree

Hide file tree

Showing 3 changed files with 236 additions and 93 deletions.
diff --git a/Makefile b/Makefile
@@ -33,7 +33,7 @@ mypy:
 	pipenv run mypy .
 
 ruff:
-	pipenv run ruff check .
+	pipenv run ruff check ./solenoid/elements
 
 safety:
 	pipenv check

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ line-length = 90
 [tool.mypy]
 disallow_untyped_calls = true
 disallow_untyped_defs = true
-exclude = ["tests/"]
+exclude = ["tests/", "userauth/"]
 
 [tool.pytest.ini_options]
 log_level = "INFO"

diff --git a/solenoid/elements/xml_handlers.py b/solenoid/elements/xml_handlers.py
@@ -2,6 +2,8 @@
 import logging
 import xml.etree.ElementTree as ET
 
+from typing import Generator
+
 from django.utils import timezone
 
 logger = logging.getLogger(__name__)
@@ -10,53 +12,223 @@
     "atom": "http://www.w3.org/2005/Atom",
     "api": "http://www.symplectic.co.uk/publications/api",
 }
+OA_POLICY_ENACTED_DATE = dt.date(2009, 3, 18)
 
 
-def extract_attribute(root, search_string, attribute):
-    try:
-        value = root.find(search_string, NS).get(attribute)
-        if value is None:
-            value = ""
-    except AttributeError:
-        value = ""
-    return value
+def extract_attribute(root: ET.Element, search_string: str, attribute: str) -> str | None:
+    element = root.find(search_string, NS)
+    if element:
+        return element.get(attribute, "")
+    else:
+        return ""
 
 
-def extract_field(root, search_string):
-    try:
-        field = root.find(search_string, NS).text
-    except AttributeError:
+def extract_field(root: ET.Element, search_string: str) -> str | None:
+    element = root.find(search_string)
+    if element:
+        try:
+            field = element.text
+        except AttributeError:
+            field = ""
+    else:
         field = ""
     return field
 
 
-def get_pub_date(root):
-    try:
-        year = int(
-            extract_field(root, ".//api:field[@name='publication-date']" "//api:year")
-        )
-    except ValueError:
+def get_pub_date(root: ET.Element) -> str | None | dt.date:
+    year_value = extract_field(root, ".//api:field[@name='publication-date']//api:year")
+    if year_value:
+        try:
+            year = int(year_value)
+        except ValueError:
+            return None
+    else:
         return None
-    try:
-        month = int(
-            extract_field(root, ".//api:field[@name='publication-date']" "//api:month")
-        )
-    except ValueError:
+
+    month_value = extract_field(root, ".//api:field[@name='publication-date']//api:month")
+    if month_value:
+        try:
+            month = int(month_value)
+        except ValueError:
+            month = 1
+    else:
         month = 1
-    try:
-        day = int(
-            extract_field(root, ".//api:field[@name='publication-date']" "//api:day")
-        )
-    except ValueError:
+
+    day_value = extract_field(root, ".//api:field[@name='publication-date']//api:day")
+    if day_value:
+        try:
+            day = int(day_value)
+        except ValueError:
+            day = 1
+    else:
         day = 1
+
     try:
         pub_date = dt.date(year, month, day)
     except ValueError:
         pub_date = dt.date(year, 1, 1)
     return pub_date
 
 
-def make_xml(username):
+def pub_date_is_valid(pub_date: dt.date, author_data: dict) -> bool:
+    """Determine whether an article's publication date is valid for inclusion in request.
+
+    An article's publication date is considered valid for inclusion and the function will
+    proceed with other checks if it meets ANY (at least one) of the following criteria:
+
+        1. publication date is unknown (None);
+        2. publication date is *after* the date the OA policy was in effect AND
+            *during* the author's period of employment with MIT.
+
+    An article's publication date is considered invalid and will be excluded from the
+    request if it meets ALL of the following criteria:
+
+        1. publication date is known;
+        2. publication date is before (<=) the date the OA policy was in effect;
+        3. publication date is outside of author's employment with MIT
+
+    Args:
+        pub_date (dt.date): Date of publication.
+        author_data (dict): Data about the author's start and end dates of MIT employment.
+
+    Returns:
+        bool: Flag indicating whether date is valid (True) or invalid (False).
+    """
+    if pub_date is None:
+        return True
+    else:
+        author_start_date = dt.date.fromisoformat(author_data["Start Date"])
+        author_end_date = dt.date.fromisoformat(author_data["End Date"])
+        if pub_date <= OA_POLICY_ENACTED_DATE:
+            return False
+        elif pub_date < author_start_date or pub_date > author_end_date:
+            return False
+        else:
+            return True
+
+
+def pub_has_library_status(element: ET.Element) -> bool:
+    if element.find(".//api:library-status", NS):
+        return True
+    return False
+
+
+def pub_manual_entry_is_valid(element: ET.Element) -> bool:
+    """Determine if a manual entry for the article exists and is valid.
+
+    If a manual entry *doesn't* exist for the article, the function can proceed with
+    other checks to determine if the article should be included in request.
+
+    If a manual entry *does* exist for the article, the entry must have
+    ALL of the following fields marked as "false" before the function can proceed
+    with other checks:
+
+        1. c-do-not-request
+        2. c-optout
+        3. c-received
+        4. c-requested
+
+    If any of the fields are marked as "true", the function will
+
+    Args:
+        element (ET.Element): Element representing an article.
+
+    Returns:
+        bool: Flag indicating whether manually entered data for an article is valid
+            (True) or invalid (False).
+    """
+    if do_not_request := element.find(
+        ".//api:field[@name='c-do-not-request']/api:boolean", NS
+    ):
+        if do_not_request.text == "true":
+            return False
+
+    if optout := element.find(".//api:field[@name='c-optout']/api:boolean", NS):
+        if optout.text == "true":
+            return False
+
+    if received := element.find(".//api:field[@name='c-received']/api:boolean", NS):
+        if received.text == "true":
+            return False
+
+    if requested := element.find(".//api:field[@name='c-requested']/api:boolean", NS):
+        if received.text == "true":
+            return False
+    return True
+
+
+def pub_is_oa_exempt(element: ET.Element) -> bool:
+    """Determine if an article is exempt from inclusion in request due to OA excepions.
+
+    An article *is* exempt from inclusion if ALL of the following criteria are met:
+
+        1. At least one (1) OA policy exception, excluding "Waiver",
+            applies to the article.
+
+    An article *is not* exempt from inclusion if ANY of the following criteria are met:
+        1. Zero (0) OA policy exceptions apply to the article.
+        2. If any OA policy exceptions apply to the article, "Waiver" is included.
+
+    Args:
+        element (ET.Element): Element representing an article.
+
+    Returns:
+        bool: Flag indicating whether article is exempt from OA policy
+            (i.e., excluded from request).
+    """
+    if element.find(".//api:oa-policy-exception", NS):
+        exceptions = [
+            e.text for e in element.findall(".//api:oa-policy-exception/api:type", NS)
+        ]
+        if "Waiver" in exceptions:
+            return False
+        return True
+    return False
+
+
+def pub_type_is_valid(element: ET.Element) -> bool:
+    """Determine whether an article's publication type is valid for inclusion in request.
+
+    An article's publication date is considered valid for inclusion and will
+    proceed with other checks if the type is either a journal article, book chapter, or
+    conference proceeding.
+    """
+    pub_type = extract_attribute(
+        element, search_string=".//api:object", attribute="type-id"
+    )
+    if pub_type in ("3", "4", "5"):
+        return True
+    return False
+
+
+def include_pub_in_request(element: ET.Element, author_data: dict) -> bool:
+    # Filter for papers to be requested based on various criteria
+    pub_date = get_pub_date(element)
+    if not pub_date_is_valid(pub_date, author_data):
+        return False
+    if pub_has_library_status(element):
+        return False
+    if not pub_type_is_valid(element):
+        return False
+    if pub_is_oa_exempt(element):
+        return False
+
+    if element.find(".//api:record[@source-name='manual']", NS):
+        if not pub_manual_entry_is_valid(element):
+            return False
+
+    # # If paper has a dspace record in Elements, status is not 'Public'
+    # # or 'Private' (in either case it has been deposited and should not
+    # # be requested)
+    # if element.find(".//api:record[@source-name='dspace']", NS):
+    #     status = extract_field(
+    #         element, ".//api:field[@name=" "'repository-status']/api:text"
+    #     )
+    #     if status == "Public" or status == "Private":
+    #         continue
+
+
+def make_xml(username: str) -> ET.Element:
     top = ET.Element("update-object")
     top.set("xmlns", "http://www.symplectic.co.uk/publications/api")
     oa_field = ET.SubElement(top, "oa")
@@ -79,7 +251,7 @@ def make_xml(username):
     return top
 
 
-def parse_author_pubs_xml(xml_gen, author_data):
+def parse_author_pubs_xml(xml_gen: Generator, author_data: dict) -> list[dict]:
     """Takes a an author-publications record feed from Symplectic
     Elements, parses each record according to local rules for which
     publications should be requested based on certain metadata fields, and
@@ -89,67 +261,22 @@ def parse_author_pubs_xml(xml_gen, author_data):
     RESULTS = []
     for page in xml_gen:
         root = ET.fromstring(page)
-        for entry in root.findall("./atom:entry", NS):
-            pub_id = entry.find(".//api:object[@category='publication']", NS).get("id")
-            title = entry.find(".//api:field[@name='title']/api:text", NS).text
-            # Filter for papers to be requested based on various criteria
-            pub_date = get_pub_date(entry)
-            if not pub_date:
-                pass
-            # Paper was published after OA policy enacted
-            elif pub_date <= dt.date(2009, 3, 18):
-                continue
-            # Paper was published while author was MIT faculty
-            elif pub_date < dt.date.fromisoformat(
-                author_data["Start Date"]
-            ) or pub_date > dt.date.fromisoformat(author_data["End Date"]):
-                continue
-            # Paper does not have a library status
-            if entry.find(".//api:library-status", NS):
-                continue
-            # Publication type is either a journal article, book chapter, or
-            # conference proceeding
-            pub_type = extract_attribute(entry, ".//api:object", "type-id")
-            if pub_type not in ("3", "4", "5"):
-                continue
-            # Paper does not have any OA policy exceptions, except for "Waiver"
-            # which we do request
-            if entry.find(".//api:oa-policy-exception", NS):
-                exceptions = [
-                    e.text
-                    for e in entry.findall(
-                        ".//api:oa-policy-" "exception/" "api:type", NS
-                    )
-                ]
-                if "Waiver" not in exceptions:
-                    continue
-            # If paper has a manual entry record in Elements, none of the
-            # following fields are true
-            if entry.find(".//api:record[@source-name='manual']", NS):
-                if (
-                    entry.find(
-                        ".//api:field[@name='c-do-not-request']" "/api:boolean", NS
-                    ).text
-                    == "true"
-                    or entry.find(".//api:field[@name='c-optout']/api:boolean", NS).text
-                    == "true"
-                    or entry.find(".//api:field[@name='c-received']/api:boolean", NS).text
-                    == "true"
-                    or entry.find(
-                        ".//api:field[@name='c-requested']/api:boolean", NS
-                    ).text
-                    == "true"
-                ):
-                    continue
-            # If paper has a dspace record in Elements, status is not 'Public'
-            # or 'Private' (in either case it has been deposited and should not
-            # be requested)
-            if entry.find(".//api:record[@source-name='dspace']", NS):
-                status = extract_field(
-                    entry, ".//api:field[@name=" "'repository-status']/api:text"
-                )
-                if status == "Public" or status == "Private":
-                    continue
+        for element in root.findall("./atom:entry", NS):
+            pub_element = element.find(".//api:object[@category='publication']", NS)
+            if pub_element:
+                pub_id = pub_element.get("id")
+            else:
+                pub_id = None
+
+            title_element = element.find(".//api:field[@name='title']/api:text", NS)
+            if title_element:
+                title = title_element.text
+
+            if include_pub_in_request(element, author_data):
+                RESULTS.append({"id": pub_id, "title": title})
+            else:
+                print("Bloop!")
+
             # If paper has passed all the checks above, add it to request list
             RESULTS.append({"id": pub_id, "title": title})
     return RESULTS
@@ -211,3 +338,19 @@ def parse_paper_xml(paper_xml):
         "Issue": extract_field(root, ".//api:field[@name='issue']/api:text"),
     }
     return PAPER_DATA
+
+
+def test(pub_dates):
+    for date in pub_dates:
+        if not date:
+            pass
+        # Paper was published after OA policy enacted
+        elif date <= dt.date(2009, 3, 18):
+            print("Pub date <= OA POLICY ENACTED")
+            continue
+            # Paper was published while author was MIT faculty
+        elif date < dt.date(2010, 4, 5) or date > dt.date(2012, 3, 1):
+            print("Check")
+            continue
+
+        print("Hooray")