Skip to content

Commit

Permalink
[wip] address linting errors in xml handlers
Browse files Browse the repository at this point in the history
  • Loading branch information
jonavellecuerdo committed Nov 21, 2023
1 parent 1cc0119 commit 618de0c
Show file tree
Hide file tree
Showing 3 changed files with 236 additions and 93 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ mypy:
pipenv run mypy .

ruff:
pipenv run ruff check .
pipenv run ruff check ./solenoid/elements

safety:
pipenv check
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ line-length = 90
[tool.mypy]
disallow_untyped_calls = true
disallow_untyped_defs = true
exclude = ["tests/"]
exclude = ["tests/", "userauth/"]

[tool.pytest.ini_options]
log_level = "INFO"
Expand Down
325 changes: 234 additions & 91 deletions solenoid/elements/xml_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
import xml.etree.ElementTree as ET

from typing import Generator

from django.utils import timezone

logger = logging.getLogger(__name__)
Expand All @@ -10,53 +12,223 @@
"atom": "http://www.w3.org/2005/Atom",
"api": "http://www.symplectic.co.uk/publications/api",
}
# Date the OA policy was enacted; pub_date_is_valid() rejects known publication
# dates that fall on or before this date.
OA_POLICY_ENACTED_DATE = dt.date(2009, 3, 18)


def extract_attribute(root: ET.Element, search_string: str, attribute: str) -> str:
    """Return an attribute of the first element matching a path, or "" if unavailable.

    Args:
        root (ET.Element): Element to search from.
        search_string (str): ElementPath expression; prefixes are resolved with NS.
        attribute (str): Name of the attribute to read from the matched element.

    Returns:
        str: The attribute value, or an empty string when no element matches or the
            matched element does not carry the attribute.
    """
    element = root.find(search_string, NS)
    # Compare against None explicitly: an Element without child elements is falsy,
    # so a plain truthiness test would discard matches on leaf elements.
    if element is None:
        return ""
    return element.get(attribute, "")


def extract_field(root: ET.Element, search_string: str) -> str | None:
    """Return the text of the first element matching a path.

    Args:
        root (ET.Element): Element to search from.
        search_string (str): ElementPath expression; prefixes are resolved with NS.

    Returns:
        str | None: The matched element's text (None when the element has no text),
            or an empty string when no element matches.
    """
    # NS must be passed so that prefixed paths (e.g. "api:field") can be resolved.
    element = root.find(search_string, NS)
    # Compare against None explicitly: an Element without child elements is falsy,
    # and text-bearing fields are typically leaf elements.
    if element is None:
        return ""
    return element.text


def _int_field(root: ET.Element, search_string: str) -> int | None:
    """Return the matched field's text as an int, or None if missing or non-numeric."""
    raw_value = extract_field(root, search_string)
    if not raw_value:
        return None
    try:
        return int(raw_value)
    except ValueError:
        return None


def get_pub_date(root: ET.Element) -> dt.date | None:
    """Build a publication date from the 'publication-date' field of a record.

    Args:
        root (ET.Element): Element containing the publication's fields.

    Returns:
        dt.date | None: The publication date. A missing or non-numeric month or day
            defaults to 1; if the resulting combination is not a real calendar date,
            January 1 of the publication year is used instead. None is returned when
            the year is missing, non-numeric, or outside the range dt.date supports.
    """
    date_path = ".//api:field[@name='publication-date']"

    year = _int_field(root, f"{date_path}//api:year")
    if year is None:
        return None

    month = _int_field(root, f"{date_path}//api:month")
    day = _int_field(root, f"{date_path}//api:day")

    try:
        return dt.date(year, 1 if month is None else month, 1 if day is None else day)
    except ValueError:
        # Month/day do not form a valid date (e.g. February 30); keep only the year.
        pass

    try:
        return dt.date(year, 1, 1)
    except ValueError:
        # The year itself is out of range, so treat the date as unknown.
        return None


def make_xml(username):
def pub_date_is_valid(pub_date: dt.date, author_data: dict) -> bool:
    """Check whether a publication date allows the article to be requested.

    An unknown publication date (None) is always accepted. A known date is accepted
    only when it is later than OA_POLICY_ENACTED_DATE and falls within the author's
    employment period (start and end dates inclusive). A known date that is on or
    before the policy date, or outside the employment period, is rejected.

    Args:
        pub_date (dt.date): Date of publication, or None if unknown.
        author_data (dict): Must provide ISO-formatted "Start Date" and "End Date"
            values describing the author's period of employment.

    Returns:
        bool: True if the date is acceptable, False otherwise.
    """
    if pub_date is None:
        return True

    employment_start = dt.date.fromisoformat(author_data["Start Date"])
    employment_end = dt.date.fromisoformat(author_data["End Date"])

    published_under_policy = pub_date > OA_POLICY_ENACTED_DATE
    return published_under_policy and employment_start <= pub_date <= employment_end


def pub_has_library_status(element: ET.Element) -> bool:
    """Report whether an article carries an api:library-status element.

    Args:
        element (ET.Element): Element representing an article.

    Returns:
        bool: True if a library-status element is present, False otherwise.
    """
    # Test for presence with "is not None": an Element's truth value reflects
    # whether it has children, not whether it was found.
    return element.find(".//api:library-status", NS) is not None


def pub_manual_entry_is_valid(element: ET.Element) -> bool:
    """Determine whether manually entered flags allow the article to be requested.

    The article passes this check only if none of the following boolean fields is
    marked as "true":
        1. c-do-not-request
        2. c-optout
        3. c-received
        4. c-requested

    A field that is absent from the element is treated the same as "false". If any
    of the fields is marked as "true", the article fails the check.

    Args:
        element (ET.Element): Element representing an article.

    Returns:
        bool: Flag indicating whether manually entered data for an article is valid
            (True) or invalid (False).
    """
    blocking_fields = ("c-do-not-request", "c-optout", "c-received", "c-requested")
    for field_name in blocking_fields:
        flag = element.find(f".//api:field[@name='{field_name}']/api:boolean", NS)
        # Compare against None explicitly: api:boolean is a leaf element, and an
        # Element without children is falsy, so a truthiness test would skip it.
        if flag is not None and flag.text == "true":
            return False
    return True


def pub_is_oa_exempt(element: ET.Element) -> bool:
    """Determine if an article is excluded from the request due to OA exceptions.

    An article *is* exempt when at least one OA policy exception applies to it and
    none of the exception types is "Waiver".

    An article *is not* exempt when no OA policy exception applies to it, or when
    "Waiver" is among the exception types.

    Args:
        element (ET.Element): Element representing an article.

    Returns:
        bool: Flag indicating whether article is exempt from OA policy
            (i.e., excluded from request).
    """
    # Test for presence with "is not None": an Element's truth value reflects
    # whether it has children, not whether it was found.
    if element.find(".//api:oa-policy-exception", NS) is None:
        return False

    exception_types = {
        e.text for e in element.findall(".//api:oa-policy-exception/api:type", NS)
    }
    return "Waiver" not in exception_types


def pub_type_is_valid(element: ET.Element) -> bool:
    """Check whether an article's publication type allows it to be requested.

    The type is read from the "type-id" attribute of the article's api:object element
    and is accepted when it is "3", "4", or "5" (the journal article, book chapter,
    and conference proceeding types).

    Args:
        element (ET.Element): Element representing an article.

    Returns:
        bool: True if the publication type is accepted, False otherwise.
    """
    accepted_type_ids = ("3", "4", "5")
    type_id = extract_attribute(
        element, search_string=".//api:object", attribute="type-id"
    )
    return type_id in accepted_type_ids


def include_pub_in_request(element: ET.Element, author_data: dict) -> bool:
    """Decide whether an article should be included in the request list.

    The article is included only if it passes every check below:
        1. its publication date is valid for the author (pub_date_is_valid);
        2. it has no library status;
        3. its publication type is valid;
        4. it is not exempt due to OA policy exceptions;
        5. if it has a manual record, none of the manual flags blocks the request;
        6. if it has a dspace record, its repository status is neither "Public" nor
           "Private" (in either case it has been deposited already).

    Args:
        element (ET.Element): Element representing an article.
        author_data (dict): Data about the author's start and end dates of employment.

    Returns:
        bool: True if the article should be requested, False otherwise.
    """
    if not pub_date_is_valid(get_pub_date(element), author_data):
        return False
    if pub_has_library_status(element):
        return False
    if not pub_type_is_valid(element):
        return False
    if pub_is_oa_exempt(element):
        return False

    # Presence checks use "is not None" because an Element's truth value reflects
    # whether it has children, not whether it was found.
    if element.find(".//api:record[@source-name='manual']", NS) is not None:
        if not pub_manual_entry_is_valid(element):
            return False

    if element.find(".//api:record[@source-name='dspace']", NS) is not None:
        status = element.find(".//api:field[@name='repository-status']/api:text", NS)
        if status is not None and status.text in ("Public", "Private"):
            return False

    # Every check passed; return True explicitly so callers never receive None.
    return True


def make_xml(username: str) -> ET.Element:
top = ET.Element("update-object")
top.set("xmlns", "http://www.symplectic.co.uk/publications/api")
oa_field = ET.SubElement(top, "oa")
Expand All @@ -79,7 +251,7 @@ def make_xml(username):
return top


def parse_author_pubs_xml(xml_gen: Generator, author_data: dict) -> list[dict]:
    """Collect the publications that should be requested from an author's feed.

    Takes an author-publications record feed from Symplectic Elements, checks each
    entry against the local rules in include_pub_in_request, and gathers the entries
    that pass.

    Args:
        xml_gen (Generator): Yields the feed one XML page at a time; each page is
            parsed with ET.fromstring.
        author_data (dict): Data about the author's start and end dates of employment.

    Returns:
        list[dict]: One {"id": ..., "title": ...} dict per publication to request.
            Either value is None when the corresponding element is missing.
    """
    results = []
    for page in xml_gen:
        root = ET.fromstring(page)
        for entry in root.findall("./atom:entry", NS):
            if not include_pub_in_request(entry, author_data):
                continue

            # Lookups are compared against None explicitly: an Element without
            # children (such as the title's api:text) is falsy even when found.
            pub_element = entry.find(".//api:object[@category='publication']", NS)
            pub_id = pub_element.get("id") if pub_element is not None else None

            title_element = entry.find(".//api:field[@name='title']/api:text", NS)
            title = title_element.text if title_element is not None else None

            results.append({"id": pub_id, "title": title})
    return results
Expand Down Expand Up @@ -211,3 +338,19 @@ def parse_paper_xml(paper_xml):
"Issue": extract_field(root, ".//api:field[@name='issue']/api:text"),
}
return PAPER_DATA


def test(pub_dates):
for date in pub_dates:
if not date:
pass
# Paper was published after OA policy enacted
elif date <= dt.date(2009, 3, 18):
print("Pub date <= OA POLICY ENACTED")
continue
# Paper was published while author was MIT faculty
elif date < dt.date(2010, 4, 5) or date > dt.date(2012, 3, 1):
print("Check")
continue

print("Hooray")

0 comments on commit 618de0c

Please sign in to comment.