Update project formatting
davidmezzetti committed Dec 22, 2024
1 parent 3008a02 commit 6706e3c
Showing 16 changed files with 38 additions and 144 deletions.
3 changes: 0 additions & 3 deletions .pylintrc
@@ -15,6 +15,3 @@ min-public-methods=0

[FORMAT]
max-line-length=150

[MESSAGES CONTROL]
disable=R0201
29 changes: 5 additions & 24 deletions src/python/paperetl/cord19/entry.py
@@ -44,11 +44,7 @@ def download(maxdate):

# Read list of dates from AI2 CORD-19 page
changelog = requests.get(f"{URL}/latest/changelog")
dates = [
line
for line in changelog.text.splitlines()
if re.match(r"\d{4}\-\d{2}\-\d{2}", line)
]
dates = [line for line in changelog.text.splitlines() if re.match(r"\d{4}\-\d{2}\-\d{2}", line)]

# Sort dates
dates = sorted(dates)
@@ -66,11 +62,7 @@ def download(maxdate):
# Current date
current = datetime.strptime(date, "%Y-%m-%d")

if (
date == dates[-1]
or current.day == 1
or (last and current.month != last.month)
):
if date == dates[-1] or current.day == 1 or (last and current.month != last.month):
url = f"{URL}/{date}/metadata.csv"
path = os.path.join(DIRECTORY, f"{date}.csv")
print(f"Retrieving {url} to {path}")
@@ -100,24 +92,15 @@ def run(output=None, maxdate=None):
Entry.download(maxdate)

# Get sorted list of metadata csv files
files = sorted(
[
f
for f in os.listdir(DIRECTORY)
if os.path.isfile(os.path.join(DIRECTORY, f))
and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)
]
)
files = sorted([f for f in os.listdir(DIRECTORY) if os.path.isfile(os.path.join(DIRECTORY, f)) and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)])

uids = {}

# Process each file, first time id is seen is considered entry date
for metadata in files:
# Parse date from file name
date = os.path.splitext(metadata)[0]
with open(
os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# Get hash value
sha = Execute.getHash(row)
@@ -131,9 +114,7 @@ def run(output=None, maxdate=None):
os.makedirs(output, exist_ok=True)

# Output file
output = (
os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"
)
output = os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"

# Build DataFrame
df = pd.DataFrame(uids.values(), columns=["cord_uid", "sha", "date"])
18 changes: 4 additions & 14 deletions src/python/paperetl/cord19/execute.py
@@ -140,9 +140,7 @@ def stream(indir, dates):
# Filter out duplicate ids
ids, hashes = set(), set()

with open(
os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# cord uid
uid = row["cord_uid"]
@@ -182,11 +180,7 @@ def process(params):
sections = Section.parse(row, indir)

# Search recent documents for COVID-19 keywords
tags = (
Execute.getTags(sections)
if not date or date >= datetime(2019, 7, 1)
else None
)
tags = Execute.getTags(sections) if not date or date >= datetime(2019, 7, 1) else None

# Article metadata - id, source, published, publication, authors, affiliations, affiliation, title,
# tags, reference
@@ -232,9 +226,7 @@ def entryDates(indir, entryfile):

# Reduce down to entries only in metadata
dates = {}
with open(
os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# Lookup hash
sha = Execute.getHash(row)
@@ -270,9 +262,7 @@ def run(indir, url, entryfile=None, replace=False):

# Create process pool
with Pool(os.cpu_count()) as pool:
for article in pool.imap(
Execute.process, Execute.stream(indir, dates), 100
):
for article in pool.imap(Execute.process, Execute.stream(indir, dates), 100):
# Get unique id
uid = article.uid()

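A side note on the collapsed `pool.imap(...)` call above: `imap` streams results back in input order as workers finish, and the third argument (100) is the chunksize batched to each worker. A minimal sketch of the same pattern, with hypothetical `work`/`items` names standing in for `Execute.process` and `Execute.stream`:

    import os
    from multiprocessing import Pool

    def work(n):
        # Stand-in for Execute.process: transform one streamed item
        return n * n

    if __name__ == "__main__":
        items = range(1000)  # stand-in for Execute.stream(indir, dates)
        with Pool(os.cpu_count()) as pool:
            # chunksize=100 batches items per worker dispatch, as in execute.py
            for result in pool.imap(work, items, 100):
                pass  # per-article handling would go here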
10 changes: 2 additions & 8 deletions src/python/paperetl/cord19/section.py
@@ -57,11 +57,7 @@ def parse(row, directory):
# Extract text from body
for section in data["body_text"]:
# Section name and text
name = (
section["section"].upper()
if len(section["section"].strip()) > 0
else None
)
name = section["section"].upper() if len(section["section"].strip()) > 0 else None
text = section["text"].replace("\n", " ")

# Clean and transform text
@@ -73,9 +69,7 @@
# Extract text from tables
for name, entry in data["ref_entries"].items():
if "html" in entry and entry["html"]:
sections.extend(
[(name, x) for x in Table.parse(entry["html"])]
)
sections.extend([(name, x) for x in Table.parse(entry["html"])])

# pylint: disable=W0703
except Exception as ex:
12 changes: 2 additions & 10 deletions src/python/paperetl/file/arx.py
@@ -49,10 +49,7 @@ def parse(stream, source):
authors, affiliations, affiliation = ARX.authors(entry.find_all("author"))

# Get tags
tags = "; ".join(
["ARX"]
+ [category.get("term") for category in entry.find_all("category")]
)
tags = "; ".join(["ARX"] + [category.get("term") for category in entry.find_all("category")])

# Transform section text
sections = ARX.sections(title, ARX.get(entry, "summary"))
@@ -128,12 +125,7 @@ def authors(elements):
authors.append(", ".join(name.rsplit(maxsplit=1)[::-1]))

# Add affiliations
affiliations.extend(
[
ARX.clean(affiliation.text)
for affiliation in author.find_all("arxiv:affiliation")
]
)
affiliations.extend([ARX.clean(affiliation.text) for affiliation in author.find_all("arxiv:affiliation")])

return (
"; ".join(authors),
4 changes: 1 addition & 3 deletions src/python/paperetl/file/csvf.py
@@ -71,9 +71,7 @@ def metadata(row, source):
elif field == "entry":
# Parse date field if found, otherwise use current date
value = row.get(field)
value = parser.parse(
value if value else datetime.datetime.now().strftime("%Y-%m-%d")
)
value = parser.parse(value if value else datetime.datetime.now().strftime("%Y-%m-%d"))
else:
value = row.get(field)

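The collapsed `parser.parse(...)` line keeps the same fallback behavior: parse the `entry` value when present, otherwise use today's date. A small standalone sketch of that behavior with `dateutil` (not the paperetl code path itself):

    import datetime
    from dateutil import parser

    def entry_date(value):
        # Mirrors csvf.py: parse the provided value, else fall back to the current date
        return parser.parse(value if value else datetime.datetime.now().strftime("%Y-%m-%d"))

    print(entry_date("2020-10-10"))  # 2020-10-10 00:00:00
    print(entry_date(None))          # today's date at midnight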
4 changes: 1 addition & 3 deletions src/python/paperetl/file/pdf.py
@@ -46,9 +46,7 @@ def convert(stream):
"""

# Call GROBID API
response = requests.post(
"http://localhost:8070/api/processFulltextDocument", files={"input": stream}
)
response = requests.post("http://localhost:8070/api/processFulltextDocument", files={"input": stream})

# Validate request was successful
if not response.ok:
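The one-line `requests.post` still sends the PDF stream to a GROBID instance and expects TEI XML back on success. A hedged usage sketch, assuming GROBID is running locally on port 8070 as in the code ("paper.pdf" is a hypothetical input file):

    import requests

    # Post a PDF to a local GROBID service; it returns TEI XML on success
    with open("paper.pdf", "rb") as stream:
        response = requests.post(
            "http://localhost:8070/api/processFulltextDocument", files={"input": stream}
        )

    if response.ok:
        tei = response.text  # TEI XML, parsed downstream by paperetl's TEI handler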
33 changes: 6 additions & 27 deletions src/python/paperetl/file/pmb.py
@@ -257,11 +257,7 @@ def mesh(citation):
list of MeSH codes
"""

return [
descriptor.attrib["UI"]
for descriptor in citation.findall("MeshHeadingList//DescriptorName")
if descriptor.attrib["UI"]
]
return [descriptor.attrib["UI"] for descriptor in citation.findall("MeshHeadingList//DescriptorName") if descriptor.attrib["UI"]]

@staticmethod
def sections(article, title):
@@ -344,12 +340,7 @@ def formatted(element):
# - cleaned inner text has data
# - no section text queued
# - element tag is a <b> or matches a defined section background category name
if (
not tag
and ctext
and not texts
and (x.tag.lower() == "b" or PMB.background(ctext))
):
if not tag and ctext and not texts and (x.tag.lower() == "b" or PMB.background(ctext)):
tag = x.tag

# New section if one of following:
@@ -359,14 +350,10 @@
# - no section text
# - last section text element ends in period
# pylint: disable=R0916
if ((x.tag == tag and ctext) or (not tag and texts)) and (
not texts or texts[-1].strip().endswith(".")
):
if ((x.tag == tag and ctext) or (not tag and texts)) and (not texts or texts[-1].strip().endswith(".")):
# Save previous section
if texts:
sections.extend(
[(name, t) for t in sent_tokenize("".join(texts).strip())]
)
sections.extend([(name, t) for t in sent_tokenize("".join(texts).strip())])

# Reset section name/texts
name = ctext if tag else "ABSTRACT"
@@ -401,11 +388,7 @@ def parsed(elements):

# Parsed abstract
for element in elements:
name = (
PMB.section(element.attrib["Label"])
if "Label" in element.attrib
else None
)
name = PMB.section(element.attrib["Label"]) if "Label" in element.attrib else None
name = name if name else "ABSTRACT"

if element.text:
@@ -429,11 +412,7 @@ def background(name):
True if the section name is a background category
"""

return [
x
for x in ["aim", "introduction", "background", "purpose", "objective"]
if x in name.lower()
]
return [x for x in ["aim", "introduction", "background", "purpose", "objective"] if x in name.lower()]

@staticmethod
def section(name):
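`background` still returns a list rather than a boolean: it is truthy whenever any background keyword occurs in the section name, which is how `formatted` uses it above. A quick standalone check:

    def background(name):
        # Non-empty (truthy) when the section name looks like a background/introduction section
        return [x for x in ["aim", "introduction", "background", "purpose", "objective"] if x in name.lower()]

    print(bool(background("INTRODUCTION")))  # True
    print(bool(background("METHODS")))       # False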
20 changes: 4 additions & 16 deletions src/python/paperetl/file/tei.py
@@ -55,9 +55,7 @@ def parse(stream, source):
sections = TEI.text(soup, title)

# Derive uid
uid = hashlib.sha1(
title.encode("utf-8") if title else reference.encode("utf-8")
).hexdigest()
uid = hashlib.sha1(title.encode("utf-8") if title else reference.encode("utf-8")).hexdigest()

# Default title to source if empty
title = title if title else source
@@ -95,11 +93,7 @@ def date(published):
# Parse publication date
# pylint: disable=W0702
try:
published = (
parser.parse(published["when"])
if published and "when" in published.attrs
else None
)
published = parser.parse(published["when"]) if published and "when" in published.attrs else None
except:
published = None

@@ -161,11 +155,7 @@ def metadata(soup):
authors, affiliations, affiliation = TEI.authors(source)

struct = soup.find("biblstruct")
reference = (
"https://doi.org/" + struct.find("idno").text
if struct and struct.find("idno")
else None
)
reference = "https://doi.org/" + struct.find("idno").text if struct and struct.find("idno") else None
else:
published, publication, authors, affiliations, affiliation, reference = (
None,
@@ -230,9 +220,7 @@ def text(soup, title):
else:
name = None

text = " ".join(
[str(e.text) if hasattr(e, "text") else str(e) for e in children]
)
text = " ".join([str(e.text) if hasattr(e, "text") else str(e) for e in children])
text = text.replace("\n", " ")

# Transform and clean text
6 changes: 1 addition & 5 deletions src/python/paperetl/filesystem.py
@@ -30,11 +30,7 @@ def __init__(self, outdir):

def save(self, article):
output = article.uid() + f".{self.extension()}"
output = (
f"{os.path.splitext(article.source())[0]}-{output}"
if article.source()
else output
)
output = f"{os.path.splitext(article.source())[0]}-{output}" if article.source() else output

with open(os.path.join(self.outdir, output), "w", encoding="utf-8") as output:
self.write(output, article.build())
8 changes: 2 additions & 6 deletions src/python/paperetl/sqlite.py
@@ -152,9 +152,7 @@ def savearticle(self, article):
self.insert(SQLite.ARTICLES, "articles", article.metadata)
except sqlite3.IntegrityError:
# Duplicate detected get entry date to determine action
entry = parser.parse(
self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0]
)
entry = parser.parse(self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0])

# Keep existing article if existing entry date is same or newer
if article.entry() <= entry:
@@ -219,9 +217,7 @@ def insert(self, table, name, row):

# Build insert prepared statement
columns = [name for name, _ in table.items()]
insert = SQLite.INSERT_ROW.format(
table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2]
)
insert = SQLite.INSERT_ROW.format(table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2])

# Execute insert statement
self.cur.execute(insert, self.values(table, row, columns))
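In the collapsed `insert` statement above, `("?, " * len(columns))[:-2]` yields one `?` placeholder per column with the trailing ", " trimmed. A small illustration under an assumed template (the real `SQLite.INSERT_ROW` string is not shown in this hunk):

    # Hypothetical template mirroring the apparent shape of SQLite.INSERT_ROW
    INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"

    columns = ["id", "title", "entry"]
    insert = INSERT_ROW.format(
        table="articles", columns=", ".join(columns), values=("?, " * len(columns))[:-2]
    )
    print(insert)  # INSERT INTO articles (id, title, entry) VALUES (?, ?, ?)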
5 changes: 1 addition & 4 deletions src/python/paperetl/table.py
@@ -51,10 +51,7 @@ def extract(table):

for row in rows:
# Build concatenated header value string
values = [
f"{headers[x] if x < len(headers) else ''} {column.text}"
for x, column in enumerate(row)
]
values = [f"{headers[x] if x < len(headers) else ''} {column.text}" for x, column in enumerate(row)]

# Create single row string
value = " ".join(values)
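The collapsed comprehension still pairs each cell with its column header, falling back to an empty string when a row has more cells than headers. A plain-string sketch of the pairing (the real code reads `column.text` from parsed HTML cells):

    headers = ["Name", "Value"]
    row = ["alpha", "1", "extra"]

    # One "<header> <cell>" string per column, blank header past the end
    values = [f"{headers[x] if x < len(headers) else ''} {column}" for x, column in enumerate(row)]
    print(values)  # ['Name alpha', 'Value 1', ' extra']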
12 changes: 3 additions & 9 deletions test/python/testcord19.py
@@ -76,12 +76,8 @@ def testDate(self):
Test article publish dates
"""

self.assertEqual(
Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1)
)
self.assertEqual(
Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10)
)
self.assertEqual(Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1))
self.assertEqual(Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10))
self.assertEqual(Execute.getDate({"publish_time": "bad date"}), None)
self.assertEqual(Execute.getDate({"publish_time": None}), None)

@@ -113,9 +109,7 @@ def testHash(self):
"62520f1c4f656dcb5fe565a4c2bf4ce1f7d435ef",
)
self.assertEqual(
Execute.getHash(
{"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}
),
Execute.getHash({"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}),
"47ed55bfa014cd59f58896c132c36bb0a218d11d",
)
