Update project formatting
davidmezzetti committed Dec 22, 2024
1 parent 3008a02 commit 6706e3c
Showing 16 changed files with 38 additions and 144 deletions.
3 changes: 0 additions & 3 deletions .pylintrc
@@ -15,6 +15,3 @@ min-public-methods=0

[FORMAT]
max-line-length=150

[MESSAGES CONTROL]
disable=R0201
29 changes: 5 additions & 24 deletions src/python/paperetl/cord19/entry.py
@@ -44,11 +44,7 @@ def download(maxdate):

# Read list of dates from AI2 CORD-19 page
changelog = requests.get(f"{URL}/latest/changelog")
dates = [
line
for line in changelog.text.splitlines()
if re.match(r"\d{4}\-\d{2}\-\d{2}", line)
]
dates = [line for line in changelog.text.splitlines() if re.match(r"\d{4}\-\d{2}\-\d{2}", line)]

# Sort dates
dates = sorted(dates)
@@ -66,11 +62,7 @@ def download(maxdate):
# Current date
current = datetime.strptime(date, "%Y-%m-%d")

if (
date == dates[-1]
or current.day == 1
or (last and current.month != last.month)
):
if date == dates[-1] or current.day == 1 or (last and current.month != last.month):
url = f"{URL}/{date}/metadata.csv"
path = os.path.join(DIRECTORY, f"{date}.csv")
print(f"Retrieving {url} to {path}")
@@ -100,24 +92,15 @@ def run(output=None, maxdate=None):
Entry.download(maxdate)

# Get sorted list of metadata csv files
files = sorted(
[
f
for f in os.listdir(DIRECTORY)
if os.path.isfile(os.path.join(DIRECTORY, f))
and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)
]
)
files = sorted([f for f in os.listdir(DIRECTORY) if os.path.isfile(os.path.join(DIRECTORY, f)) and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)])

uids = {}

# Process each file, first time id is seen is considered entry date
for metadata in files:
# Parse date from file name
date = os.path.splitext(metadata)[0]
with open(
os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# Get hash value
sha = Execute.getHash(row)
@@ -131,9 +114,7 @@ def run(output=None, maxdate=None):
os.makedirs(output, exist_ok=True)

# Output file
output = (
os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"
)
output = os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"

# Build DataFrame
df = pd.DataFrame(uids.values(), columns=["cord_uid", "sha", "date"])
18 changes: 4 additions & 14 deletions src/python/paperetl/cord19/execute.py
@@ -140,9 +140,7 @@ def stream(indir, dates):
# Filter out duplicate ids
ids, hashes = set(), set()

with open(
os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# cord uid
uid = row["cord_uid"]
@@ -182,11 +180,7 @@ def process(params):
sections = Section.parse(row, indir)

# Search recent documents for COVID-19 keywords
tags = (
Execute.getTags(sections)
if not date or date >= datetime(2019, 7, 1)
else None
)
tags = Execute.getTags(sections) if not date or date >= datetime(2019, 7, 1) else None

# Article metadata - id, source, published, publication, authors, affiliations, affiliation, title,
# tags, reference
@@ -232,9 +226,7 @@ def entryDates(indir, entryfile):

# Reduce down to entries only in metadata
dates = {}
with open(
os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
) as csvfile:
with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
for row in csv.DictReader(csvfile):
# Lookup hash
sha = Execute.getHash(row)
@@ -270,9 +262,7 @@ def run(indir, url, entryfile=None, replace=False):

# Create process pool
with Pool(os.cpu_count()) as pool:
for article in pool.imap(
Execute.process, Execute.stream(indir, dates), 100
):
for article in pool.imap(Execute.process, Execute.stream(indir, dates), 100):
# Get unique id
uid = article.uid()

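A side note on the collapsed `pool.imap(...)` call above: `imap` streams results back in input order as workers finish, and the third argument (100) is the chunksize batched to each worker. A minimal sketch of the same pattern, with hypothetical `work`/`items` names standing in for `Execute.process` and `Execute.stream`:

    import os
    from multiprocessing import Pool

    def work(n):
        # Stand-in for Execute.process: transform one streamed item
        return n * n

    if __name__ == "__main__":
        items = range(1000)  # stand-in for Execute.stream(indir, dates)
        with Pool(os.cpu_count()) as pool:
            # chunksize=100 batches items per worker dispatch, as in execute.py
            for result in pool.imap(work, items, 100):
                pass  # per-article handling would go here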
10 changes: 2 additions & 8 deletions src/python/paperetl/cord19/section.py
@@ -57,11 +57,7 @@ def parse(row, directory):
# Extract text from body
for section in data["body_text"]:
# Section name and text
name = (
section["section"].upper()
if len(section["section"].strip()) > 0
else None
)
name = section["section"].upper() if len(section["section"].strip()) > 0 else None
text = section["text"].replace("\n", " ")

# Clean and transform text
@@ -73,9 +69,7 @@
# Extract text from tables
for name, entry in data["ref_entries"].items():
if "html" in entry and entry["html"]:
sections.extend(
[(name, x) for x in Table.parse(entry["html"])]
)
sections.extend([(name, x) for x in Table.parse(entry["html"])])

# pylint: disable=W0703
except Exception as ex:
12 changes: 2 additions & 10 deletions src/python/paperetl/file/arx.py
@@ -49,10 +49,7 @@ def parse(stream, source):
authors, affiliations, affiliation = ARX.authors(entry.find_all("author"))

# Get tags
tags = "; ".join(
["ARX"]
+ [category.get("term") for category in entry.find_all("category")]
)
tags = "; ".join(["ARX"] + [category.get("term") for category in entry.find_all("category")])

# Transform section text
sections = ARX.sections(title, ARX.get(entry, "summary"))
@@ -128,12 +125,7 @@ def authors(elements):
authors.append(", ".join(name.rsplit(maxsplit=1)[::-1]))

# Add affiliations
affiliations.extend(
[
ARX.clean(affiliation.text)
for affiliation in author.find_all("arxiv:affiliation")
]
)
affiliations.extend([ARX.clean(affiliation.text) for affiliation in author.find_all("arxiv:affiliation")])

return (
"; ".join(authors),
4 changes: 1 addition & 3 deletions src/python/paperetl/file/csvf.py
@@ -71,9 +71,7 @@ def metadata(row, source):
elif field == "entry":
# Parse date field if found, otherwise use current date
value = row.get(field)
value = parser.parse(
value if value else datetime.datetime.now().strftime("%Y-%m-%d")
)
value = parser.parse(value if value else datetime.datetime.now().strftime("%Y-%m-%d"))
else:
value = row.get(field)

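The collapsed `parser.parse(...)` line keeps the same fallback behavior: parse the `entry` value when present, otherwise use today's date. A small standalone sketch of that behavior with `dateutil` (not the paperetl code path itself):

    import datetime
    from dateutil import parser

    def entry_date(value):
        # Mirrors csvf.py: parse the provided value, else fall back to the current date
        return parser.parse(value if value else datetime.datetime.now().strftime("%Y-%m-%d"))

    print(entry_date("2020-10-10"))  # 2020-10-10 00:00:00
    print(entry_date(None))          # today's date at midnight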
4 changes: 1 addition & 3 deletions src/python/paperetl/file/pdf.py
@@ -46,9 +46,7 @@ def convert(stream):
"""

# Call GROBID API
response = requests.post(
"http://localhost:8070/api/processFulltextDocument", files={"input": stream}
)
response = requests.post("http://localhost:8070/api/processFulltextDocument", files={"input": stream})

# Validate request was successful
if not response.ok:
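The one-line `requests.post` still sends the PDF stream to a GROBID instance and expects TEI XML back on success. A hedged usage sketch, assuming GROBID is running locally on port 8070 as in the code ("paper.pdf" is a hypothetical input file):

    import requests

    # Post a PDF to a local GROBID service; it returns TEI XML on success
    with open("paper.pdf", "rb") as stream:
        response = requests.post(
            "http://localhost:8070/api/processFulltextDocument", files={"input": stream}
        )

    if response.ok:
        tei = response.text  # TEI XML, parsed downstream by paperetl's TEI handler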
33 changes: 6 additions & 27 deletions src/python/paperetl/file/pmb.py
@@ -257,11 +257,7 @@ def mesh(citation):
list of MeSH codes
"""

return [
descriptor.attrib["UI"]
for descriptor in citation.findall("MeshHeadingList//DescriptorName")
if descriptor.attrib["UI"]
]
return [descriptor.attrib["UI"] for descriptor in citation.findall("MeshHeadingList//DescriptorName") if descriptor.attrib["UI"]]

@staticmethod
def sections(article, title):
@@ -344,12 +340,7 @@ def formatted(element):
# - cleaned inner text has data
# - no section text queued
# - element tag is a <b> or matches a defined section background category name
if (
not tag
and ctext
and not texts
and (x.tag.lower() == "b" or PMB.background(ctext))
):
if not tag and ctext and not texts and (x.tag.lower() == "b" or PMB.background(ctext)):
tag = x.tag

# New section if one of following:
@@ -359,14 +350,10 @@
# - no section text
# - last section text element ends in period
# pylint: disable=R0916
if ((x.tag == tag and ctext) or (not tag and texts)) and (
not texts or texts[-1].strip().endswith(".")
):
if ((x.tag == tag and ctext) or (not tag and texts)) and (not texts or texts[-1].strip().endswith(".")):
# Save previous section
if texts:
sections.extend(
[(name, t) for t in sent_tokenize("".join(texts).strip())]
)
sections.extend([(name, t) for t in sent_tokenize("".join(texts).strip())])

# Reset section name/texts
name = ctext if tag else "ABSTRACT"
@@ -401,11 +388,7 @@ def parsed(elements):

# Parsed abstract
for element in elements:
name = (
PMB.section(element.attrib["Label"])
if "Label" in element.attrib
else None
)
name = PMB.section(element.attrib["Label"]) if "Label" in element.attrib else None
name = name if name else "ABSTRACT"

if element.text:
@@ -429,11 +412,7 @@ def background(name):
True if the section name is a background category
"""

return [
x
for x in ["aim", "introduction", "background", "purpose", "objective"]
if x in name.lower()
]
return [x for x in ["aim", "introduction", "background", "purpose", "objective"] if x in name.lower()]

@staticmethod
def section(name):
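`background` still returns a list rather than a boolean: it is truthy whenever any background keyword occurs in the section name, which is how `formatted` uses it above. A quick standalone check:

    def background(name):
        # Non-empty (truthy) when the section name looks like a background/introduction section
        return [x for x in ["aim", "introduction", "background", "purpose", "objective"] if x in name.lower()]

    print(bool(background("INTRODUCTION")))  # True
    print(bool(background("METHODS")))       # False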
20 changes: 4 additions & 16 deletions src/python/paperetl/file/tei.py
@@ -55,9 +55,7 @@ def parse(stream, source):
sections = TEI.text(soup, title)

# Derive uid
uid = hashlib.sha1(
title.encode("utf-8") if title else reference.encode("utf-8")
).hexdigest()
uid = hashlib.sha1(title.encode("utf-8") if title else reference.encode("utf-8")).hexdigest()

# Default title to source if empty
title = title if title else source
@@ -95,11 +93,7 @@ def date(published):
# Parse publication date
# pylint: disable=W0702
try:
published = (
parser.parse(published["when"])
if published and "when" in published.attrs
else None
)
published = parser.parse(published["when"]) if published and "when" in published.attrs else None
except:
published = None

@@ -161,11 +155,7 @@ def metadata(soup):
authors, affiliations, affiliation = TEI.authors(source)

struct = soup.find("biblstruct")
reference = (
"https://doi.org/" + struct.find("idno").text
if struct and struct.find("idno")
else None
)
reference = "https://doi.org/" + struct.find("idno").text if struct and struct.find("idno") else None
else:
published, publication, authors, affiliations, affiliation, reference = (
None,
@@ -230,9 +220,7 @@ def text(soup, title):
else:
name = None

text = " ".join(
[str(e.text) if hasattr(e, "text") else str(e) for e in children]
)
text = " ".join([str(e.text) if hasattr(e, "text") else str(e) for e in children])
text = text.replace("\n", " ")

# Transform and clean text
6 changes: 1 addition & 5 deletions src/python/paperetl/filesystem.py
@@ -30,11 +30,7 @@ def __init__(self, outdir):

def save(self, article):
output = article.uid() + f".{self.extension()}"
output = (
f"{os.path.splitext(article.source())[0]}-{output}"
if article.source()
else output
)
output = f"{os.path.splitext(article.source())[0]}-{output}" if article.source() else output

with open(os.path.join(self.outdir, output), "w", encoding="utf-8") as output:
self.write(output, article.build())
8 changes: 2 additions & 6 deletions src/python/paperetl/sqlite.py
@@ -152,9 +152,7 @@ def savearticle(self, article):
self.insert(SQLite.ARTICLES, "articles", article.metadata)
except sqlite3.IntegrityError:
# Duplicate detected get entry date to determine action
entry = parser.parse(
self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0]
)
entry = parser.parse(self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0])

# Keep existing article if existing entry date is same or newer
if article.entry() <= entry:
@@ -219,9 +217,7 @@ def insert(self, table, name, row):

# Build insert prepared statement
columns = [name for name, _ in table.items()]
insert = SQLite.INSERT_ROW.format(
table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2]
)
insert = SQLite.INSERT_ROW.format(table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2])

# Execute insert statement
self.cur.execute(insert, self.values(table, row, columns))
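In the collapsed `insert` statement above, `("?, " * len(columns))[:-2]` yields one `?` placeholder per column with the trailing ", " trimmed. A small illustration under an assumed template (the real `SQLite.INSERT_ROW` string is not shown in this hunk):

    # Hypothetical template mirroring the apparent shape of SQLite.INSERT_ROW
    INSERT_ROW = "INSERT INTO {table} ({columns}) VALUES ({values})"

    columns = ["id", "title", "entry"]
    insert = INSERT_ROW.format(
        table="articles", columns=", ".join(columns), values=("?, " * len(columns))[:-2]
    )
    print(insert)  # INSERT INTO articles (id, title, entry) VALUES (?, ?, ?)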
5 changes: 1 addition & 4 deletions src/python/paperetl/table.py
@@ -51,10 +51,7 @@ def extract(table):

for row in rows:
# Build concatenated header value string
values = [
f"{headers[x] if x < len(headers) else ''} {column.text}"
for x, column in enumerate(row)
]
values = [f"{headers[x] if x < len(headers) else ''} {column.text}" for x, column in enumerate(row)]

# Create single row string
value = " ".join(values)
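The collapsed comprehension still pairs each cell with its column header, falling back to an empty string when a row has more cells than headers. A plain-string sketch of the pairing (the real code reads `column.text` from parsed HTML cells):

    headers = ["Name", "Value"]
    row = ["alpha", "1", "extra"]

    # One "<header> <cell>" string per column, blank header past the end
    values = [f"{headers[x] if x < len(headers) else ''} {column}" for x, column in enumerate(row)]
    print(values)  # ['Name alpha', 'Value 1', ' extra']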
12 changes: 3 additions & 9 deletions test/python/testcord19.py
@@ -76,12 +76,8 @@ def testDate(self):
Test article publish dates
"""

self.assertEqual(
Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1)
)
self.assertEqual(
Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10)
)
self.assertEqual(Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1))
self.assertEqual(Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10))
self.assertEqual(Execute.getDate({"publish_time": "bad date"}), None)
self.assertEqual(Execute.getDate({"publish_time": None}), None)

@@ -113,9 +109,7 @@ def testHash(self):
"62520f1c4f656dcb5fe565a4c2bf4ce1f7d435ef",
)
self.assertEqual(
Execute.getHash(
{"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}
),
Execute.getHash({"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}),
"47ed55bfa014cd59f58896c132c36bb0a218d11d",
)
