From 6706e3ca8c9ba08244d501db9a706e321d87251a Mon Sep 17 00:00:00 2001
From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com>
Date: Sun, 22 Dec 2024 09:25:31 -0500
Subject: [PATCH] Update project formatting

---
 .pylintrc                             |  3 ---
 src/python/paperetl/cord19/entry.py   | 29 ++++-------------------
 src/python/paperetl/cord19/execute.py | 18 ++++-----------
 src/python/paperetl/cord19/section.py | 10 ++------
 src/python/paperetl/file/arx.py       | 12 ++--------
 src/python/paperetl/file/csvf.py      |  4 +---
 src/python/paperetl/file/pdf.py       |  4 +---
 src/python/paperetl/file/pmb.py       | 33 +++++----------------------
 src/python/paperetl/file/tei.py       | 20 ++++------------
 src/python/paperetl/filesystem.py     |  6 +----
 src/python/paperetl/sqlite.py         |  8 ++-----
 src/python/paperetl/table.py          |  5 +---
 test/python/testcord19.py             | 12 +++-------
 test/python/testelastic.py            |  6 ++---
 test/python/testfiledatabase.py       |  8 ++-----
 test/python/testfileexport.py         |  4 +---
 16 files changed, 38 insertions(+), 144 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index dcb45dd..3bb9f57 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -15,6 +15,3 @@ min-public-methods=0
 
 [FORMAT]
 max-line-length=150
-
-[MESSAGES CONTROL]
-disable=R0201
diff --git a/src/python/paperetl/cord19/entry.py b/src/python/paperetl/cord19/entry.py
index 56703c8..79b6a87 100644
--- a/src/python/paperetl/cord19/entry.py
+++ b/src/python/paperetl/cord19/entry.py
@@ -44,11 +44,7 @@ def download(maxdate):
 
         # Read list of dates from AI2 CORD-19 page
         changelog = requests.get(f"{URL}/latest/changelog")
-        dates = [
-            line
-            for line in changelog.text.splitlines()
-            if re.match(r"\d{4}\-\d{2}\-\d{2}", line)
-        ]
+        dates = [line for line in changelog.text.splitlines() if re.match(r"\d{4}\-\d{2}\-\d{2}", line)]
 
         # Sort dates
         dates = sorted(dates)
@@ -66,11 +62,7 @@
 
             # Current date
             current = datetime.strptime(date, "%Y-%m-%d")
-            if (
-                date == dates[-1]
-                or current.day == 1
-                or (last and current.month != last.month)
-            ):
+            if date == dates[-1] or current.day == 1 or (last and current.month != last.month):
                 url = f"{URL}/{date}/metadata.csv"
                 path = os.path.join(DIRECTORY, f"{date}.csv")
                 print(f"Retrieving {url} to {path}")
@@ -100,14 +92,7 @@ def run(output=None, maxdate=None):
         Entry.download(maxdate)
 
         # Get sorted list of metadata csv files
-        files = sorted(
-            [
-                f
-                for f in os.listdir(DIRECTORY)
-                if os.path.isfile(os.path.join(DIRECTORY, f))
-                and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)
-            ]
-        )
+        files = sorted([f for f in os.listdir(DIRECTORY) if os.path.isfile(os.path.join(DIRECTORY, f)) and re.match(r"\d{4}-\d{2}-\d{2}\.csv", f)])
 
         uids = {}
 
@@ -115,9 +100,7 @@
         for metadata in files:
             # Parse date from file name
             date = os.path.splitext(metadata)[0]
-            with open(
-                os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8"
-            ) as csvfile:
+            with open(os.path.join(DIRECTORY, metadata), mode="r", encoding="utf-8") as csvfile:
                 for row in csv.DictReader(csvfile):
                     # Get hash value
                     sha = Execute.getHash(row)
@@ -131,9 +114,7 @@
             os.makedirs(output, exist_ok=True)
 
         # Output file
-        output = (
-            os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"
-        )
+        output = os.path.join(output, "entry-dates.csv") if output else "entry-dates.csv"
 
         # Build DataFrame
         df = pd.DataFrame(uids.values(), columns=["cord_uid", "sha", "date"])
diff --git a/src/python/paperetl/cord19/execute.py b/src/python/paperetl/cord19/execute.py
index eec8529..40501ce 100644
--- a/src/python/paperetl/cord19/execute.py
+++ b/src/python/paperetl/cord19/execute.py
@@ -140,9 +140,7 @@ def stream(indir, dates):
 
         # Filter out duplicate ids
         ids, hashes = set(), set()
-        with open(
-            os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
-        ) as csvfile:
+        with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
             for row in csv.DictReader(csvfile):
                 # cord uid
                 uid = row["cord_uid"]
@@ -182,11 +180,7 @@ def process(params):
         sections = Section.parse(row, indir)
 
         # Search recent documents for COVID-19 keywords
-        tags = (
-            Execute.getTags(sections)
-            if not date or date >= datetime(2019, 7, 1)
-            else None
-        )
+        tags = Execute.getTags(sections) if not date or date >= datetime(2019, 7, 1) else None
 
         # Article metadata - id, source, published, publication, authors, affiliations, affiliation, title,
         # tags, reference
@@ -232,9 +226,7 @@ def entryDates(indir, entryfile):
 
         # Reduce down to entries only in metadata
         dates = {}
-        with open(
-            os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8"
-        ) as csvfile:
+        with open(os.path.join(indir, "metadata.csv"), mode="r", encoding="utf-8") as csvfile:
             for row in csv.DictReader(csvfile):
                 # Lookup hash
                 sha = Execute.getHash(row)
@@ -270,9 +262,7 @@ def run(indir, url, entryfile=None, replace=False):
 
         # Create process pool
         with Pool(os.cpu_count()) as pool:
-            for article in pool.imap(
-                Execute.process, Execute.stream(indir, dates), 100
-            ):
+            for article in pool.imap(Execute.process, Execute.stream(indir, dates), 100):
                 # Get unique id
                 uid = article.uid()
diff --git a/src/python/paperetl/cord19/section.py b/src/python/paperetl/cord19/section.py
index 40afa94..3c81e94 100644
--- a/src/python/paperetl/cord19/section.py
+++ b/src/python/paperetl/cord19/section.py
@@ -57,11 +57,7 @@ def parse(row, directory):
                     # Extract text from body
                     for section in data["body_text"]:
                         # Section name and text
-                        name = (
-                            section["section"].upper()
-                            if len(section["section"].strip()) > 0
-                            else None
-                        )
+                        name = section["section"].upper() if len(section["section"].strip()) > 0 else None
                         text = section["text"].replace("\n", " ")
 
                         # Clean and transform text
@@ -73,9 +69,7 @@
                     # Extract text from tables
                     for name, entry in data["ref_entries"].items():
                         if "html" in entry and entry["html"]:
-                            sections.extend(
-                                [(name, x) for x in Table.parse(entry["html"])]
-                            )
+                            sections.extend([(name, x) for x in Table.parse(entry["html"])])
 
             # pylint: disable=W0703
             except Exception as ex:
diff --git a/src/python/paperetl/file/arx.py b/src/python/paperetl/file/arx.py
index 01abd5a..62a0e3e 100644
--- a/src/python/paperetl/file/arx.py
+++ b/src/python/paperetl/file/arx.py
@@ -49,10 +49,7 @@ def parse(stream, source):
             authors, affiliations, affiliation = ARX.authors(entry.find_all("author"))
 
             # Get tags
-            tags = "; ".join(
-                ["ARX"]
-                + [category.get("term") for category in entry.find_all("category")]
-            )
+            tags = "; ".join(["ARX"] + [category.get("term") for category in entry.find_all("category")])
 
             # Transform section text
             sections = ARX.sections(title, ARX.get(entry, "summary"))
@@ -128,12 +125,7 @@ def authors(elements):
             authors.append(", ".join(name.rsplit(maxsplit=1)[::-1]))
 
             # Add affiliations
-            affiliations.extend(
-                [
-                    ARX.clean(affiliation.text)
-                    for affiliation in author.find_all("arxiv:affiliation")
-                ]
-            )
+            affiliations.extend([ARX.clean(affiliation.text) for affiliation in author.find_all("arxiv:affiliation")])
 
         return (
             "; ".join(authors),
diff --git a/src/python/paperetl/file/csvf.py b/src/python/paperetl/file/csvf.py
index 7704896..f9316f9 100644
--- a/src/python/paperetl/file/csvf.py
+++ b/src/python/paperetl/file/csvf.py
@@ -71,9 +71,7 @@ def metadata(row, source):
             elif field == "entry":
                 # Parse date field if found, otherwise use current date
                 value = row.get(field)
-                value = parser.parse(
-                    value if value else datetime.datetime.now().strftime("%Y-%m-%d")
-                )
+                value = parser.parse(value if value else datetime.datetime.now().strftime("%Y-%m-%d"))
             else:
                 value = row.get(field)
 
diff --git a/src/python/paperetl/file/pdf.py b/src/python/paperetl/file/pdf.py
index 2a3cf45..aa20c13 100644
--- a/src/python/paperetl/file/pdf.py
+++ b/src/python/paperetl/file/pdf.py
@@ -46,9 +46,7 @@ def convert(stream):
         """
 
         # Call GROBID API
-        response = requests.post(
-            "http://localhost:8070/api/processFulltextDocument", files={"input": stream}
-        )
+        response = requests.post("http://localhost:8070/api/processFulltextDocument", files={"input": stream})
 
         # Validate request was successful
         if not response.ok:
diff --git a/src/python/paperetl/file/pmb.py b/src/python/paperetl/file/pmb.py
index ecec9f1..3f1bfe4 100644
--- a/src/python/paperetl/file/pmb.py
+++ b/src/python/paperetl/file/pmb.py
@@ -257,11 +257,7 @@ def mesh(citation):
             list of MeSH codes
         """
 
-        return [
-            descriptor.attrib["UI"]
-            for descriptor in citation.findall("MeshHeadingList//DescriptorName")
-            if descriptor.attrib["UI"]
-        ]
+        return [descriptor.attrib["UI"] for descriptor in citation.findall("MeshHeadingList//DescriptorName") if descriptor.attrib["UI"]]
 
     @staticmethod
     def sections(article, title):
@@ -344,12 +340,7 @@ def formatted(element):
             # - cleaned inner text has data
             # - no section text queued
             # - element tag is a <b> or matches a defined section background category name
-            if (
-                not tag
-                and ctext
-                and not texts
-                and (x.tag.lower() == "b" or PMB.background(ctext))
-            ):
+            if not tag and ctext and not texts and (x.tag.lower() == "b" or PMB.background(ctext)):
                 tag = x.tag
 
             # New section if one of following:
@@ -359,14 +350,10 @@
             # - no section text
             # - last section text element ends in period
             # pylint: disable=R0916
-            if ((x.tag == tag and ctext) or (not tag and texts)) and (
-                not texts or texts[-1].strip().endswith(".")
-            ):
+            if ((x.tag == tag and ctext) or (not tag and texts)) and (not texts or texts[-1].strip().endswith(".")):
                 # Save previous section
                 if texts:
-                    sections.extend(
-                        [(name, t) for t in sent_tokenize("".join(texts).strip())]
-                    )
+                    sections.extend([(name, t) for t in sent_tokenize("".join(texts).strip())])
 
                 # Reset section name/texts
                 name = ctext if tag else "ABSTRACT"
@@ -401,11 +388,7 @@ def parsed(elements):
 
         # Parsed abstract
         for element in elements:
-            name = (
-                PMB.section(element.attrib["Label"])
-                if "Label" in element.attrib
-                else None
-            )
+            name = PMB.section(element.attrib["Label"]) if "Label" in element.attrib else None
             name = name if name else "ABSTRACT"
 
             if element.text:
@@ -429,11 +412,7 @@ def background(name):
             True if the section name is a background category
         """
 
-        return [
-            x
-            for x in ["aim", "introduction", "background", "purpose", "objective"]
-            if x in name.lower()
-        ]
+        return [x for x in ["aim", "introduction", "background", "purpose", "objective"] if x in name.lower()]
 
     @staticmethod
     def section(name):
diff --git a/src/python/paperetl/file/tei.py b/src/python/paperetl/file/tei.py
index a190324..7d77947 100644
--- a/src/python/paperetl/file/tei.py
+++ b/src/python/paperetl/file/tei.py
@@ -55,9 +55,7 @@ def parse(stream, source):
         sections = TEI.text(soup, title)
 
         # Derive uid
-        uid = hashlib.sha1(
-            title.encode("utf-8") if title else reference.encode("utf-8")
-        ).hexdigest()
+        uid = hashlib.sha1(title.encode("utf-8") if title else reference.encode("utf-8")).hexdigest()
 
         # Default title to source if empty
         title = title if title else source
@@ -95,11 +93,7 @@ def date(published):
         # Parse publication date
         # pylint: disable=W0702
         try:
-            published = (
-                parser.parse(published["when"])
-                if published and "when" in published.attrs
-                else None
-            )
+            published = parser.parse(published["when"]) if published and "when" in published.attrs else None
         except:
             published = None
 
@@ -161,11 +155,7 @@ def metadata(soup):
             authors, affiliations, affiliation = TEI.authors(source)
 
             struct = soup.find("biblstruct")
-            reference = (
-                "https://doi.org/" + struct.find("idno").text
-                if struct and struct.find("idno")
-                else None
-            )
+            reference = "https://doi.org/" + struct.find("idno").text if struct and struct.find("idno") else None
         else:
             published, publication, authors, affiliations, affiliation, reference = (
                 None,
@@ -230,9 +220,7 @@ def text(soup, title):
             else:
                 name = None
 
-            text = " ".join(
-                [str(e.text) if hasattr(e, "text") else str(e) for e in children]
-            )
+            text = " ".join([str(e.text) if hasattr(e, "text") else str(e) for e in children])
             text = text.replace("\n", " ")
 
             # Transform and clean text
diff --git a/src/python/paperetl/filesystem.py b/src/python/paperetl/filesystem.py
index 4e7599a..52ea2a1 100644
--- a/src/python/paperetl/filesystem.py
+++ b/src/python/paperetl/filesystem.py
@@ -30,11 +30,7 @@ def __init__(self, outdir):
 
     def save(self, article):
         output = article.uid() + f".{self.extension()}"
-        output = (
-            f"{os.path.splitext(article.source())[0]}-{output}"
-            if article.source()
-            else output
-        )
+        output = f"{os.path.splitext(article.source())[0]}-{output}" if article.source() else output
 
         with open(os.path.join(self.outdir, output), "w", encoding="utf-8") as output:
             self.write(output, article.build())
diff --git a/src/python/paperetl/sqlite.py b/src/python/paperetl/sqlite.py
index 5f803ac..19255d2 100644
--- a/src/python/paperetl/sqlite.py
+++ b/src/python/paperetl/sqlite.py
@@ -152,9 +152,7 @@ def savearticle(self, article):
             self.insert(SQLite.ARTICLES, "articles", article.metadata)
         except sqlite3.IntegrityError:
             # Duplicate detected get entry date to determine action
-            entry = parser.parse(
-                self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0]
-            )
+            entry = parser.parse(self.cur.execute(SQLite.LOOKUP_ENTRY, [article.uid()]).fetchone()[0])
 
             # Keep existing article if existing entry date is same or newer
             if article.entry() <= entry:
@@ -219,9 +217,7 @@ def insert(self, table, name, row):
 
         # Build insert prepared statement
         columns = [name for name, _ in table.items()]
-        insert = SQLite.INSERT_ROW.format(
-            table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2]
-        )
+        insert = SQLite.INSERT_ROW.format(table=name, columns=", ".join(columns), values=("?, " * len(columns))[:-2])
 
         # Execute insert statement
         self.cur.execute(insert, self.values(table, row, columns))
diff --git a/src/python/paperetl/table.py b/src/python/paperetl/table.py
index bf4628d..c65a2b7 100644
--- a/src/python/paperetl/table.py
+++ b/src/python/paperetl/table.py
@@ -51,10 +51,7 @@ def extract(table):
 
         for row in rows:
             # Build concatenated header value string
-            values = [
-                f"{headers[x] if x < len(headers) else ''} {column.text}"
-                for x, column in enumerate(row)
-            ]
+            values = [f"{headers[x] if x < len(headers) else ''} {column.text}" for x, column in enumerate(row)]
 
             # Create single row string
             value = " ".join(values)
diff --git a/test/python/testcord19.py b/test/python/testcord19.py
index 0fafc48..ab42acf 100644
--- a/test/python/testcord19.py
+++ b/test/python/testcord19.py
@@ -76,12 +76,8 @@ def testDate(self):
         Test article publish dates
         """
 
-        self.assertEqual(
-            Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1)
-        )
-        self.assertEqual(
-            Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10)
-        )
+        self.assertEqual(Execute.getDate({"publish_time": "2020"}), datetime(2020, 1, 1))
+        self.assertEqual(Execute.getDate({"publish_time": "2020-10-10"}), datetime(2020, 10, 10))
         self.assertEqual(Execute.getDate({"publish_time": "bad date"}), None)
         self.assertEqual(Execute.getDate({"publish_time": None}), None)
 
@@ -113,9 +109,7 @@ def testHash(self):
             "62520f1c4f656dcb5fe565a4c2bf4ce1f7d435ef",
         )
         self.assertEqual(
-            Execute.getHash(
-                {"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}
-            ),
+            Execute.getHash({"sha": "47ed55bfa014cd59f58896c132c36bb0a218d11d; abcdef"}),
             "47ed55bfa014cd59f58896c132c36bb0a218d11d",
         )
 
diff --git a/test/python/testelastic.py b/test/python/testelastic.py
index 30421c8..c546663 100644
--- a/test/python/testelastic.py
+++ b/test/python/testelastic.py
@@ -15,6 +15,7 @@ class Indices:
     Mock elasticsearch class for testing
     """
 
+    # pylint: disable=C3001
     exists = lambda *args, **kwargs: True
     delete = lambda *args, **kwargs: True
     refresh = lambda *args, **kwargs: True
@@ -25,6 +26,7 @@ class ElasticStub:
     Mock elasticsearch class for testing
     """
 
+    # pylint: disable=C3001
     indices = Indices()
     bulk = lambda *args: True
 
@@ -34,9 +36,7 @@ class TestElastic(unittest.TestCase):
     Elastic tests
    """
 
-    @mock.patch(
-        "paperetl.elastic.Elasticsearch", mock.MagicMock(return_value=ElasticStub())
-    )
+    @mock.patch("paperetl.elastic.Elasticsearch", mock.MagicMock(return_value=ElasticStub()))
     @mock.patch("paperetl.elastic.helpers", mock.MagicMock(return_value=ElasticStub()))
     def testSave(self):
         """
diff --git a/test/python/testfiledatabase.py b/test/python/testfiledatabase.py
index da4dc5f..db21e8b 100644
--- a/test/python/testfiledatabase.py
+++ b/test/python/testfiledatabase.py
@@ -47,9 +47,7 @@ def setUpClass(cls):
             output.write("0\n")
 
         # Run again with replace=False and filtering
-        Execute.run(
-            Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models"
-        )
+        Execute.run(Utils.FILE + "/data", Utils.FILE + "/models", Utils.FILE + "/models")
 
     def setUp(self):
         """
@@ -99,9 +97,7 @@ def testArticles(self):
 
         self.articles(hashes)
 
-    @mock.patch(
-        "paperetl.file.pdf.requests.post", mock.MagicMock(return_value=RequestsStub())
-    )
+    @mock.patch("paperetl.file.pdf.requests.post", mock.MagicMock(return_value=RequestsStub()))
     def testPDF(self):
         """
         Tests parsing PDFs
diff --git a/test/python/testfileexport.py b/test/python/testfileexport.py
index d0ffd85..e72f465 100644
--- a/test/python/testfileexport.py
+++ b/test/python/testfileexport.py
@@ -58,9 +58,7 @@ def export(self, method):
                 articles += 1
 
                 with open(path, encoding="utf-8") as ifile:
-                    data = (
-                        yaml.safe_load(ifile) if method == "yaml" else json.load(ifile)
-                    )
+                    data = yaml.safe_load(ifile) if method == "yaml" else json.load(ifile)
                     sections += len(data["sections"])
 
         # Validate counts