Skip to content

Commit

Permalink
Add new 2.0 functionality, closes #46, closes #47, closes #48, closes #…
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Dec 3, 2021
1 parent 27d2a2c commit 1a296b3
Show file tree
Hide file tree
Showing 28 changed files with 402 additions and 189 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
build/
dist/
htmlcov/
*egg-info/
__pycache__/
.coverage
.coverage.*
*.pyc
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ min-public-methods=0
max-line-length=150

[MESSAGES CONTROL]
disable=I0011,R0201,W0105,W0108,W0110,W0141,W0621,W0640
disable=R0201
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ PYTHON ?= python
# Download test data
data:
mkdir -p /tmp/paperai
wget -N https://github.com/neuml/paperai/releases/download/v1.3.0/tests.tar.gz -P /tmp
wget -N https://github.com/neuml/paperai/releases/download/v1.10.0/tests.tar.gz -P /tmp
tar -xvzf /tmp/tests.tar.gz -C /tmp

# Unit tests
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ The easiest way to install is via pip and PyPI

pip install paperai

You can also install paperai directly from GitHub. Using a Python Virtual Environment is recommended.
Python 3.6+ is supported. Using a Python [virtual environment](https://docs.python.org/3/library/venv.html) is recommended.

pip install git+https://github.com/neuml/paperai
paperai can also be installed directly from GitHub to access the latest, unreleased features.

Python 3.6+ is supported
pip install git+https://github.com/neuml/paperai

See [this link](https://github.com/neuml/txtai#installation) to help resolve environment-specific install issues.

Expand Down Expand Up @@ -125,15 +125,15 @@ no parameters are passed in.
## Building a report file
Reports support generating output in multiple formats. An example report call:

python -m paperai.report tasks/risks.yml 50 md cord19/models
python -m paperai.report report.yml 50 md cord19/models

The following report formats are supported:

- Markdown (Default) - Renders a Markdown report. Columns and answers are extracted from articles with the results stored in a Markdown file.
- CSV - Renders a CSV report. Columns and answers are extracted from articles with the results stored in a CSV file.
- Annotation - Columns and answers are extracted from articles with the results annotated over the original PDF files. Requires passing in a path with the original PDF files.

In the example above, a file named tasks/risk_factors.md will be created. Example report configuration files can be found [here](https://github.com/neuml/cord19q/tree/master/tasks).
In the example above, a file named report.md will be created. Example report configuration files can be found [here](https://github.com/neuml/cord19q/tree/master/tasks).

## Running queries
The fastest way to run queries is to start a paperai shell
Expand Down
Binary file modified demo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 6 additions & 7 deletions examples/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, path):
"""

# Default list of columns
self.columns = [("Title", True), ("Published", False), ("Publication", False), ("Design", False), ("Sample", False),
("Method", False), ("Entry", False), ("Id", False), ("Content", True)]
self.columns = [("Title", True), ("Published", False), ("Publication", False), ("Entry", False),
("Id", False), ("Content", True)]

# Load model
self.path = path
Expand Down Expand Up @@ -58,17 +58,16 @@ def search(self, query, topn, threshold):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " +
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference " +
"FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

matches = "<br/>".join([text for _, text in documents[uid]])

title = "<a target='_blank' href='%s'>%s</a>" % (article[9], article[0])
title = f"<a target='_blank' href='{article[5]}'>{article[0]}</a>"

article = {"Title": title, "Published": Query.date(article[1]), "Publication": article[2], "Design": Query.design(article[3]),
"Sample": Query.sample(article[4], article[5]), "Method": Query.text(article[6]), "Entry": article[7],
"Id": article[8], "Content": matches}
article = {"Title": title, "Published": Query.date(article[1]), "Publication": article[2], "Entry": article[3],
"Id": article[4], "Content": matches}

articles.append(article)

Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# pylint: disable = C0111
from setuptools import find_packages, setup

with open("README.md", "r") as f:
with open("README.md", "r", encoding="utf-8") as f:
DESCRIPTION = f.read()

setup(name="paperai",
version="1.11.0",
version="2.0.0",
author="NeuML",
description="AI-powered literature discovery and review engine for medical/scientific papers",
long_description=DESCRIPTION,
Expand All @@ -32,6 +32,7 @@
"networkx>=2.4",
"PyYAML>=5.3",
"regex>=2020.5.14",
"text2digits>=0.1.0",
"txtai[api,similarity]>=3.4.0",
"txtmarker>=1.0.0"
],
Expand Down
13 changes: 6 additions & 7 deletions src/python/paperai/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class API(txtai.api.API):
Extended API on top of txtai to return enriched query results.
"""

def search(self, query, request):
def search(self, query, request=None):
"""
Extends txtai API to enrich results with content.
Expand All @@ -28,8 +28,8 @@ def search(self, query, request):

if self.embeddings:
dbfile = os.path.join(self.config["path"], "articles.sqlite")
limit = self.limit(request.query_params.get("limit"))
threshold = float(request.query_params["threshold"]) if "threshold" in request.query_params else None
limit = self.limit(request.query_params.get("limit")) if request else 10
threshold = float(request.query_params["threshold"]) if request and "threshold" in request.query_params else None

with sqlite3.connect(dbfile) as db:
cur = db.cursor()
Expand All @@ -44,16 +44,15 @@ def search(self, query, request):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " +
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference " +
"FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

score = max([score for score, text in documents[uid]])
matches = [text for _, text in documents[uid]]

article = {"id": article[8], "score": score, "title": article[0], "published": Query.date(article[1]), "publication": article[2],
"design": Query.design(article[3]), "sample": Query.sample(article[4], article[5]), "method": Query.text(article[6]),
"entry": article[7], "reference": article[9], "matches": matches}
article = {"id": article[4], "score": score, "title": article[0], "published": Query.date(article[1]), "publication": article[2],
"entry": article[3], "reference": article[5], "matches": matches}

articles.append(article)

Expand Down
12 changes: 6 additions & 6 deletions src/python/paperai/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .index import Index
from .models import Models

class Export(object):
class Export:
"""
Exports database rows into a text file line-by-line.
"""
Expand All @@ -29,26 +29,26 @@ def stream(dbfile, output):
output: output file to store text
"""

with open(output, "w") as output:
with open(output, "w", encoding="utf-8") as output:
# Connection to database file
db = sqlite3.connect(dbfile)
cur = db.cursor()

# Get all indexed text, with a detected study design, excluding modeling designs
cur.execute(Index.SECTION_QUERY + " AND design NOT IN (0, 9)")
# Get all indexed text
cur.execute(Index.SECTION_QUERY)

count = 0
for _, name, text in cur:
if not name or not re.search(Index.SECTION_FILTER, name.lower()):
count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")
print(f"Streamed {count} documents", end="\r")

# Write row
if text:
output.write(text + "\n")

print("Iterated over %d total rows" % (count))
print(f"Iterated over {count} total rows")

# Free database resources
db.close()
Expand Down
2 changes: 1 addition & 1 deletion src/python/paperai/highlights.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from txtai.pipeline import Tokenizer

class Highlights(object):
class Highlights:
"""
Methods to extract highlights from a list of text sections.
"""
Expand Down
16 changes: 8 additions & 8 deletions src/python/paperai/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@

from .models import Models

class Index(object):
class Index:
"""
Methods to build a new sentence embeddings index.
"""

# Section query and filtering logic constants
SECTION_FILTER = r"background|(?<!.*?results.*?)discussion|introduction|reference"
SECTION_QUERY = "SELECT Id, Name, Text FROM sections WHERE (labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))"
SECTION_QUERY = "SELECT Id, Name, Text FROM sections"

@staticmethod
def stream(dbfile, maxsize):
Expand All @@ -37,11 +37,11 @@ def stream(dbfile, maxsize):
db = sqlite3.connect(dbfile)
cur = db.cursor()

# Select tagged sentences without a NLP label. NLP labels are set for non-informative sentences.
query = Index.SECTION_QUERY + " AND tags is not null"
# Select sentences from tagged articles
query = Index.SECTION_QUERY + " WHERE article in (SELECT article FROM articles a WHERE a.id = article AND a.tags IS NOT NULL)"

if maxsize > 0:
query += " AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT %d)" % maxsize
query += f" AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT {maxsize})"

# Run the query
cur.execute(query)
Expand All @@ -59,13 +59,13 @@ def stream(dbfile, maxsize):

count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")
print(f"Streamed {count} documents", end="\r")

# Skip documents with no tokens parsed
if tokens:
yield document

print("Iterated over %d total rows" % (count))
print(f"Iterated over {count} total rows")

# Free database resources
db.close()
Expand All @@ -88,7 +88,7 @@ def config(vectors):

# Read YAML index configuration
if vectors.endswith(".yml"):
with open(vectors, "r") as f:
with open(vectors, "r", encoding="utf-8") as f:
return yaml.safe_load(f)

return {"path": vectors, "scoring": "bm25", "pca": 3, "quantize": True}
Expand Down
4 changes: 2 additions & 2 deletions src/python/paperai/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from txtai.embeddings import Embeddings

class Models(object):
class Models:
"""
Common methods for generating data paths.
"""
Expand Down Expand Up @@ -95,7 +95,7 @@ def load(path):
dbfile = os.path.join(path, "articles.sqlite")

if os.path.isfile(os.path.join(path, "config")):
print("Loading model from %s" % path)
print(f"Loading model from {path}")
embeddings = Embeddings()
embeddings.load(path)
else:
Expand Down
61 changes: 12 additions & 49 deletions src/python/paperai/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .highlights import Highlights
from .models import Models

class Query(object):
class Query:
"""
Methods to query an embeddings index.
"""
Expand Down Expand Up @@ -238,7 +238,7 @@ def authors(authors):
else:
authors = authors.split()[-1]

return "%s et al" % authors
return f"{authors} et al"

return None

Expand Down Expand Up @@ -289,40 +289,6 @@ def text(text):

return text

@staticmethod
def design(design):
"""
Formats a study design field.
Args:
design: study design integer
Returns:
Study Design string
"""

# Study design type mapping
mapping = {1:"Systematic review", 2:"Randomized control trial", 3:"Non-randomized trial",
4:"Prospective observational", 5:"Time-to-event analysis", 6:"Retrospective observational",
7:"Cross-sectional", 8:"Case series", 9:"Modeling", 0:"Other"}

return mapping[design]

@staticmethod
def sample(size, text):
"""
Formats a sample string.
Args:
size: Sample size
text: Sample text
Returns:
Formatted sample text
"""

return "[%s] %s" % (size, Query.text(text)) if size else Query.text(text)

@staticmethod
def query(embeddings, db, query, topn, threshold):
"""
Expand All @@ -341,15 +307,15 @@ def query(embeddings, db, query, topn, threshold):

cur = db.cursor()

print(Query.render("#Query: %s" % query, theme="729.8953") + "\n")
print(Query.render(f"#Query: {query}", theme="729.8953") + "\n")

# Query for best matches
results = Query.search(embeddings, cur, query, topn, threshold)

# Extract top sections as highlights
print(Query.render("# Highlights"))
for highlight in Query.highlights(results, int(topn / 5)):
print(Query.render("## - %s" % Query.text(highlight)))
print(Query.render(f"## - {Query.text(highlight)}"))

print()

Expand All @@ -360,22 +326,19 @@ def query(embeddings, db, query, topn, threshold):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference FROM articles WHERE id = ?", [uid])
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

print("Title: %s" % article[0])
print("Published: %s" % Query.date(article[1]))
print("Publication: %s" % article[2])
print("Design: %s" % Query.design(article[3]))
print("Sample: %s" % Query.sample(article[4], article[5]))
print("Method: %s" % Query.text(article[6]))
print("Entry: %s" % article[7])
print("Id: %s" % article[8])
print("Reference: %s" % article[9])
print(f"Title: {article[0]}")
print(f"Published: {Query.date(article[1])}")
print(f"Publication: {article[2]}")
print(f"Entry: {article[3]}")
print(f"Id: {article[4]}")
print(f"Reference: {article[5]}")

# Print top matches
for score, text in documents[uid]:
print(Query.render("## - (%.4f): %s" % (score, Query.text(text)), html=False))
print(Query.render(f"## - ({score:.4f}): {Query.text(text)}", html=False))

print()

Expand Down
Loading

0 comments on commit 1a296b3

Please sign in to comment.