Skip to content

Commit

Permalink
Add new 2.0 functionality, closes #46, closes #47, closes #48, closes #…
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Dec 3, 2021
1 parent 27d2a2c commit 1a296b3
Show file tree
Hide file tree
Showing 28 changed files with 402 additions and 189 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
build/
dist/
htmlcov/
*egg-info/
__pycache__/
.coverage
.coverage.*
*.pyc
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ min-public-methods=0
max-line-length=150

[MESSAGES CONTROL]
disable=I0011,R0201,W0105,W0108,W0110,W0141,W0621,W0640
disable=R0201
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ PYTHON ?= python
# Download test data
data:
mkdir -p /tmp/paperai
wget -N https://github.com/neuml/paperai/releases/download/v1.3.0/tests.tar.gz -P /tmp
wget -N https://github.com/neuml/paperai/releases/download/v1.10.0/tests.tar.gz -P /tmp
tar -xvzf /tmp/tests.tar.gz -C /tmp

# Unit tests
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ The easiest way to install is via pip and PyPI

pip install paperai

You can also install paperai directly from GitHub. Using a Python Virtual Environment is recommended.
Python 3.6+ is supported. Using a Python [virtual environment](https://docs.python.org/3/library/venv.html) is recommended.

pip install git+https://github.com/neuml/paperai
paperai can also be installed directly from GitHub to access the latest, unreleased features.

Python 3.6+ is supported
pip install git+https://github.com/neuml/paperai

See [this link](https://github.com/neuml/txtai#installation) to help resolve environment-specific install issues.

Expand Down Expand Up @@ -125,15 +125,15 @@ no parameters are passed in.
## Building a report file
Reports support generating output in multiple formats. An example report call:

python -m paperai.report tasks/risks.yml 50 md cord19/models
python -m paperai.report report.yml 50 md cord19/models

The following report formats are supported:

- Markdown (Default) - Renders a Markdown report. Columns and answers are extracted from articles with the results stored in a Markdown file.
- CSV - Renders a CSV report. Columns and answers are extracted from articles with the results stored in a CSV file.
- Annotation - Columns and answers are extracted from articles with the results annotated over the original PDF files. Requires passing in a path with the original PDF files.

In the example above, a file named tasks/risk_factors.md will be created. Example report configuration files can be found [here](https://github.com/neuml/cord19q/tree/master/tasks).
In the example above, a file named report.md will be created. Example report configuration files can be found [here](https://github.com/neuml/cord19q/tree/master/tasks).

## Running queries
The fastest way to run queries is to start a paperai shell
Expand Down
Binary file modified demo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 6 additions & 7 deletions examples/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, path):
"""

# Default list of columns
self.columns = [("Title", True), ("Published", False), ("Publication", False), ("Design", False), ("Sample", False),
("Method", False), ("Entry", False), ("Id", False), ("Content", True)]
self.columns = [("Title", True), ("Published", False), ("Publication", False), ("Entry", False),
("Id", False), ("Content", True)]

# Load model
self.path = path
Expand Down Expand Up @@ -58,17 +58,16 @@ def search(self, query, topn, threshold):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " +
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference " +
"FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

matches = "<br/>".join([text for _, text in documents[uid]])

title = "<a target='_blank' href='%s'>%s</a>" % (article[9], article[0])
title = f"<a target='_blank' href='{article[5]}'>{article[0]}</a>"

article = {"Title": title, "Published": Query.date(article[1]), "Publication": article[2], "Design": Query.design(article[3]),
"Sample": Query.sample(article[4], article[5]), "Method": Query.text(article[6]), "Entry": article[7],
"Id": article[8], "Content": matches}
article = {"Title": title, "Published": Query.date(article[1]), "Publication": article[2], "Entry": article[3],
"Id": article[4], "Content": matches}

articles.append(article)

Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# pylint: disable = C0111
from setuptools import find_packages, setup

with open("README.md", "r") as f:
with open("README.md", "r", encoding="utf-8") as f:
DESCRIPTION = f.read()

setup(name="paperai",
version="1.11.0",
version="2.0.0",
author="NeuML",
description="AI-powered literature discovery and review engine for medical/scientific papers",
long_description=DESCRIPTION,
Expand All @@ -32,6 +32,7 @@
"networkx>=2.4",
"PyYAML>=5.3",
"regex>=2020.5.14",
"text2digits>=0.1.0",
"txtai[api,similarity]>=3.4.0",
"txtmarker>=1.0.0"
],
Expand Down
13 changes: 6 additions & 7 deletions src/python/paperai/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class API(txtai.api.API):
Extended API on top of txtai to return enriched query results.
"""

def search(self, query, request):
def search(self, query, request=None):
"""
Extends txtai API to enrich results with content.
Expand All @@ -28,8 +28,8 @@ def search(self, query, request):

if self.embeddings:
dbfile = os.path.join(self.config["path"], "articles.sqlite")
limit = self.limit(request.query_params.get("limit"))
threshold = float(request.query_params["threshold"]) if "threshold" in request.query_params else None
limit = self.limit(request.query_params.get("limit")) if request else 10
threshold = float(request.query_params["threshold"]) if request and "threshold" in request.query_params else None

with sqlite3.connect(dbfile) as db:
cur = db.cursor()
Expand All @@ -44,16 +44,15 @@ def search(self, query, request):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference " +
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference " +
"FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

score = max([score for score, text in documents[uid]])
matches = [text for _, text in documents[uid]]

article = {"id": article[8], "score": score, "title": article[0], "published": Query.date(article[1]), "publication": article[2],
"design": Query.design(article[3]), "sample": Query.sample(article[4], article[5]), "method": Query.text(article[6]),
"entry": article[7], "reference": article[9], "matches": matches}
article = {"id": article[4], "score": score, "title": article[0], "published": Query.date(article[1]), "publication": article[2],
"entry": article[3], "reference": article[5], "matches": matches}

articles.append(article)

Expand Down
12 changes: 6 additions & 6 deletions src/python/paperai/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .index import Index
from .models import Models

class Export(object):
class Export:
"""
Exports database rows into a text file line-by-line.
"""
Expand All @@ -29,26 +29,26 @@ def stream(dbfile, output):
output: output file to store text
"""

with open(output, "w") as output:
with open(output, "w", encoding="utf-8") as output:
# Connection to database file
db = sqlite3.connect(dbfile)
cur = db.cursor()

# Get all indexed text, with a detected study design, excluding modeling designs
cur.execute(Index.SECTION_QUERY + " AND design NOT IN (0, 9)")
# Get all indexed text
cur.execute(Index.SECTION_QUERY)

count = 0
for _, name, text in cur:
if not name or not re.search(Index.SECTION_FILTER, name.lower()):
count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")
print(f"Streamed {count} documents", end="\r")

# Write row
if text:
output.write(text + "\n")

print("Iterated over %d total rows" % (count))
print(f"Iterated over {count} total rows")

# Free database resources
db.close()
Expand Down
2 changes: 1 addition & 1 deletion src/python/paperai/highlights.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from txtai.pipeline import Tokenizer

class Highlights(object):
class Highlights:
"""
Methods to extract highlights from a list of text sections.
"""
Expand Down
16 changes: 8 additions & 8 deletions src/python/paperai/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@

from .models import Models

class Index(object):
class Index:
"""
Methods to build a new sentence embeddings index.
"""

# Section query and filtering logic constants
SECTION_FILTER = r"background|(?<!.*?results.*?)discussion|introduction|reference"
SECTION_QUERY = "SELECT Id, Name, Text FROM sections WHERE (labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))"
SECTION_QUERY = "SELECT Id, Name, Text FROM sections"

@staticmethod
def stream(dbfile, maxsize):
Expand All @@ -37,11 +37,11 @@ def stream(dbfile, maxsize):
db = sqlite3.connect(dbfile)
cur = db.cursor()

# Select tagged sentences without a NLP label. NLP labels are set for non-informative sentences.
query = Index.SECTION_QUERY + " AND tags is not null"
# Select sentences from tagged articles
query = Index.SECTION_QUERY + " WHERE article in (SELECT article FROM articles a WHERE a.id = article AND a.tags IS NOT NULL)"

if maxsize > 0:
query += " AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT %d)" % maxsize
query += f" AND article in (SELECT id FROM articles ORDER BY entry DESC LIMIT {maxsize})"

# Run the query
cur.execute(query)
Expand All @@ -59,13 +59,13 @@ def stream(dbfile, maxsize):

count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")
print(f"Streamed {count} documents", end="\r")

# Skip documents with no tokens parsed
if tokens:
yield document

print("Iterated over %d total rows" % (count))
print(f"Iterated over {count} total rows")

# Free database resources
db.close()
Expand All @@ -88,7 +88,7 @@ def config(vectors):

# Read YAML index configuration
if vectors.endswith(".yml"):
with open(vectors, "r") as f:
with open(vectors, "r", encoding="utf-8") as f:
return yaml.safe_load(f)

return {"path": vectors, "scoring": "bm25", "pca": 3, "quantize": True}
Expand Down
4 changes: 2 additions & 2 deletions src/python/paperai/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from txtai.embeddings import Embeddings

class Models(object):
class Models:
"""
Common methods for generating data paths.
"""
Expand Down Expand Up @@ -95,7 +95,7 @@ def load(path):
dbfile = os.path.join(path, "articles.sqlite")

if os.path.isfile(os.path.join(path, "config")):
print("Loading model from %s" % path)
print(f"Loading model from {path}")
embeddings = Embeddings()
embeddings.load(path)
else:
Expand Down
61 changes: 12 additions & 49 deletions src/python/paperai/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .highlights import Highlights
from .models import Models

class Query(object):
class Query:
"""
Methods to query an embeddings index.
"""
Expand Down Expand Up @@ -238,7 +238,7 @@ def authors(authors):
else:
authors = authors.split()[-1]

return "%s et al" % authors
return f"{authors} et al"

return None

Expand Down Expand Up @@ -289,40 +289,6 @@ def text(text):

return text

@staticmethod
def design(design):
"""
Formats a study design field.
Args:
design: study design integer
Returns:
Study Design string
"""

# Study design type mapping
mapping = {1:"Systematic review", 2:"Randomized control trial", 3:"Non-randomized trial",
4:"Prospective observational", 5:"Time-to-event analysis", 6:"Retrospective observational",
7:"Cross-sectional", 8:"Case series", 9:"Modeling", 0:"Other"}

return mapping[design]

@staticmethod
def sample(size, text):
"""
Formats a sample string.
Args:
size: Sample size
text: Sample text
Returns:
Formatted sample text
"""

return "[%s] %s" % (size, Query.text(text)) if size else Query.text(text)

@staticmethod
def query(embeddings, db, query, topn, threshold):
"""
Expand All @@ -341,15 +307,15 @@ def query(embeddings, db, query, topn, threshold):

cur = db.cursor()

print(Query.render("#Query: %s" % query, theme="729.8953") + "\n")
print(Query.render(f"#Query: {query}", theme="729.8953") + "\n")

# Query for best matches
results = Query.search(embeddings, cur, query, topn, threshold)

# Extract top sections as highlights
print(Query.render("# Highlights"))
for highlight in Query.highlights(results, int(topn / 5)):
print(Query.render("## - %s" % Query.text(highlight)))
print(Query.render(f"## - {Query.text(highlight)}"))

print()

Expand All @@ -360,22 +326,19 @@ def query(embeddings, db, query, topn, threshold):

# Print each result, sorted by max score descending
for uid in sorted(documents, key=lambda k: sum([x[0] for x in documents[k]]), reverse=True):
cur.execute("SELECT Title, Published, Publication, Design, Size, Sample, Method, Entry, Id, Reference FROM articles WHERE id = ?", [uid])
cur.execute("SELECT Title, Published, Publication, Entry, Id, Reference FROM articles WHERE id = ?", [uid])
article = cur.fetchone()

print("Title: %s" % article[0])
print("Published: %s" % Query.date(article[1]))
print("Publication: %s" % article[2])
print("Design: %s" % Query.design(article[3]))
print("Sample: %s" % Query.sample(article[4], article[5]))
print("Method: %s" % Query.text(article[6]))
print("Entry: %s" % article[7])
print("Id: %s" % article[8])
print("Reference: %s" % article[9])
print(f"Title: {article[0]}")
print(f"Published: {Query.date(article[1])}")
print(f"Publication: {article[2]}")
print(f"Entry: {article[3]}")
print(f"Id: {article[4]}")
print(f"Reference: {article[5]}")

# Print top matches
for score, text in documents[uid]:
print(Query.render("## - (%.4f): %s" % (score, Query.text(text)), html=False))
print(Query.render(f"## - ({score:.4f}): {Query.text(text)}", html=False))

print()

Expand Down
Loading

0 comments on commit 1a296b3

Please sign in to comment.