Skip to content

Commit

Permalink
Expose getTermPositions in the IndexReader class from Anserini (#223)
Browse files Browse the repository at this point in the history
  • Loading branch information
nsndimt authored Aug 6, 2020
1 parent 8f70f51 commit 49fd7cb
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
19 changes: 19 additions & 0 deletions docs/usage-indexreader.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,25 @@ print(doc_vector)
```

The result is a dictionary where the keys are the analyzed terms and the values are the term frequencies.

If you want to know the positions of each term in the document, you can use `get_term_positions`:
```python
term_positions = index_reader.get_term_positions('FBIS4-67701')
print(term_positions)
```
The result is a dictionary where the keys are the analyzed terms and the values are lists of the positions at which each term occurs in the document.

If you want to reconstruct the document using the position information, you can do this:
```python
# Flatten the mapping into one (position, term) pair per term occurrence.
occurrences = [(p, term) for term, positions in term_positions.items() for p in positions]
# Order the occurrences by position only, then join the terms back into a string.
occurrences.sort(key=lambda pair: pair[0])
doc = ' '.join(term for _, term in occurrences)
print(doc)
```
The reconstructed document contains the analyzed terms, whereas [doc.contents()](https://github.com/castorini/pyserini/tree/master#how-do-i-fetch-a-document) contains the unanalyzed terms.

To compute the tf-idf representation of a document, do something like this:

```python
Expand Down
24 changes: 24 additions & 0 deletions pyserini/index/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,30 @@ def get_document_vector(self, docid: str) -> Optional[Dict[str, int]]:
doc_vector_dict[term] = doc_vector_map.get(JString(term.encode('utf-8')))
return doc_vector_dict

def get_term_positions(self, docid: str) -> Optional[Dict[str, list]]:
    """Return the term position mapping of the document with ``docid``.

    Note that the terms in the document are stemmed and stop words may be removed according to your index
    settings. Also, requesting the term positions of a ``docid`` that does not exist in the index will return
    ``None`` (as opposed to an empty dictionary); this forces the caller to handle ``None`` explicitly and guards
    against silent errors.

    Parameters
    ----------
    docid : str
        Collection ``docid``.

    Returns
    -------
    Optional[Dict[str, list]]
        A dictionary with analyzed terms as keys and, as values, the list of positions at which each term
        occurs in the document; ``None`` when the underlying lookup returns no mapping.
    """
    java_term_position_map = self.object.getTermPositions(self.reader, JString(docid))
    if java_term_position_map is None:
        return None

    term_position_map = {}
    for term in java_term_position_map.keySet().toArray():
        # The map is a Java object, so each term is looked up again through a Java string key.
        term_position_map[term] = java_term_position_map.get(JString(term.encode('utf-8'))).toArray()
    return term_position_map

def doc(self, docid: str) -> Optional[Document]:
"""Return the :class:`Document` corresponding to ``docid``. Returns ``None`` if the ``docid`` does not exist
in the index.
Expand Down
24 changes: 24 additions & 0 deletions tests/test_index_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,30 @@ def test_doc_vector_matches_index(self):
# The tf values should match.
self.assertEqual(postings_list[i].tf, 8)

def test_term_position(self):
    # Fetch the term -> positions mapping for a known document and spot-check it.
    positions_by_term = self.index_reader.get_term_positions('CACM-3134')
    self.assertEqual(len(positions_by_term), 94)

    # Expected position lists for two analyzed terms of the document.
    expected = {
        'inform': [7,24,36,46,60,112,121,159],
        'retriev': [10,20,44,132,160,164,172],
    }
    for analyzed_term, positions in expected.items():
        self.assertEqual(positions_by_term[analyzed_term], positions)

def test_term_position_invalid(self):
    # A docid that is not in the index must yield None rather than a mapping.
    missing = self.index_reader.get_term_positions('foo')
    self.assertTrue(missing is None)

def test_term_position_matches_index(self):
    # The same positions must be reported by both the per-document term position mapping and the
    # postings list of the index.
    expected_positions = [7, 24, 36, 46, 60, 112, 121, 159]

    # From the term positions mapping, look up the position list stored under "inform"
    # (presumably the analyzed form of "information").
    term_positions = self.index_reader.get_term_positions('CACM-3134')
    self.assertEqual(term_positions['inform'], expected_positions)

    # Now look up the postings list for "information".
    term = 'information'
    postings_list = list(self.index_reader.get_postings_list(term))

    # Go through the postings and find the matching document.
    found = False
    for posting in postings_list:
        if self.index_reader.convert_internal_docid_to_collection_docid(posting.docid) == 'CACM-3134':
            found = True
            # The position list should match.
            self.assertEqual(posting.positions, expected_positions)

    # Without this check the test would pass vacuously if the document were missing from the postings.
    self.assertTrue(found)

def test_doc_invalid(self):
self.assertTrue(self.index_reader.doc('foo') is None)
self.assertTrue(self.index_reader.doc_contents('foo') is None)
Expand Down

0 comments on commit 49fd7cb

Please sign in to comment.