diff --git a/docs/usage-indexreader.md b/docs/usage-indexreader.md
index 4f5bc0c35..2d96b1eff 100644
--- a/docs/usage-indexreader.md
+++ b/docs/usage-indexreader.md
@@ -72,6 +72,25 @@ print(doc_vector)
 ```
 
 The result is a dictionary where the keys are the analyzed terms and the values are the term frequencies.
+
+If you want to know the positions of each term in the document, you can use `get_term_positions`:
+```python
+term_positions = index_reader.get_term_positions('FBIS4-67701')
+print(term_positions)
+```
+The result is a dictionary where the keys are the analyzed terms and the values are the positions at which each term occurs in the document.
+
+If you want to reconstruct the document using the position information, you can do this:
+```python
+doc = []
+for term, positions in term_positions.items():
+    for p in positions:
+        doc.append((term, p))
+doc = ' '.join([t for t, p in sorted(doc, key=lambda x: x[1])])
+print(doc)
+```
+The reconstructed document contains analyzed terms, whereas [doc.contents()](https://github.com/castorini/pyserini/tree/master#how-do-i-fetch-a-document) contains unanalyzed terms.
+
 To compute the tf-idf representation of a document, do something like this:
 
 ```python
diff --git a/pyserini/index/_base.py b/pyserini/index/_base.py
index 07887d344..02ffff64f 100644
--- a/pyserini/index/_base.py
+++ b/pyserini/index/_base.py
@@ -260,6 +260,31 @@ def get_document_vector(self, docid: str) -> Optional[Dict[str, int]]:
             doc_vector_dict[term] = doc_vector_map.get(JString(term.encode('utf-8')))
         return doc_vector_dict
 
+    def get_term_positions(self, docid: str) -> Optional[Dict[str, list]]:
+        """Return the term position mapping of the document with ``docid``. Note that the terms in the document are
+        stemmed and stop words may be removed according to your index settings. Also, requesting the term positions of
+        a ``docid`` that does not exist in the index will return ``None`` (as opposed to an empty dictionary); this
+        forces the caller to handle ``None`` explicitly and guards against silent errors.
+
+        Parameters
+        ----------
+        docid : str
+            Collection ``docid``.
+
+        Returns
+        -------
+        Optional[Dict[str, list]]
+            A dictionary with analyzed terms as keys and the list of positions of each term in the document as values.
+        """
+        java_term_position_map = self.object.getTermPositions(self.reader, JString(docid))
+        if java_term_position_map is None:
+            return None
+        term_position_map = {}
+        # Look up each term via a JString key (mirrors get_document_vector) and store toArray() as its positions.
+        for term in java_term_position_map.keySet().toArray():
+            term_position_map[term] = java_term_position_map.get(JString(term.encode('utf-8'))).toArray()
+        return term_position_map
+
     def doc(self, docid: str) -> Optional[Document]:
         """Return the :class:`Document` corresponding to ``docid``. Returns ``None`` if the ``docid`` does not exist
         in the index.
diff --git a/tests/test_index_reader.py b/tests/test_index_reader.py
index c7c041738..85cf9153e 100644
--- a/tests/test_index_reader.py
+++ b/tests/test_index_reader.py
@@ -183,6 +183,30 @@ def test_doc_vector_matches_index(self):
                 # The tf values should match.
                 self.assertEqual(postings_list[i].tf, 8)
 
+    def test_term_position(self):
+        term_positions = self.index_reader.get_term_positions('CACM-3134')
+        self.assertEqual(len(term_positions), 94)
+        self.assertEqual(term_positions['inform'], [7,24,36,46,60,112,121,159])
+        self.assertEqual(term_positions['retriev'], [10,20,44,132,160,164,172])
+
+    def test_term_position_invalid(self):
+        self.assertTrue(self.index_reader.get_term_positions('foo') is None)
+
+    def test_term_position_matches_index(self):
+        # From the term positions mapping, look up the position list of "inform" (analyzed form of "information").
+        term_positions = self.index_reader.get_term_positions('CACM-3134')
+        self.assertEqual(term_positions['inform'], [7,24,36,46,60,112,121,159])
+
+        # Now look up the postings list for "information".
+        term = 'information'
+        postings_list = list(self.index_reader.get_postings_list(term))
+
+        # Collect the postings of the matching document so the check cannot pass vacuously if none is found.
+        matches = [p for p in postings_list
+                   if self.index_reader.convert_internal_docid_to_collection_docid(p.docid) == 'CACM-3134']
+        self.assertEqual(len(matches), 1)
+        self.assertEqual(matches[0].positions, [7,24,36,46,60,112,121,159])
+
     def test_doc_invalid(self):
         self.assertTrue(self.index_reader.doc('foo') is None)
         self.assertTrue(self.index_reader.doc_contents('foo') is None)