Skip to content

Commit

Permalink
Expose document-level term positions (#1337)
Browse files Browse the repository at this point in the history
  • Loading branch information
nsndimt authored Aug 4, 2020
1 parent 62fcb52 commit bd8fff8
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
42 changes: 42 additions & 0 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,48 @@ public static Map<String, Long> getDocumentVector(IndexReader reader, String doc
return docVector;
}

/**
 * Returns the term position mapping for a particular document. Note that this method explicitly returns
 * {@code null} if the document does not exist (as opposed to an empty map), so that the caller is explicitly forced
 * to handle this case.
 *
 * <p>Each key of the returned map is a term read from the document's term vector for the
 * {@link IndexArgs#CONTENTS} field; each value lists the positions at which that term occurs in the document,
 * in the order reported by Lucene's {@link PostingsEnum#nextPosition()}.</p>
 *
 * @param reader index reader
 * @param docid collection docid
 * @return term position mapping for a particular document or {@code null} if document does not exist.
 * @throws IOException if error encountered during query
 * @throws NotStoredException if the term vector, or the positions within the term vector, are not stored
 */
public static Map<String, List<Integer>> getTermPositions(IndexReader reader, String docid) throws IOException, NotStoredException {
  int ldocid = convertDocidToLuceneDocid(reader, docid);
  if (ldocid == -1) {
    // Unknown collection docid: signal with null rather than an empty map.
    return null;
  }
  Terms terms = reader.getTermVector(ldocid, IndexArgs.CONTENTS);
  if (terms == null) {
    throw new NotStoredException("Document vector not stored!");
  }
  // Without stored positions, nextPosition() would only yield -1 placeholders; fail loudly instead of
  // handing back a map full of meaningless values.
  if (!terms.hasPositions()) {
    throw new NotStoredException("Term positions not stored!");
  }
  TermsEnum termIter = terms.iterator();
  if (termIter == null) {
    throw new NotStoredException("Document vector not stored!");
  }

  Map<String, List<Integer>> termPosition = new HashMap<>();
  // Reused across terms so Lucene can recycle the enum instead of allocating a new one per term.
  PostingsEnum positionIter = null;

  while (termIter.next() != null) {
    positionIter = termIter.postings(positionIter, PostingsEnum.POSITIONS);
    // Advance onto the document before reading its frequency and positions.
    positionIter.nextDoc();
    // Use the per-document frequency from the postings themselves: it is exactly the number of
    // nextPosition() calls that are valid for the current document.
    int termFreq = positionIter.freq();
    List<Integer> positions = new ArrayList<>(termFreq);
    for (int i = 0; i < termFreq; i++) {
      positions.add(positionIter.nextPosition());
    }
    termPosition.put(termIter.term().utf8ToString(), positions);
  }

  return termPosition;
}

/**
* Returns the Lucene {@link Document} based on a collection docid. The method is named to be consistent with Lucene's
* {@link IndexReader#document(int)}, contra Java's standard method naming conventions.
Expand Down
32 changes: 32 additions & 0 deletions src/test/java/io/anserini/index/IndexReaderUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,38 @@ public void testDocumentVector() throws Exception {
dir.close();
}

/**
 * Checks that {@code IndexReaderUtils.getTermPositions} reports the expected positions for selected terms
 * of the three indexed test documents, and that it returns {@code null} for an unknown docid.
 */
@Test
public void testTermPositions() throws Exception {
  Directory dir = FSDirectory.open(tempDir1);
  IndexReader reader = DirectoryReader.open(dir);

  Map<String, List<Integer>> termPositions;

  // doc1: several terms occur more than once, so both first and later positions are checked.
  termPositions = IndexReaderUtils.getTermPositions(reader, "doc1");
  assertEquals(Integer.valueOf(0), termPositions.get("here").get(0));
  assertEquals(Integer.valueOf(4), termPositions.get("here").get(1));
  assertEquals(Integer.valueOf(2), termPositions.get("some").get(0));
  assertEquals(Integer.valueOf(6), termPositions.get("some").get(1));
  assertEquals(Integer.valueOf(3), termPositions.get("text").get(0));
  assertEquals(Integer.valueOf(8), termPositions.get("text").get(1));
  assertEquals(Integer.valueOf(7), termPositions.get("more").get(0));
  assertEquals(Integer.valueOf(9), termPositions.get("citi").get(0));

  termPositions = IndexReaderUtils.getTermPositions(reader, "doc2");
  assertEquals(Integer.valueOf(0), termPositions.get("more").get(0));
  assertEquals(Integer.valueOf(1), termPositions.get("text").get(0));

  termPositions = IndexReaderUtils.getTermPositions(reader, "doc3");
  assertEquals(Integer.valueOf(0), termPositions.get("here").get(0));
  assertEquals(Integer.valueOf(3), termPositions.get("test").get(0));

  // Invalid docid: the method under test (not getDocumentVector) must return null.
  assertTrue(IndexReaderUtils.getTermPositions(reader, "foo") == null);

  reader.close();
  dir.close();
}

@Test
public void testGetDocumentRaw() throws Exception {
Directory dir = FSDirectory.open(tempDir1);
Expand Down

0 comments on commit bd8fff8

Please sign in to comment.