From c5ee9af442c500ec43fc28808903cfca2417ac12 Mon Sep 17 00:00:00 2001 From: Chris Kamphuis Date: Thu, 29 Aug 2019 15:31:00 +0200 Subject: [PATCH] Add date filter to background linking reranker (#786) --- .../index/generator/WapoGenerator.java | 3 +- .../lib/NewsBackgroundLinkingReranker.java | 30 ++++++++++++++----- .../java/io/anserini/search/SearchArgs.java | 6 +++- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/main/java/io/anserini/index/generator/WapoGenerator.java b/src/main/java/io/anserini/index/generator/WapoGenerator.java index aea88c89ce..8ad917049f 100644 --- a/src/main/java/io/anserini/index/generator/WapoGenerator.java +++ b/src/main/java/io/anserini/index/generator/WapoGenerator.java @@ -45,7 +45,7 @@ public class WapoGenerator extends LuceneDocumentGenerator CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet"); @@ -86,6 +86,7 @@ public Document createDocument(WashingtonPostCollection.Document wapoDoc) { // This is needed to break score ties by docid. doc.add(new SortedDocValuesField(FIELD_ID, new BytesRef(id))); doc.add(new LongPoint(WapoField.PUBLISHED_DATE.name, wapoDoc.getPublishDate())); + doc.add(new StoredField(WapoField.PUBLISHED_DATE.name, wapoDoc.getPublishDate())); wapoDoc.getAuthor().ifPresent(author -> { doc.add(new StringField(WapoField.AUTHOR.name, author, Field.Store.NO)); }); diff --git a/src/main/java/io/anserini/rerank/lib/NewsBackgroundLinkingReranker.java b/src/main/java/io/anserini/rerank/lib/NewsBackgroundLinkingReranker.java index a34c5d26d9..bce520aff1 100644 --- a/src/main/java/io/anserini/rerank/lib/NewsBackgroundLinkingReranker.java +++ b/src/main/java/io/anserini/rerank/lib/NewsBackgroundLinkingReranker.java @@ -34,6 +34,7 @@ import static io.anserini.index.generator.LuceneDocumentGenerator.FIELD_BODY; import static io.anserini.index.generator.LuceneDocumentGenerator.FIELD_ID; +import static io.anserini.index.generator.WapoGenerator.WapoField.PUBLISHED_DATE; /* * TREC News Track Background Linking task postprocessing. @@ -54,29 +55,44 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) { } // remove the duplicates: 1. the same doc with the query doc 2. duplicated docs in the results - Set duplicates = new HashSet<>(); + Set toRemove = new HashSet<>(); for (int i = 0; i < docs.documents.length; i++) { - if (duplicates.contains(i)) continue; - String docid = docs.documents[i].getField(FIELD_ID).stringValue(); + if (toRemove.contains(i)) continue; if (computeCosineSimilarity(queryTermsMap, docsVectorsMap.get(i)) >= 0.9) { - duplicates.add(i); + toRemove.add(i); continue; } for (int j = i+1; j < docs.documents.length; j++) { if (computeCosineSimilarity(docsVectorsMap.get(i), docsVectorsMap.get(j)) >= 0.9) { - duplicates.add(j); + toRemove.add(j); + } + } + } + + if (context.getSearchArgs().backgroundlinking_datefilter) { + try { + int luceneId = NewsBackgroundLinkingTopicReader.convertDocidToLuceneDocid(reader, queryDocId); + Document queryDoc = reader.document(luceneId); + long queryDocDate = Long.parseLong(queryDoc.getField(PUBLISHED_DATE.name).stringValue()); + for (int i = 0; i < docs.documents.length; i++) { + long date = Long.parseLong(docs.documents[i].getField(PUBLISHED_DATE.name).stringValue()); + if (date > queryDocDate) { + toRemove.add(i); + } } + } catch (Exception e) { + e.printStackTrace(); } } ScoredDocuments scoredDocs = new ScoredDocuments(); - int resSize = docs.documents.length - duplicates.size(); + int resSize = docs.documents.length - toRemove.size(); scoredDocs.documents = new Document[resSize]; scoredDocs.ids = new int[resSize]; scoredDocs.scores = new float[resSize]; int idx = 0; for (int i = 0; i < docs.documents.length; i++) { - if (!duplicates.contains(i)) { + if (!toRemove.contains(i)) { scoredDocs.documents[idx] = docs.documents[i]; scoredDocs.scores[idx] = docs.scores[i]; scoredDocs.ids[idx] = docs.ids[i]; diff --git a/src/main/java/io/anserini/search/SearchArgs.java b/src/main/java/io/anserini/search/SearchArgs.java index 69f87723f7..773451f060 100644 --- a/src/main/java/io/anserini/search/SearchArgs.java +++ b/src/main/java/io/anserini/search/SearchArgs.java @@ -66,7 +66,11 @@ public class SearchArgs { @Option(name = "-backgroundlinking.weighted", usage = "Boolean switch to construct boosted query for TREC News Track Background " + "Linking task. The terms scores are their tf-idf score from the query document") public boolean backgroundlinking_weighted = false; - + + @Option(name = "-backgroundlinking.datefilter", usage = "Boolean switch to filter out articles published after topic article " + + "for the TREC News Track Background Linking task.") + public boolean backgroundlinking_datefilter = false; + @Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter") public String stemmer = "porter";