Skip to content

Commit

Permalink
GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now returns top …
Browse files Browse the repository at this point in the history
…children instead of all children (#11764)
  • Loading branch information
gsmiller authored Sep 13, 2022
1 parent e491ef7 commit 4463a0b
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 1 deletion.
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ Other

======================== Lucene 9.5.0 =======================

API Changes
---------------------
* GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now properly returns "top" children instead
of all children. (Greg Miller)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
* Returns the counts for each given {@link FacetSet}
Expand Down Expand Up @@ -156,7 +157,45 @@ public FacetResult getAllChildren(String dim, String... path) throws IOException
/**
 * Returns at most {@code topN} children that have a non-zero count, ordered from the highest
 * count to the lowest. Children with equal counts are ordered by label, following the
 * tie-break implemented in {@code compare}.
 *
 * @param topN maximum number of children to return; checked by {@code validateTopN}
 * @param dim dimension name, passed through to the returned {@link FacetResult}
 * @param path path components, passed through to the returned {@link FacetResult}
 * @return a result carrying the selected label/value pairs, {@code totCount}, and the number
 *     of children that had a non-zero count (which may exceed the number of returned labels)
 */
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
  validateTopN(topN);

  // There can never be more results than there are count slots, so cap the queue size.
  topN = Math.min(topN, counts.length);

  // The queue is pre-filled with sentinel entries (empty label, zero count) so that entries can
  // be overwritten in place through top()/updateTop() instead of allocating one per candidate.
  PriorityQueue<Entry> pq =
      new PriorityQueue<>(topN, () -> new Entry("", 0)) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          return compare(a.count, b.count, a.label, b.label) < 0;
        }
      };

  int childCount = 0;
  Entry reuse = pq.top();
  for (int i = 0; i < counts.length; i++) {
    int count = counts[i];
    if (count > 0) {
      childCount++;
      String label = facetSetMatchers[i].label;
      // Only replace the current weakest entry when this candidate ranks above it.
      if (compare(reuse.count, count, reuse.label, label) < 0) {
        reuse.label = label;
        reuse.count = count;
        reuse = pq.updateTop();
      }
    }
  }

  // Pop off any sentinel values in the case that we had fewer child labels with non-zero
  // counts than the requested top-n:
  while (childCount < pq.size()) {
    pq.pop();
  }

  // Entries pop off weakest-first, so fill the result array from the back to get the
  // strongest entry at index 0.
  LabelAndValue[] labelValues = new LabelAndValue[Math.min(topN, childCount)];
  for (int i = pq.size() - 1; i >= 0; i--) {
    Entry e = pq.pop();
    labelValues[i] = new LabelAndValue(e.label, e.count);
  }

  return new FacetResult(dim, path, totCount, labelValues, childCount);
}

@Override
Expand All @@ -176,4 +215,22 @@ private static boolean areFacetSetMatcherDimensionsInconsistent(
return Arrays.stream(facetSetMatchers)
.anyMatch(facetSetMatcher -> facetSetMatcher.dims != dims);
}

/**
 * Ranks two (count, label) pairs. The counts decide the result whenever they differ; only when
 * they are equal are the labels consulted, and then in reverse order ({@code label2} against
 * {@code label1}), so the pair with the lexicographically smaller label compares as greater.
 *
 * @return a negative value, zero, or a positive value when the first pair ranks below, equal
 *     to, or above the second pair
 */
private static int compare(int count1, int count2, String label1, String label2) {
  if (count1 != count2) {
    // Counts settle the ordering on their own; labels are not looked at.
    return Integer.compare(count1, count2);
  }
  // Tie on count: reversed label comparison.
  return label2.compareTo(label1);
}

/**
 * Mutable label/count pair. Both fields are deliberately non-final: {@code getTopChildren}
 * overwrites them in place on an existing instance rather than allocating a new one.
 */
private static final class Entry {
  int count;
  String label;

  Entry(String initialLabel, int initialCount) {
    label = initialLabel;
    count = initialCount;
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,30 @@
*/
package org.apache.lucene.facet.facetset;

import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;

public class TestMatchingFacetSetsCounts extends FacetTestCase {
// Arbitrary ordinals standing in for vehicle manufacturers. testTopChildren uses each one as the
// second value of a LongFacetSet and as the second half of the matcher label.
private static final int FORD_ORD = 100;
private static final int TOYOTA_ORD = 101;
private static final int CHEVY_ORD = 102;
private static final int NISSAN_ORD = 103;
// All manufacturer ordinals, in the order testTopChildren iterates over them.
private static final int[] MANUFACTURER_ORDS = {FORD_ORD, TOYOTA_ORD, CHEVY_ORD, NISSAN_ORD};

public void testInvalidTopN() throws IOException {
Directory d = newDirectory();
Expand Down Expand Up @@ -87,4 +98,103 @@ public void testInconsistentNumOfIndexedDimensions() throws IOException {
r.close();
d.close();
}

/**
 * Indexes a random number of documents for every day-bin/manufacturer combination, then checks
 * that {@code getTopChildren(10, "field")} returns exactly the ten highest-count combinations in
 * the independently computed expected order.
 */
public void testTopChildren() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);

// As a test scenario, we're faceting on the number of vehicles produced per day/make
// combination over the past 30 days:
final int numBins = 30;

// One slot per (day bin, manufacturer) pair; slot index = bin * MANUFACTURER_ORDS.length + ord
// position, which is what the division/modulo decoding further down relies on.
final int[] expectedCounts = new int[numBins * MANUFACTURER_ORDS.length];
FacetSetMatcher[] facetSetMatchers = new FacetSetMatcher[numBins * MANUFACTURER_ORDS.length];

int totalDocs = 0;
int totalNonZeroBins = 0;
int index = 0;
for (int i = 0; i < numBins; i++) {
for (int ord : MANUFACTURER_ORDS) {
// Matcher label has the form "<bin>:<ord>" and matches exactly the (bin, ord) facet set.
facetSetMatchers[index] =
new ExactFacetSetMatcher(
String.format(Locale.ROOT, "%d:%d", i, ord), new LongFacetSet(i, ord));

// Between 0 and 100 documents per combination, so some combinations may have no documents.
int carsManufactured = RandomNumbers.randomIntBetween(random(), 0, 100);
for (int k = 0; k < carsManufactured; k++) {
// Create a document for every vehicle produced:
Document doc = new Document();
doc.add(FacetSetsField.create("field", new LongFacetSet(i, ord)));
w.addDocument(doc);
}

// Track how many combinations received at least one document, and the overall document total;
// both are passed to assertFacetResult at the end.
if (carsManufactured > 0) {
totalNonZeroBins++;
}
totalDocs += carsManufactured;
expectedCounts[index] = carsManufactured;
index++;
}
}

IndexReader r = w.getReader();
w.close();

IndexSearcher s = newSearcher(r);
FacetsCollector fc = s.search(new MatchAllDocsQuery(), new FacetsCollectorManager());

Facets facets =
new MatchingFacetSetsCounts("field", fc, FacetSetDecoder::decodeLongs, facetSetMatchers);

// Sort by count (high-to-low) and tie-break on label, same as in
// MatchingFacetSetsCounts#getTopChildren:
// originalIndexes is permuted in lock-step with expectedCounts so that each sorted position can
// still be mapped back to its (day bin, manufacturer) pair.
final int[] originalIndexes = new int[expectedCounts.length];
for (int i = 0; i < originalIndexes.length; i++) {
originalIndexes[i] = i;
}
new InPlaceMergeSorter() {
@Override
protected int compare(int i, int j) {
// Arguments are swapped (j before i) to get descending order by count.
int cmp = Integer.compare(expectedCounts[j], expectedCounts[i]);
if (cmp == 0) {
// Equal counts: rebuild both labels from the original slot indexes and compare them in
// ascending order.
int dayBinI = originalIndexes[i] / MANUFACTURER_ORDS.length;
int dayBinJ = originalIndexes[j] / MANUFACTURER_ORDS.length;
int ordIndexI = originalIndexes[i] % MANUFACTURER_ORDS.length;
int ordIndexJ = originalIndexes[j] % MANUFACTURER_ORDS.length;
String labelI =
String.format(Locale.ROOT, "%d:%d", dayBinI, MANUFACTURER_ORDS[ordIndexI]);
String labelJ =
String.format(Locale.ROOT, "%d:%d", dayBinJ, MANUFACTURER_ORDS[ordIndexJ]);
cmp = new BytesRef(labelI).compareTo(new BytesRef(labelJ));
}
return cmp;
}

@Override
protected void swap(int i, int j) {
// Swap both parallel arrays together so counts stay aligned with their original indexes.
int tmp = expectedCounts[i];
expectedCounts[i] = expectedCounts[j];
expectedCounts[j] = tmp;
tmp = originalIndexes[i];
originalIndexes[i] = originalIndexes[j];
originalIndexes[j] = tmp;
}
}.sort(0, expectedCounts.length);

// Build the expected result from the first topN sorted slots.
// NOTE(review): this always expects exactly topN entries, i.e. it assumes at least topN
// combinations received a non-zero count — confirm this is acceptable for a randomized test.
final int topN = 10;
final LabelAndValue[] expected = new LabelAndValue[topN];
for (int i = 0; i < topN; i++) {
int count = expectedCounts[i];
int dayBin = originalIndexes[i] / MANUFACTURER_ORDS.length;
int ordIndex = originalIndexes[i] % MANUFACTURER_ORDS.length;
expected[i] =
new LabelAndValue(
String.format(Locale.ROOT, "%d:%d", dayBin, MANUFACTURER_ORDS[ordIndex]), count);
}

final FacetResult result = facets.getTopChildren(topN, "field");
// Presumably totalNonZeroBins is checked as the child count and totalDocs as the aggregate
// value — verify against assertFacetResult's signature.
assertFacetResult(result, "field", new String[0], totalNonZeroBins, totalDocs, expected);

r.close();
d.close();
}
}

0 comments on commit 4463a0b

Please sign in to comment.