From 9c4165566c661a42f978ffe64efc859f6313de75 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 14:49:36 +0100 Subject: [PATCH 01/12] Implement boosting in "new" search --- .../main/groovy/whelk/search2/EsBoost.java | 358 ++++++++++++++++++ .../main/groovy/whelk/search2/QueryUtil.java | 5 +- .../whelk/search2/querytree/QueryTree.java | 16 +- 3 files changed, 370 insertions(+), 9 deletions(-) create mode 100644 whelk-core/src/main/groovy/whelk/search2/EsBoost.java diff --git a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java new file mode 100644 index 0000000000..b0aac60f21 --- /dev/null +++ b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java @@ -0,0 +1,358 @@ +package whelk.search2; + +import whelk.JsonLd; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static whelk.JsonLd.ALTERNATE_PROPERTIES; +import static whelk.JsonLd.ID_KEY; +import static whelk.JsonLd.RANGE; +import static whelk.JsonLd.SEARCH_KEY; +import static whelk.JsonLd.SUB_PROPERTY_OF; +import static whelk.JsonLd.TYPE_KEY; +import static whelk.JsonLd.asList; +import static whelk.util.DocumentUtil.getAtPath; + +public class EsBoost { + private static final int CHIP_BOOST = 200; + private static final int STR_BOOST = 100; + private static final int CARD_BOOST = 10; + + JsonLd jsonLd; + Chips chipLenses; + Cards cardLenses; + + private final Map> boostFieldsByType = new HashMap<>(); + + EsBoost(JsonLd jsonLd) { + this.jsonLd = jsonLd; + this.chipLenses = collectChipLenses(); + this.cardLenses = collectCardLenses(); + } + + public List getBoostFields(Collection types) { + List sortedTypes = types.stream().sorted().toList(); + + String typeKey = String.join(",", sortedTypes); + + List boostFields = boostFieldsByType.getOrDefault(typeKey, computeBoostFields(sortedTypes)); + boostFieldsByType.put(typeKey, boostFields); + + return boostFields; + } + + private List computeBoostFields(List types) { + /* FIXME: + lensBoost.computeBoostFieldsFromLenses does not give a good result for Concept. + Use hand-tuned boosting instead until we improve boosting/ranking in general. See LXL-3399 for details. + */ + List conceptTypes = new ArrayList<>(); + List otherTypes = new ArrayList<>(); + for (String s : types) { + if (jsonLd.isSubClassOf(s, "Concept")) { + conceptTypes.add(s); + } else { + otherTypes.add(s); + } + } + + Map boostFields; + + if (conceptTypes.isEmpty()) { + boostFields = computeBoostFieldsFromLenses(otherTypes); + } else { + boostFields = CONCEPT_BOOST.stream() + .map(s -> s.split("\\^")) + .collect(Collectors.toMap(parts -> parts[0], parts -> Integer.parseInt(parts[1]))); + + computeBoostFieldsFromLenses(otherTypes).forEach(boostFields::putIfAbsent); + } + + return boostFields.entrySet() + .stream() + .sorted(Map.Entry.comparingByKey()) + .sorted(Map.Entry.comparingByValue(Collections.reverseOrder())) + .map(e -> e.getKey() + "^" + e.getValue()) + .toList(); + } + + private Map computeBoostFieldsFromLenses(List types) { + Map boostFields = new HashMap<>(); + + boostFields.put(SEARCH_KEY, STR_BOOST); + + var baseTypes = List.of("Identity", "Instance", "Item"); + + collectLenses().forEach(lensGroup -> + lensGroup.collectBoostFields(types, baseTypes) + .forEach(boostFields::putIfAbsent) + ); + + return boostFields; + } + + private List collectLenses() { + List lensGroups = new ArrayList<>(); + lensGroups.add(chipLenses); + lensGroups.add(cardLenses); + return lensGroups; + } + + private Chips collectChipLenses() { + List chips = new ArrayList<>(); + + ((Map) getAtPath(jsonLd.displayData, List.of("lensGroups", "chips", "lenses"), Collections.emptyMap())) + .forEach((type, lens) -> chips.add(new EsBoost.Chip((String) type, (Map) lens))); + + return new Chips(chips); + } + + private Cards collectCardLenses() { + List cards = new ArrayList<>(); + + ((Map) getAtPath(jsonLd.displayData, List.of("lensGroups", "cards", "lenses"), Collections.emptyMap())) + .forEach((type, lens) -> cards.add(new EsBoost.Card((String) type, (Map) lens))); + + return new Cards(cards); + } + + private sealed abstract class LensGroup permits Cards, Chips { + abstract List lenses(); + + abstract Lens newLens(String type, Map lens); + + Map collectBoostFields(List types, List baseTypes) { + Map boostFields = new HashMap<>(); + getLensesForTypes(types, baseTypes) + .forEach(lens -> boostFields.putAll(lens.collectBoostFields())); + return boostFields; + } + + Map getLensForType(String type) { + var lensMap = Map.of("lenses", lenses().stream().collect(Collectors.toMap(Lens::type, Lens::lens))); + return jsonLd.getLensFor(Map.of(TYPE_KEY, type), lensMap); + } + + private List getLensesForTypes(List types, List baseTypes) { + return !types.isEmpty() ? getLensesForTypes(types) : getLensesForBaseTypes(baseTypes); + } + + private List getLensesForTypes(List types) { + List lenses = new ArrayList<>(); + types.forEach(t -> + Optional.ofNullable(getLensForType(t)) + .map(lens -> newLens(t, lens)) + .ifPresent(lenses::add) + ); + return lenses; + } + + private List getLensesForBaseTypes(List baseTypes) { + return lenses().stream() + .filter(c -> baseTypes.stream().anyMatch(c::partiallyAppliesTo)) + .toList(); + } + } + + private final class Chips extends LensGroup { + List chips; + + Chips(List chips) { + this.chips = chips; + } + + @Override + List lenses() { + return chips; + } + + @Override + Lens newLens(String type, Map lens) { + return new Chip(type, lens); + } + } + + private final class Cards extends LensGroup { + List cards; + + Cards(List cards) { + this.cards = cards; + } + + @Override + List lenses() { + return cards; + } + + @Override + Lens newLens(String type, Map lens) { + return new Card(type, lens); + } + } + + private sealed abstract class Lens permits EsBoost.Card, Chip { + abstract String type(); + + abstract Map lens(); + + abstract Map collectBoostFields(); + + boolean partiallyAppliesTo(String baseType) { + return jsonLd.isSubClassOf((String) lens().get("classLensDomain"), baseType); + } + } + + private final class Chip extends Lens { + String type; + Map lens; + + Chip(String type, Map lens) { + this.type = type; + this.lens = lens; + } + + @Override + String type() { + return type; + } + + @Override + Map lens() { + return lens; + } + + @Override + Map collectBoostFields() { + return EsBoost.this.collectBoostFields(lens, CHIP_BOOST); + } + } + + private final class Card extends Lens { + String type; + Map lens; + + Card(String type, Map lens) { + this.type = type; + this.lens = lens; + } + + @Override + String type() { + return type; + } + + @Override + Map lens() { + return lens; + } + + @Override + Map collectBoostFields() { + Map boostFields = new HashMap<>(); + getPropertiesToShow(lens).stream() + .map(EsBoost.this::computeCardPropertyBoosts) + .forEach(boostFields::putAll); + return boostFields; + } + } + + private Map collectBoostFields(Map lens, int boost) { + Map boostFields = new HashMap<>(); + + for (String key : getPropertiesToShow(lens)) { + Map term = jsonLd.vocabIndex.get(key); + if (term != null) { + String termType = (String) term.get(TYPE_KEY); + if ("ObjectProperty".equals(termType)) { + key = key + "." + SEARCH_KEY; + } else if (jsonLd.isLangContainer(jsonLd.context.get(key))) { + key = key + "." + jsonLd.locales.getFirst(); + } + } + boostFields.put(key, boost); + } + + return boostFields; + } + + private static List getPropertiesToShow(Map lens) { + var properties = new LinkedHashSet(); + + for (var dfn : (List) lens.get("showProperties")) { + if (dfn instanceof String) { + properties.add((String) dfn); + } else if (dfn instanceof Map) { + for (var alt : asList(((Map) dfn).get(ALTERNATE_PROPERTIES))) { + if (alt instanceof String) { + properties.add((String) alt); + } else if (alt instanceof Map) { + var subPropertyOf = ((Map) alt).get(SUB_PROPERTY_OF); + if (subPropertyOf != null) { + properties.add((String) subPropertyOf); + } + } + } + } + } + + return properties.stream().toList(); + } + + private Map computeCardPropertyBoosts(String prop) { + Map boostFields = new HashMap<>(); + + Map dfn = jsonLd.vocabIndex.get(prop); + + // Follow the object property range to append chip properties to the boosted path. + if (dfn != null && "ObjectProperty".equals(dfn.get(TYPE_KEY))) { + Optional rangeKey = Optional.ofNullable(dfn.get(RANGE)) + .map(r -> r instanceof List ? ((List) r).getFirst() : r) + .map(Map.class::cast) + .map(r -> (String) r.get(ID_KEY)) + .map(jsonLd::toTermKey); + + if (rangeKey.isPresent() && jsonLd.isSubClassOf(rangeKey.get(), "QualifiedRole")) { + var rangeChipLens = chipLenses.getLensForType(rangeKey.get()); + collectBoostFields(rangeChipLens, CARD_BOOST).forEach((k, v) -> boostFields.put(prop + "." + k, v)); + } else { + boostFields.put(prop + "." + SEARCH_KEY, CARD_BOOST); + } + } else if (jsonLd.isLangContainer(jsonLd.context.get(prop))) { + boostFields.put(prop + "." + jsonLd.locales.getFirst(), CARD_BOOST); + } + + return boostFields; + } + + private static final List CONCEPT_BOOST = List.of( + "prefLabel^1500", + "prefLabelByLang.sv^1500", + "label^500", + "labelByLang.sv^500", + "code^200", + "termComponentList._str.exact^125", + "termComponentList._str^75", + "altLabel^150", + "altLabelByLang.sv^150", + "hasVariant.prefLabel.exact^150", + "_str.exact^100", + "inScheme._str.exact^100", + "inScheme._str^100", + "inCollection._str.exact^10", + "broader._str.exact^10", + "exactMatch._str.exact^10", + "closeMatch._str.exact^10", + "broadMatch._str.exact^10", + "related._str.exact^10", + "scopeNote^10", + "keyword._str.exact^10" + ); +} diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java b/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java index ea5fac754c..1bfadf19b6 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryUtil.java @@ -4,7 +4,6 @@ import com.google.common.net.UrlEscapers; import whelk.JsonLd; import whelk.Whelk; -import whelk.search.ESQueryLensBoost; import whelk.search2.querytree.QueryTree; import whelk.util.DocumentUtil; @@ -25,12 +24,12 @@ public class QueryUtil { private final Whelk whelk; public final EsMappings esMappings; - public final ESQueryLensBoost lensBoost; + public final EsBoost esBoost; public QueryUtil(Whelk whelk) { this.whelk = whelk; this.esMappings = new EsMappings(whelk.elastic != null ? whelk.elastic.getMappings() : Collections.emptyMap()); - this.lensBoost = new ESQueryLensBoost(whelk.getJsonld()); + this.esBoost = new EsBoost(whelk.getJsonld()); } public Map query(Map queryDsl) { diff --git a/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java b/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java index 24f2037648..6219ddb420 100644 --- a/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java +++ b/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java @@ -39,10 +39,14 @@ public QueryTree(Node tree) { } public Map toEs(QueryUtil queryUtil, Disambiguate disambiguate) { - return (isFiltered() ? filtered.tree : tree) - .expand(disambiguate, getOutsetType()) + List boostFields = queryUtil.esBoost.getBoostFields(getFiltered().collectGivenTypes()); + return expand(disambiguate) .insertNested(queryUtil::getNestedPath) - .toEs(queryUtil.lensBoost.computeBoostFieldsFromLenses(new String[0])); // TODO: Implement boosting + .toEs(boostFields); + } + + private Node expand(Disambiguate disambiguate) { + return getFiltered().tree.expand(disambiguate, getOutsetType()); } public Map toSearchMapping(Map nonQueryParams) { @@ -62,7 +66,7 @@ public OutsetType getOutsetType() { } public void setOutsetType(Disambiguate disambiguate) { - this.outsetType = disambiguate.decideOutset(isFiltered() ? filtered : this); + this.outsetType = disambiguate.decideOutset(getFiltered()); } /** @@ -394,8 +398,8 @@ public void addFilters(QueryParams queryParams, AppParams appParams) { this.filtered = new QueryTree(newTree); } - private boolean isFiltered() { - return filtered != null; + private QueryTree getFiltered() { + return filtered != null ? filtered : this; } private List getFilters(QueryParams queryParams, AppParams appParams) { From 032758fbc356a19442c997df23ae5a2625d0d0ca Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 14:59:38 +0100 Subject: [PATCH 02/12] Don't reshape Elastic result inside query method --- .../whelk/component/ElasticSearch.groovy | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy index 0e6811f227..80b3269b20 100644 --- a/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy +++ b/whelk-core/src/main/groovy/whelk/component/ElasticSearch.groovy @@ -580,18 +580,13 @@ class ElasticSearch { } Map query(Map jsonDsl) { - return performQuery( - jsonDsl, - getQueryUrl(), - { def d = it."_source"; d."_id" = it."_id"; return d } - ) + return performQuery(jsonDsl, getQueryUrl()) } Map queryIds(Map jsonDsl) { return performQuery( jsonDsl, - getQueryUrl(['took','hits.total','hits.hits._id']), - { it."_id" } + getQueryUrl(['took','hits.total','hits.hits._id']) ) } @@ -629,7 +624,7 @@ class ElasticSearch { return super.hashCode() } - private Map performQuery(Map jsonDsl, String queryUrl, Closure hitCollector) { + private Map performQuery(Map jsonDsl, String queryUrl) { try { def start = System.currentTimeMillis() String responseBody = client.performRequest('POST', @@ -643,17 +638,7 @@ class ElasticSearch { log.info("ES query took ${duration} (${responseMap.took} server-side)") } - def results = [:] - - results.startIndex = jsonDsl.from - results.totalHits = responseMap.hits.total.value - results.items = responseMap.hits.hits.collect(hitCollector) - results.aggregations = responseMap.aggregations - // Spell checking - if (responseMap.suggest?.simple_phrase) { - results.spell = responseMap.suggest.simple_phrase[0].options - } - return results + return responseMap } catch (Exception e) { if (isBadRequest(e)) { From bbb3fea870a7e89cc974dac9ad5445f1d735c2af Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 15:11:02 +0100 Subject: [PATCH 03/12] Enable optional ES score debug info in "old" API (+ unintentional formatting) --- .../main/groovy/whelk/search/ESQuery.groovy | 336 +++++++++--------- 1 file changed, 177 insertions(+), 159 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy index 43dcefe7cf..7f6d7415df 100644 --- a/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy +++ b/whelk-core/src/main/groovy/whelk/search/ESQuery.groovy @@ -32,7 +32,7 @@ class ESQuery { private static final int DEFAULT_PAGE_SIZE = 50 private static final List RESERVED_PARAMS = [ - 'q', 'o', '_limit', '_offset', '_sort', '_statsrepr', '_site_base_uri', '_debug', '_boost', '_lens', '_stats', '_suggest', '_site', '_spell' + 'q', 'o', '_limit', '_offset', '_sort', '_statsrepr', '_site_base_uri', '_debug', '_boost', '_lens', '_stats', '_suggest', '_site', '_spell' ] public static final String AND_PREFIX = 'and-' public static final String AND_MATCHES_PREFIX = 'and-matches-' @@ -40,7 +40,7 @@ class ESQuery { private static final String NOT_PREFIX = 'not-' private static final String EXISTS_PREFIX = 'exists-' - private static final List QUERY_RANGE_PREFIXES = [AND_MATCHES_PREFIX] + RangeParameterPrefix.values().collect{ it.prefix } + private static final List QUERY_RANGE_PREFIXES = [AND_MATCHES_PREFIX] + RangeParameterPrefix.values().collect { it.prefix } // Prefixes are matched in this order so AND_MATCHES_PREFIX must be before AND_PREFIX. private static final List QUERY_PREFIXES = QUERY_RANGE_PREFIXES + [AND_PREFIX, OR_PREFIX, NOT_PREFIX, EXISTS_PREFIX] @@ -54,12 +54,12 @@ class ESQuery { private static final Map recordsOverCacheRecordsBoost = [ 'bool': ['should': [ ['constant_score': [ - 'filter': [ 'term': [ (JsonLd.RECORD_KEY + '.' + JsonLd.TYPE_KEY) : JsonLd.RECORD_TYPE ]], - 'boost': 1000.0 + 'filter': ['term': [(JsonLd.RECORD_KEY + '.' + JsonLd.TYPE_KEY): JsonLd.RECORD_TYPE]], + 'boost' : 1000.0 ]], ['constant_score': [ - 'filter': [ 'term': [ (JsonLd.RECORD_KEY + '.' + JsonLd.TYPE_KEY) : JsonLd.CACHE_RECORD_TYPE ]], - 'boost': 1.0 + 'filter': ['term': [(JsonLd.RECORD_KEY + '.' + JsonLd.TYPE_KEY): JsonLd.CACHE_RECORD_TYPE]], + 'boost' : 1.0 ]] ]] ] @@ -86,7 +86,7 @@ class ESQuery { void initFieldMappings(Whelk whelk) { if (whelk.elastic) { Map mappings = whelk.elastic.getMappings() - this.keywordFields = getKeywordFields(mappings) + this.keywordFields = getKeywordFields(mappings) this.dateFields = getFieldsOfType('date', mappings) this.nestedFields = getFieldsOfType('nested', mappings) this.nestedNotInParentFields = nestedFields - getFieldsWithSetting('include_in_parent', true, mappings) @@ -102,7 +102,7 @@ class ESQuery { this.nestedFields = Collections.emptySet() } } - + void setKeywords(Set keywordFields) { // NOTE: For unit tests only! this.keywordFields = keywordFields @@ -111,21 +111,41 @@ class ESQuery { @CompileStatic(TypeCheckingMode.SKIP) Map doQuery(Map queryParameters, String suggest = null, String spell = null) { Map esQuery = getESQuery(queryParameters, suggest, spell) - Map esResponse = hideKeywordFields(moveAggregationsToTopLevel(whelk.elastic.query(esQuery))) - if ('esQuery' in queryParameters.get('_debug')) { - esResponse._debug = [esQuery: esQuery] - } - return esResponse + Map esResponse = whelk.elastic.query(esQuery) + return collectQueryResults(esResponse, esQuery, queryParameters, { def d = it."_source"; d."_id" = it."_id"; return d }) } @CompileStatic(TypeCheckingMode.SKIP) Map doQueryIds(Map queryParameters) { Map esQuery = getESQuery(queryParameters) - Map esResponse = hideKeywordFields(moveAggregationsToTopLevel(whelk.elastic.queryIds(esQuery))) + Map esResponse = whelk.elastic.query(esQuery) + return collectQueryResults(esResponse, esQuery, queryParameters, { it."_id" }) + } + + private Map collectQueryResults(Map esResponse, + Map esQuery, + Map queryParameters, + Closure hitCollector) { + def results = [:] + + results['startIndex'] = esQuery['from'] + results['totalHits'] = esResponse['hits']['total']['value'] + results['items'] = esResponse['hits']['hits'].collect(hitCollector) + results['aggregations'] = esResponse['aggregations'] + // Spell checking + if (esResponse['suggest']?['simple_phrase']) { + results['spell'] = ((List) esResponse['suggest']['simple_phrase'])[0]['options'] + } + if ('esQuery' in queryParameters.get('_debug')) { - esResponse._debug = [esQuery: esQuery] + results['_debug'] = [esQuery: esQuery] } - return esResponse + if ('esScore' in queryParameters.get('_debug')) { + results['_debug'] = results['_debug'] ?: [:] + results['_debug']['esScore'] = esResponse['hits']['hits'].collect { ((Map) it).subMap(['_id', '_score', '_explanation']) } + } + + return hideKeywordFields(moveAggregationsToTopLevel(results)) } @CompileStatic(TypeCheckingMode.SKIP) @@ -186,11 +206,11 @@ class ESQuery { } Map simpleQuery = [ - (queryMode) : [ - 'query': q, - 'default_operator': 'AND', - 'analyze_wildcard' : true - ] + (queryMode): [ + 'query' : q, + 'default_operator': 'AND', + 'analyze_wildcard': true + ] ] // In case of suggest/autocomplete search, target a specific field with a specific query type @@ -209,75 +229,75 @@ class ESQuery { } Map boostedExact = [ - (queryMode): [ - 'query': q, - 'default_operator': 'AND', - 'fields': exactFields, - 'analyze_wildcard' : true - ] + (queryMode): [ + 'query' : q, + 'default_operator': 'AND', + 'fields' : exactFields, + 'analyze_wildcard': true + ] ] Map boostedSoft = [ - (queryMode) : [ - 'query': q, - 'default_operator': 'AND', - 'fields': softFields, - 'quote_field_suffix': ".exact", - 'analyze_wildcard' : true - ] + (queryMode): [ + 'query' : q, + 'default_operator' : 'AND', + 'fields' : softFields, + 'quote_field_suffix': ".exact", + 'analyze_wildcard' : true + ] ] queryClauses = ['bool': - ['must': [ - ['bool': [ 'should': [ - boostedExact, - boostedSoft, - simpleQuery]]], - recordsOverCacheRecordsBoost - ] - ]] + ['must': [ + ['bool': ['should': [ + boostedExact, + boostedSoft, + simpleQuery]]], + recordsOverCacheRecordsBoost + ] + ]] } Map query if (suggest) { query = [ - 'query': [ - 'bool': [ - 'must': [ - 'multi_match': [ - 'query': q, - 'type': 'bool_prefix', - 'fields': [ - "_sortKeyByLang.${suggest}.suggest".toString(), - "_sortKeyByLang.${suggest}.suggest._2gram".toString(), - "_sortKeyByLang.${suggest}.suggest._3gram".toString() - ] + 'query': [ + 'bool': [ + 'must' : [ + 'multi_match': [ + 'query' : q, + 'type' : 'bool_prefix', + 'fields': [ + "_sortKeyByLang.${suggest}.suggest".toString(), + "_sortKeyByLang.${suggest}.suggest._2gram".toString(), + "_sortKeyByLang.${suggest}.suggest._3gram".toString() + ] + ] + ], + 'should': [ + 'prefix': [ + ("_sortKeyByLang.${suggest}.keyword".toString()): [ + 'value': q, + 'boost': 100 + ] + ] + ] ] - ], - 'should': [ - 'prefix': [ - ("_sortKeyByLang.${suggest}.keyword".toString()): [ - 'value': q, - 'boost': 100 - ] - ] - ] + ], + 'sort' : [ + '_score' : 'desc', + ("_sortKeyByLang.${suggest}.keyword".toString()): 'asc' ] - ], - 'sort': [ - '_score': 'desc', - ("_sortKeyByLang.${suggest}.keyword".toString()): 'asc' - ] ] } else { query = [ - 'query': [ - 'bool': [ - 'must': [ - queryClauses - ] + 'query': [ + 'bool': [ + 'must': [ + queryClauses + ] + ] ] - ] ] } @@ -313,7 +333,7 @@ class ESQuery { } if (multiSelectFilters) { - query['post_filter'] = ['bool': ['must' : multiSelectFilters.values()]] + query['post_filter'] = ['bool': ['must': multiSelectFilters.values()]] } if (ENABLE_SPELL_CHECK && spell && q) { @@ -322,6 +342,14 @@ class ESQuery { query['track_total_hits'] = true + if (queryParameters['_debug']?.contains('esScore')) { + if (sortBy) { + // Scores won't be calculated when also using sort unless explicitly asked for + query['track_scores'] = true + } + query['explain'] = true + } + return query } @@ -340,13 +368,13 @@ class ESQuery { if (boostFields == null) { if (boostMode == 'hardcoded') { boostFields = [ - 'prefLabel^100', - 'code^100', - 'name^100', - 'familyName^100', 'givenName^100', - 'lifeSpan^100', 'birthYear^100', 'deathYear^100', - 'hasTitle.mainTitle^100', 'title^100', - 'heldBy.sigel^100', + 'prefLabel^100', + 'code^100', + 'name^100', + 'familyName^100', 'givenName^100', + 'lifeSpan^100', 'birthYear^100', 'deathYear^100', + 'hasTitle.mainTitle^100', 'title^100', + 'heldBy.sigel^100', ] } else { boostFields = computeBoostFields(types) @@ -364,23 +392,21 @@ class ESQuery { */ def l = ((types ?: []) as List).split { jsonld.isSubClassOf(it, 'Concept') } def (conceptTypes, otherTypes) = [l[0], l[1]] - + if (conceptTypes) { if (otherTypes) { def fromLens = lensBoost.computeBoostFieldsFromLenses(otherTypes as String[]) - def conceptFields = CONCEPT_BOOST.collect{ it.split('\\^')[0]} - def otherFieldsBoost = fromLens.findAll{!conceptFields.contains(it.split('\\^')[0]) } + def conceptFields = CONCEPT_BOOST.collect { it.split('\\^')[0] } + def otherFieldsBoost = fromLens.findAll { !conceptFields.contains(it.split('\\^')[0]) } return CONCEPT_BOOST + otherFieldsBoost - } - else { + } else { return CONCEPT_BOOST } - } - else { + } else { return lensBoost.computeBoostFieldsFromLenses(types) } } - + private static final List CONCEPT_BOOST = [ 'prefLabel^1500', 'prefLabelByLang.sv^1500', @@ -404,9 +430,9 @@ class ESQuery { 'scopeNote^10', 'keyword._str.exact^10', ] - + private static final Set subjectRange = ["Person", "Family", "Meeting", "Organization", "Jurisdiction", "Subject", "Work"] as Set - + /** * Expand `@type` query parameter with subclasses. @@ -516,8 +542,8 @@ class ESQuery { // what about the filter condition then? if (field == 'hasTitle.mainTitle' || field == 'hasTitle.mainTitle.keyword') { clause[termPath]['nested'] = [ - 'path': 'hasTitle', - 'filter': ['term': ['hasTitle.@type': 'Title']] + 'path' : 'hasTitle', + 'filter': ['term': ['hasTitle.@type': 'Title']] ] } return clause @@ -561,8 +587,8 @@ class ESQuery { String siteBaseUri = queryParameters.get('_site_base_uri')[0] List prefixFilters = [ - ['prefix': ['@id': siteBaseUri]], - ['prefix': ['sameAs.@id': siteBaseUri]] + ['prefix': ['@id': siteBaseUri]], + ['prefix': ['sameAs.@id': siteBaseUri]] ] // We want either of the prefix filters to match, so we put them @@ -590,12 +616,12 @@ class ESQuery { // If both nested and notNested contains explicit OR they should be moved to notNested // If two different nested contains explicit OR they should be moved to not notNested boolean explicitOrInDifferentNested = nested.values() - .findAll{ it.keySet().any{ k -> k.startsWith(OR_PREFIX)} } + .findAll { it.keySet().any { k -> k.startsWith(OR_PREFIX) } } .size() > 1 if (notNested.keySet().any { it.startsWith(OR_PREFIX) } || explicitOrInDifferentNested) { nested.values().each { n -> - n.keySet().findAll{ it.startsWith(OR_PREFIX) }.each { + n.keySet().findAll { it.startsWith(OR_PREFIX) }.each { notNested.put(it, n.remove(it)) } } @@ -631,8 +657,7 @@ class ESQuery { getOrGroups(notNested).each { Map m -> if (m.size() == 1 && m.keySet().first() in multiSelectable) { multiSelectFilters[m.keySet().first()] = createBoolFilter(addMissingMatch(m, matchMissing)) - } - else { + } else { filters << wrapNestedNotInParent(createBoolFilter(addMissingMatch(m, matchMissing))) } } @@ -653,12 +678,11 @@ class ESQuery { private static getPrefixGroup(String key, Set nestedFields) { if (key.contains('.')) { - (QUERY_PREFIXES.find{ key.startsWith(it) } ?: "").with { String prefix -> - String nested = nestedFields.find{ key.startsWith(prefix + it + '.') } + (QUERY_PREFIXES.find { key.startsWith(it) } ?: "").with { String prefix -> + String nested = nestedFields.find { key.startsWith(prefix + it + '.') } if (nested) { return key.substring(prefix.length(), prefix.length() + nested.length()) - } - else { + } else { return key.substring(0, key.indexOf('.')) } } @@ -668,7 +692,7 @@ class ESQuery { } private Map wrapNestedNotInParent(Map boolFilter) { - var nested = { String f -> nestedNotInParentFields.find{ (f ?: '').startsWith(it + '.') } } + var nested = { String f -> nestedNotInParentFields.find { (f ?: '').startsWith(it + '.') } } var fields = DocumentUtil.getAtPath(boolFilter, ['bool', 'should', '*', 'simple_query_string', 'fields', 0], []) @@ -698,20 +722,17 @@ class ESQuery { parameters.each { String key, value -> if (key == 'p') { value.each { - p.put(it, parameters['_links']) + p.put(it, parameters['_links']) } - } - else if (key.startsWith(OR_PREFIX)) { + } else if (key.startsWith(OR_PREFIX)) { or.put(key.substring(OR_PREFIX.size()), value) - } - else if (key.startsWith(AND_PREFIX)) { + } else if (key.startsWith(AND_PREFIX)) { // For AND on the same field to work, we need a separate // map for each value value.each { and << [(key.substring(AND_PREFIX.size())): [it]] } - } - else { + } else { other.put(key, value) } } @@ -723,7 +744,7 @@ class ESQuery { } } - List result = other.collect {[(it.getKey()): it.getValue()]} + List result = other.collect { [(it.getKey()): it.getValue()] } if (or.size() > 0) { result.add(or) } @@ -742,7 +763,7 @@ class ESQuery { Map nested = groups.findAll { g -> // If included in parent: More than one property or more than one value for some property g.key in nestedNotInParentFields - || (g.key in nestedFields && (g.value.size() > 1 || g.value.values().any{ it.length > 1 })) + || (g.key in nestedFields && (g.value.size() > 1 || g.value.values().any { it.length > 1 })) } return nested } @@ -778,7 +799,7 @@ class ESQuery { int numberOfReferencedDocs = ands.collect { it.value }.collect { it.length }?.max() ?: 1 List result = [] - for (int i = 0 ; i < numberOfReferencedDocs ; i++) { + for (int i = 0; i < numberOfReferencedDocs; i++) { List musts = [] List mustNots = null @@ -799,10 +820,10 @@ class ESQuery { musts.addAll(ands.findResults { it.value.length > i - ? createBoolFilter([ (stripPrefix(it.key, AND_PREFIX)): [it.value[i]] ]) + ? createBoolFilter([(stripPrefix(it.key, AND_PREFIX)): [it.value[i]]]) : null }) - + result << Q.nested(prefix, Q.bool(musts, mustNots)) } @@ -847,7 +868,7 @@ class ESQuery { // The following chars are reserved in ES and need to be escaped to be used as literals: \+-=|&> fieldsAndVals) { List clauses = [] - fieldsAndVals.each {field, values -> + fieldsAndVals.each { field, values -> if (field.startsWith(EXISTS_PREFIX)) { def f = field.substring(EXISTS_PREFIX.length()) for (val in values) { @@ -881,8 +902,7 @@ class ESQuery { ? ['exists': ['field': f]] : ['bool': ['must_not': ['exists': ['field': f]]]]) } - } - else { + } else { for (val in values) { boolean isSimple = isSimple(val) clauses.add([(isSimple ? 'simple_query_string' : 'query_string'): [ @@ -897,7 +917,7 @@ class ESQuery { // FIXME? "should" wrapper is not needed if values/clauses.size == 1 return ['bool': ['should': clauses]] } - + private String expandLangMapKeys(String field) { var parts = field.split('\\.') if (parts && parts[-1] in jsonld.langContainerAlias.keySet()) { @@ -910,11 +930,9 @@ class ESQuery { private static boolean parseBoolean(String parameterName, String value) { if (value.toLowerCase() == 'true') { true - } - else if (value.toLowerCase() == 'false') { + } else if (value.toLowerCase() == 'false') { false - } - else { + } else { throw new InvalidQueryException("$parameterName must be 'true' or 'false', got '$value'") } } @@ -937,7 +955,7 @@ class ESQuery { } @CompileStatic(TypeCheckingMode.SKIP) - private Map buildAggQuery(def tree, Map multiSelectFilters, int size=10) { + private Map buildAggQuery(def tree, Map multiSelectFilters, int size = 10) { Map query = [:] List keys = [] @@ -948,8 +966,8 @@ class ESQuery { } keys.each { key -> - String sort = tree[key]?.sort =='key' ? '_key' : '_count' - def sortOrder = tree[key]?.sortOrder =='asc' ? 'asc' : 'desc' + String sort = tree[key]?.sort == 'key' ? '_key' : '_count' + def sortOrder = tree[key]?.sortOrder == 'asc' ? 'asc' : 'desc' String termPath = getInferredTermPath(key) // Core agg query @@ -960,18 +978,18 @@ class ESQuery { ] // If field is nested, wrap agg query with nested - nestedFields.find{ key.startsWith(it) }?.with { nestedField -> + nestedFields.find { key.startsWith(it) }?.with { nestedField -> query[termPath] = [ - 'nested': [ 'path': nestedField ], - 'aggs' : [ (NESTED_AGG_NAME): query[termPath] ] + 'nested': ['path': nestedField], + 'aggs' : [(NESTED_AGG_NAME): query[termPath]] ] } // Wrap agg query with a filter so that we can get counts for multi select filters def filters = multiSelectFilters.findAll { it.key != key }.values() query[termPath] = [ - 'aggs' : [ (FILTERED_AGG_NAME): query[termPath] ], - 'filter': ['bool': ['must': filters]] + 'aggs' : [(FILTERED_AGG_NAME): query[termPath]], + 'filter': ['bool': ['must': filters]] ] if (tree[key].subItems instanceof Map) { @@ -992,7 +1010,7 @@ class ESQuery { static Map matchMissing(Map queryParameters) { getStatsRepr(queryParameters) .findAll { key, value -> value['_matchMissing'] } - .collectEntries { key, value -> [key, value['_matchMissing'] as String]} + .collectEntries { key, value -> [key, value['_matchMissing'] as String] } } @CompileStatic(TypeCheckingMode.SKIP) @@ -1051,7 +1069,7 @@ class ESQuery { return new Tuple2(handledParameters, filters) } - private static void parseRangeParameter (String parameter, Closure handler) { + private static void parseRangeParameter(String parameter, Closure handler) { for (RangeParameterPrefix p : RangeParameterPrefix.values()) { if (parameter.startsWith(p.prefix())) { handler(parameter.substring(p.prefix().size()), p) @@ -1072,7 +1090,7 @@ class ESQuery { Set fields = [] as Set DocumentUtil.findKey(mappings['properties'], setting) { v, path -> if (v == value) { - fields.add(path.dropRight(1).findAll{ it != 'properties'}.join('.')) + fields.add(path.dropRight(1).findAll { it != 'properties' }.join('.')) } DocumentUtil.NOP } @@ -1103,7 +1121,7 @@ class ESQuery { Set result = [] as Set properties.each { fieldName, fieldSettings -> result += getKeywordFieldsFromProperty(fieldName as String, - fieldSettings as Map, parentName) + fieldSettings as Map, parentName) } return result @@ -1190,30 +1208,30 @@ class ESQuery { static Map getSpellQuery(String q) { return [ - 'text': q, - 'simple_phrase': [ - 'phrase': [ - 'field': SPELL_CHECK_FIELD, - 'size': 1, - 'max_errors': 2, - 'direct_generator': [ - [ - 'field': SPELL_CHECK_FIELD, - 'suggest_mode': 'always', - ], - [ - 'field': SPELL_CHECK_FIELD_REVERSE, - 'suggest_mode': 'always', - "pre_filter" : "reverse", - "post_filter" : "reverse" + 'text' : q, + 'simple_phrase': [ + 'phrase': [ + 'field' : SPELL_CHECK_FIELD, + 'size' : 1, + 'max_errors' : 2, + 'direct_generator': [ + [ + 'field' : SPELL_CHECK_FIELD, + 'suggest_mode': 'always', + ], + [ + 'field' : SPELL_CHECK_FIELD_REVERSE, + 'suggest_mode': 'always', + "pre_filter" : "reverse", + "post_filter" : "reverse" + ] + ], + 'highlight' : [ + 'pre_tag' : '', + 'post_tag': '' + ] ] - ], - 'highlight': [ - 'pre_tag': '', - 'post_tag': '' - ] ] - ] ] } } From 0f977051ef7918c903d41321335b5059d43a2687 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 15:15:30 +0100 Subject: [PATCH 04/12] Enable optional ES score debug info in new API --- .../groovy/whelk/rest/api/SearchUtils2.java | 19 ++++++++++- .../groovy/whelk/search2/QueryParams.java | 7 ++-- .../groovy/whelk/search2/QueryResult.java | 34 ++++++++++++++----- .../src/main/groovy/whelk/search2/Spell.java | 5 ++- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java index 80d73f8ea9..b99219297e 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java @@ -67,8 +67,15 @@ Map doSearch(Map queryParameters) throws Inval Map partialCollectionView = getPartialCollectionView(queryRes, qTree, queryParams, appParams); + Map debugView = new HashMap<>(); if (queryParams.debug.contains(QueryParams.Debug.ES_QUERY)) { - partialCollectionView.put(QueryParams.ApiParams.DEBUG, Map.of(QueryParams.Debug.ES_QUERY, esQueryDsl)); + debugView.put(QueryParams.Debug.ES_QUERY, esQueryDsl); + } + if (queryParams.debug.contains(QueryParams.Debug.ES_SCORE)) { + debugView.put(QueryParams.Debug.ES_SCORE, queryRes.scores); + } + if (!debugView.isEmpty()) { + partialCollectionView.put(QueryParams.ApiParams.DEBUG, debugView); } return partialCollectionView; @@ -76,12 +83,14 @@ Map doSearch(Map queryParameters) throws Inval private Map getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr) { var queryDsl = new LinkedHashMap(); + queryDsl.put("query", queryTree.toEs(queryUtil, disambiguate)); queryDsl.put("size", queryParams.limit); queryDsl.put("from", queryParams.offset); queryDsl.put("sort", (queryParams.sortBy == Sort.DEFAULT_BY_RELEVANCY && queryTree.isWild() ? Sort.BY_DOC_ID : queryParams.sortBy).getSortClauses(queryUtil::getSortField)); + if (queryParams.spell.suggest && queryUtil.esMappings.isSpellCheckAvailable()) { var spellQuery = Spell.getSpellQuery(queryTree); if (spellQuery.isPresent()) { @@ -92,8 +101,16 @@ private Map getEsQueryDsl(QueryTree queryTree, QueryParams query } } } + queryDsl.put("aggs", Aggs.buildAggQuery(statsRepr, disambiguate, queryTree.getOutsetType(), queryUtil::getNestedPath)); queryDsl.put("track_total_hits", true); + + if (queryParams.debug.contains(QueryParams.Debug.ES_SCORE)) { + queryDsl.put("explain", true); + // Scores won't be calculated when also using sort unless explicitly asked for + queryDsl.put("track_scores", true); + } + return queryDsl; } diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java index 90a4980e74..90e7b44597 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java @@ -2,7 +2,6 @@ import whelk.exception.InvalidQueryException; -import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; @@ -11,8 +10,6 @@ import java.util.Optional; import java.util.function.Predicate; -import static whelk.util.Jackson.mapper; - public class QueryParams { private final static int DEFAULT_LIMIT = 200; private final static int MAX_LIMIT = 4000; @@ -35,6 +32,7 @@ public static class ApiParams { public static class Debug { public static final String ES_QUERY = "esQuery"; + public static final String ES_SCORE = "esScore"; } public final int limit; @@ -50,8 +48,7 @@ public static class Debug { public final String q; public final String i; - public QueryParams(Map apiParameters) throws InvalidQueryException, - IOException { + public QueryParams(Map apiParameters) throws InvalidQueryException { this.sortBy = Sort.fromString(getOptionalSingleNonEmpty(ApiParams.SORT, apiParameters).orElse("")); this.object = getOptionalSingleNonEmpty(ApiParams.OBJECT, apiParameters).orElse(null); this.predicates = getMultiple(ApiParams.PREDICATES, apiParameters); diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryResult.java b/whelk-core/src/main/groovy/whelk/search2/QueryResult.java index 53baf81375..7110fa1704 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryResult.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryResult.java @@ -13,36 +13,54 @@ import java.util.function.Function; import java.util.stream.Collectors; +import static whelk.search2.QueryUtil.castToStringObjectMap; +import static whelk.util.DocumentUtil.getAtPath; + public class QueryResult { public final int numHits; private final List esItems; public final List aggs; public final List pAggs; public final List spell; + public final List> scores; public QueryResult(Map esResponse) { var normResponse = normalizeResponse(esResponse); - this.numHits = (int) normResponse.getOrDefault("totalHits", 0); - this.esItems = getEsItems(normResponse); + this.numHits = getNumHits(normResponse); + this.esItems = collectEsItems(normResponse); this.aggs = Aggs.collectAggResult(normResponse); this.pAggs = Aggs.collectPAggResult(normResponse); this.spell = Spell.collectSuggestions(normResponse); + this.scores = collectScores(normResponse); } public List> collectItems(Function, Map> applyLens) { return esItems.stream().map(item -> item.toLd(applyLens)).toList(); } - private static List getEsItems(Map esResponse) { - return getAsList(esResponse, "items") + private static int getNumHits(Map esResponse) { + return (int) getAtPath(esResponse, List.of("hits", "total", "value"), 1); + } + + private static List collectEsItems(Map esResponse) { + return ((List) getAtPath(esResponse, List.of("hits", "hits"), Collections.emptyList())) .stream() - .map(QueryUtil::castToStringObjectMap) + .map(Map.class::cast) + .map(hit -> { + var item = castToStringObjectMap(hit.get("_source")); + item.put("_id", hit.get("_id")); + return item; + }) .map(EsItem::new) .toList(); } - private static List getAsList(Map m, String key) { - return ((List) m.getOrDefault(key, Collections.emptyList())); + private static List> collectScores(Map esResponse) { + return ((List) getAtPath(esResponse, List.of("hits", "hits"), Collections.emptyList())) + .stream() + .map(QueryUtil::castToStringObjectMap) + .filter(m -> m.keySet().retainAll(List.of("_id", "_score", "_explanation"))) + .toList(); } private static Map normalizeResponse(Map esResponse) { @@ -97,7 +115,7 @@ private void normalizeIsniAndOrcid() { } private List> getIdentifiedBy() { - return getAsList(map, "identifiedBy") + return ((List) map.getOrDefault("identifiedBy", Collections.emptyList())) .stream() .map(QueryUtil::castToStringObjectMap) .collect(Collectors.toList()); diff --git a/whelk-core/src/main/groovy/whelk/search2/Spell.java b/whelk-core/src/main/groovy/whelk/search2/Spell.java index d4e68dd56d..692dc31bf7 100644 --- a/whelk-core/src/main/groovy/whelk/search2/Spell.java +++ b/whelk-core/src/main/groovy/whelk/search2/Spell.java @@ -3,6 +3,7 @@ import whelk.JsonLd; import whelk.search.ESQuery; import whelk.search2.querytree.QueryTree; +import whelk.util.DocumentUtil; import java.util.ArrayList; import java.util.Collections; @@ -12,6 +13,8 @@ import java.util.Optional; import java.util.function.Predicate; +import static whelk.util.DocumentUtil.getAtPath; + public class Spell { public record Suggestion(String text, String highlighted) { } @@ -45,7 +48,7 @@ public static List> buildSpellSuggestions(QueryResult queryR } public static List collectSuggestions(Map esResponse) { - return ((List) esResponse.getOrDefault("spell", Collections.emptyList())) + return ((List) getAtPath(esResponse, List.of("suggest", "simple_phrase", 0, "options"), Collections.emptyList())) .stream() .map(Map.class::cast) .map(m -> new Suggestion((String) m.get("text"), (String) m.get("highlighted"))) From 321bc7e217f3afa0b70f9deadf49985cc3390c25 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 16:31:31 +0100 Subject: [PATCH 05/12] Add records over cache records boost --- .../groovy/whelk/rest/api/SearchUtils2.java | 3 +- .../main/groovy/whelk/search2/EsBoost.java | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java index b99219297e..b8e374e364 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.*; +import static whelk.search2.EsBoost.addConstantBoosts; import static whelk.search2.Spell.buildSpellSuggestions; import static whelk.util.Jackson.mapper; @@ -84,7 +85,7 @@ Map doSearch(Map queryParameters) throws Inval private Map getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr) { var queryDsl = new LinkedHashMap(); - queryDsl.put("query", queryTree.toEs(queryUtil, disambiguate)); + queryDsl.put("query", addConstantBoosts(queryTree.toEs(queryUtil, disambiguate))); queryDsl.put("size", queryParams.limit); queryDsl.put("from", queryParams.offset); queryDsl.put("sort", (queryParams.sortBy == Sort.DEFAULT_BY_RELEVANCY && queryTree.isWild() diff --git a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java index b0aac60f21..4d40257795 100644 --- a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java +++ b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java @@ -13,12 +13,17 @@ import java.util.stream.Collectors; import static whelk.JsonLd.ALTERNATE_PROPERTIES; +import static whelk.JsonLd.CACHE_RECORD_TYPE; import static whelk.JsonLd.ID_KEY; import static whelk.JsonLd.RANGE; +import static whelk.JsonLd.RECORD_TYPE; import static whelk.JsonLd.SEARCH_KEY; import static whelk.JsonLd.SUB_PROPERTY_OF; import static whelk.JsonLd.TYPE_KEY; import static whelk.JsonLd.asList; +import static whelk.search2.QueryUtil.boolWrap; +import static whelk.search2.QueryUtil.mustWrap; +import static whelk.search2.QueryUtil.shouldWrap; import static whelk.util.DocumentUtil.getAtPath; public class EsBoost { @@ -49,6 +54,16 @@ public List getBoostFields(Collection types) { return boostFields; } + public static Map addConstantBoosts(Map esQuery) { + List> constantBoosts = List.of(recordsOverCacheRecordsBoost()); + + var mustClause = new ArrayList<>(); + mustClause.add(esQuery); + mustClause.addAll(constantBoosts); + + return mustWrap(mustClause); + } + private List computeBoostFields(List types) { /* FIXME: lensBoost.computeBoostFieldsFromLenses does not give a good result for Concept. @@ -332,6 +347,23 @@ private Map computeCardPropertyBoosts(String prop) { return boostFields; } + private static Map recordsOverCacheRecordsBoost() { + var recordType = JsonLd.RECORD_KEY + '.' + JsonLd.TYPE_KEY; + + var recordBoost = Map.of( + "constant_score", Map.of( + "filter", Map.of("term", Map.of(recordType, RECORD_TYPE)), + "boost", 1000) + ); + var cacheRecordBoost = Map.of( + "constant_score", Map.of( + "filter", Map.of("term", Map.of(recordType, CACHE_RECORD_TYPE)), + "boost", 1) + ); + + return shouldWrap(List.of(recordBoost, cacheRecordBoost)); + } + private static final List CONCEPT_BOOST = List.of( "prefLabel^1500", "prefLabelByLang.sv^1500", From 69a45df81bc2435eb083e24584140fae08943f92 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 17:25:11 +0100 Subject: [PATCH 06/12] Support custom boosting via api param _boost --- .../main/groovy/whelk/rest/api/SearchUtils2.java | 12 ++++++++++-- .../main/groovy/whelk/search2/Disambiguate.java | 2 +- .../src/main/groovy/whelk/search2/EsBoost.java | 1 - .../main/groovy/whelk/search2/QueryParams.java | 3 +++ .../groovy/whelk/search2/querytree/QueryTree.java | 15 +++++++-------- .../whelk/search2/querytree/QueryTreeSpec.groovy | 4 ++-- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java index b8e374e364..177e0c4062 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java @@ -85,7 +85,7 @@ Map doSearch(Map queryParameters) throws Inval private Map getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr) { var queryDsl = new LinkedHashMap(); - queryDsl.put("query", addConstantBoosts(queryTree.toEs(queryUtil, disambiguate))); + queryDsl.put("query", getEsQuery(queryTree, queryParams.boost)); queryDsl.put("size", queryParams.limit); queryDsl.put("from", queryParams.offset); queryDsl.put("sort", (queryParams.sortBy == Sort.DEFAULT_BY_RELEVANCY && queryTree.isWild() @@ -115,7 +115,15 @@ private Map getEsQueryDsl(QueryTree queryTree, QueryParams query return queryDsl; } - public Map getPartialCollectionView(QueryResult queryResult, + private Map getEsQuery(QueryTree queryTree, String boostParam) { + if (boostParam.contains("^")) { + return queryTree.toEs(queryUtil, disambiguate, List.of(boostParam.split(","))); + } + List boostFields = queryUtil.esBoost.getBoostFields(queryTree.collectTypes()); + return addConstantBoosts(queryTree.toEs(queryUtil, disambiguate, boostFields)); + } + + private Map getPartialCollectionView(QueryResult queryResult, QueryTree qt, QueryParams queryParams, AppParams appParams) { diff --git a/whelk-core/src/main/groovy/whelk/search2/Disambiguate.java b/whelk-core/src/main/groovy/whelk/search2/Disambiguate.java index 1328d0f235..4e822db013 100644 --- a/whelk-core/src/main/groovy/whelk/search2/Disambiguate.java +++ b/whelk-core/src/main/groovy/whelk/search2/Disambiguate.java @@ -104,7 +104,7 @@ private String getDomain(String property) { } public OutsetType decideOutset(QueryTree qt) { - Set outset = qt.collectGivenTypes() + Set outset = qt.collectTypes() .stream() .map(this::getOutsetType) .collect(Collectors.toSet()); diff --git a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java index 4d40257795..af67549614 100644 --- a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java +++ b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java @@ -21,7 +21,6 @@ import static whelk.JsonLd.SUB_PROPERTY_OF; import static whelk.JsonLd.TYPE_KEY; import static whelk.JsonLd.asList; -import static whelk.search2.QueryUtil.boolWrap; import static whelk.search2.QueryUtil.mustWrap; import static whelk.search2.QueryUtil.shouldWrap; import static whelk.util.DocumentUtil.getAtPath; diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java index 90e7b44597..e999dc7a05 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java @@ -28,6 +28,7 @@ public static class ApiParams { public static final String EXTRA = "_x"; public static final String DEBUG = "_debug"; public static final String APP_CONFIG = "_appConfig"; + public static final String BOOST = "_boost"; } public static class Debug { @@ -44,6 +45,7 @@ public static class Debug { public final List debug; public final String lens; public final Spell spell; + public final String boost; public final String q; public final String i; @@ -58,6 +60,7 @@ public QueryParams(Map apiParameters) throws InvalidQueryExcep this.offset = getOffset(apiParameters); this.lens = getOptionalSingleNonEmpty(ApiParams.LENS, apiParameters).orElse("cards"); this.spell = new Spell(getOptionalSingleNonEmpty(ApiParams.SPELL, apiParameters).orElse("")); + this.boost = getOptionalSingleNonEmpty(ApiParams.BOOST, apiParameters).orElse(""); this.q = getOptionalSingle(ApiParams.QUERY, apiParameters).orElse(""); this.i = getOptionalSingle(ApiParams.SIMPLE_FREETEXT, apiParameters).orElse(""); } diff --git a/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java b/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java index 6219ddb420..4585ad6e35 100644 --- a/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java +++ b/whelk-core/src/main/groovy/whelk/search2/querytree/QueryTree.java @@ -38,8 +38,7 @@ public QueryTree(Node tree) { removeNeedlessWildcard(); } - public Map toEs(QueryUtil queryUtil, Disambiguate disambiguate) { - List boostFields = queryUtil.esBoost.getBoostFields(getFiltered().collectGivenTypes()); + public Map toEs(QueryUtil queryUtil, Disambiguate disambiguate, List boostFields) { return expand(disambiguate) .insertNested(queryUtil::getNestedPath) .toEs(boostFields); @@ -201,14 +200,14 @@ private static Node removeTopLevelNodesByCondition(Node tree, Predicate p) }; } - public Set collectGivenTypes() { - return collectGivenTypes(tree, new HashSet<>()); + public Set collectTypes() { + return collectTypes(getFiltered().tree, new HashSet<>()); } - private static Set collectGivenTypes(Node sqtNode, Set types) { + private static Set collectTypes(Node sqtNode, Set types) { switch (sqtNode) { - case And and -> and.children().forEach(c -> collectGivenTypes(c, types)); - case Or or -> or.children().forEach(d -> collectGivenTypes(d, types)); + case And and -> and.children().forEach(c -> collectTypes(c, types)); + case Or or -> or.children().forEach(d -> collectTypes(d, types)); case PropertyValue pv -> { if (pv.property().isRdfType() && pv.operator().equals(Operator.EQUALS)) { types.add(pv.value().string()); @@ -368,7 +367,7 @@ private QueryTree removeNegatedSelectableFilters(Collection selectableFi } public void addFilters(QueryParams queryParams, AppParams appParams) { - boolean typeNotGiven = collectGivenTypes().isEmpty(); + boolean typeNotGiven = collectTypes().isEmpty(); var currentActiveBfNodes = getActiveBfNodes(); Function isTypeEquals = pv -> diff --git a/whelk-core/src/test/groovy/whelk/search2/querytree/QueryTreeSpec.groovy b/whelk-core/src/test/groovy/whelk/search2/querytree/QueryTreeSpec.groovy index 6b3536999a..f076119036 100644 --- a/whelk-core/src/test/groovy/whelk/search2/querytree/QueryTreeSpec.groovy +++ b/whelk-core/src/test/groovy/whelk/search2/querytree/QueryTreeSpec.groovy @@ -176,7 +176,7 @@ class QueryTreeSpec extends Specification { ])) expect: - qt.collectGivenTypes() == [v1.string(), v3.string()] as Set + qt.collectTypes() == [v1.string(), v3.string()] as Set } def "collect given types"() { @@ -189,7 +189,7 @@ class QueryTreeSpec extends Specification { ])) expect: - qt.collectGivenTypes() == ['type1', 'type3'] as Set + qt.collectTypes() == ['type1', 'type3'] as Set } def "get top level free text as string"() { From 6962e3b236912c8d0781ed02c7ba1a6efc9e33ce Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 17:49:11 +0100 Subject: [PATCH 07/12] Fix boost param --- rest/src/main/groovy/whelk/rest/api/SearchUtils2.java | 10 +++++----- .../src/main/groovy/whelk/search2/QueryParams.java | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java index 177e0c4062..47f2f549f2 100644 --- a/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java +++ b/rest/src/main/groovy/whelk/rest/api/SearchUtils2.java @@ -85,7 +85,7 @@ Map doSearch(Map queryParameters) throws Inval private Map getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr) { var queryDsl = new LinkedHashMap(); - queryDsl.put("query", getEsQuery(queryTree, queryParams.boost)); + queryDsl.put("query", getEsQuery(queryTree, queryParams.boostFields)); queryDsl.put("size", queryParams.limit); queryDsl.put("from", queryParams.offset); queryDsl.put("sort", (queryParams.sortBy == Sort.DEFAULT_BY_RELEVANCY && queryTree.isWild() @@ -115,11 +115,11 @@ private Map getEsQueryDsl(QueryTree queryTree, QueryParams query return queryDsl; } - private Map getEsQuery(QueryTree queryTree, String boostParam) { - if (boostParam.contains("^")) { - return queryTree.toEs(queryUtil, disambiguate, List.of(boostParam.split(","))); + private Map getEsQuery(QueryTree queryTree, List boostFields) { + if (!boostFields.isEmpty()) { + return queryTree.toEs(queryUtil, disambiguate, boostFields); } - List boostFields = queryUtil.esBoost.getBoostFields(queryTree.collectTypes()); + boostFields = queryUtil.esBoost.getBoostFields(queryTree.collectTypes()); return addConstantBoosts(queryTree.toEs(queryUtil, disambiguate, boostFields)); } diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java index e999dc7a05..e90c7b8303 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryParams.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryParams.java @@ -45,7 +45,7 @@ public static class Debug { public final List debug; public final String lens; public final Spell spell; - public final String boost; + public final List boostFields; public final String q; public final String i; @@ -60,7 +60,7 @@ public QueryParams(Map apiParameters) throws InvalidQueryExcep this.offset = getOffset(apiParameters); this.lens = getOptionalSingleNonEmpty(ApiParams.LENS, apiParameters).orElse("cards"); this.spell = new Spell(getOptionalSingleNonEmpty(ApiParams.SPELL, apiParameters).orElse("")); - this.boost = getOptionalSingleNonEmpty(ApiParams.BOOST, apiParameters).orElse(""); + this.boostFields = getMultiple(ApiParams.BOOST, apiParameters); this.q = getOptionalSingle(ApiParams.QUERY, apiParameters).orElse(""); this.i = getOptionalSingle(ApiParams.SIMPLE_FREETEXT, apiParameters).orElse(""); } @@ -95,6 +95,9 @@ public Map getNonQueryParams(int offset) { if (!debug.isEmpty()) { params.put(ApiParams.DEBUG, String.join(",", debug)); } + if (!boostFields.isEmpty()) { + params.put(ApiParams.BOOST, String.join(",", boostFields)); + } return params; } From bdb3d0847be69881a58361dad22f19a801ee63dd Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 28 Nov 2024 17:51:34 +0100 Subject: [PATCH 08/12] Bugfix: Don't let subpaths block full paths from being added as boost field --- .../src/main/groovy/whelk/search/ESQueryLensBoost.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/whelk-core/src/main/groovy/whelk/search/ESQueryLensBoost.groovy b/whelk-core/src/main/groovy/whelk/search/ESQueryLensBoost.groovy index 6806df2691..5eda13f6a8 100644 --- a/whelk-core/src/main/groovy/whelk/search/ESQueryLensBoost.groovy +++ b/whelk-core/src/main/groovy/whelk/search/ESQueryLensBoost.groovy @@ -122,9 +122,10 @@ class ESQueryLensBoost { def obj = [(JsonLd.TYPE_KEY): rangeKey] def rangeChipLens = jsonld.getLensFor(obj, chipsLenses) def rangeChipFields = collectBoostFields( - rangeChipLens, CARD_BOOST, seenKeys) + rangeChipLens, CARD_BOOST, [] as Set) return rangeChipFields.collect { "${key}.$it" as String } + .findAll {!seenKeys.contains(it) } } else { key = "${key}.${JsonLd.SEARCH_KEY}" } From 197ec83c8a0016cc1799349d83cbd4959ec03dc4 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 29 Nov 2024 15:39:07 +0100 Subject: [PATCH 09/12] Fix: Don't add boosting for non-existing field --- whelk-core/src/main/groovy/whelk/search2/EsBoost.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java index af67549614..55ada5d624 100644 --- a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java +++ b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java @@ -287,11 +287,11 @@ private Map collectBoostFields(Map lens, int boost) { String termType = (String) term.get(TYPE_KEY); if ("ObjectProperty".equals(termType)) { key = key + "." + SEARCH_KEY; - } else if (jsonLd.isLangContainer(jsonLd.context.get(key))) { - key = key + "." + jsonLd.locales.getFirst(); } + boostFields.put(key, boost); + } else if (jsonLd.isLangContainer(jsonLd.context.get(key))) { + boostFields.put(key + "." + jsonLd.locales.getFirst(), boost); } - boostFields.put(key, boost); } return boostFields; From 01b834077404d77b57d4b1c7f795c03b8f92a18a Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 29 Nov 2024 16:32:53 +0100 Subject: [PATCH 10/12] Fix more lost in transpilation errors --- .../main/groovy/whelk/search2/EsBoost.java | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java index 55ada5d624..f5baf2e316 100644 --- a/whelk-core/src/main/groovy/whelk/search2/EsBoost.java +++ b/whelk-core/src/main/groovy/whelk/search2/EsBoost.java @@ -86,8 +86,9 @@ private List computeBoostFields(List types) { boostFields = CONCEPT_BOOST.stream() .map(s -> s.split("\\^")) .collect(Collectors.toMap(parts -> parts[0], parts -> Integer.parseInt(parts[1]))); - - computeBoostFieldsFromLenses(otherTypes).forEach(boostFields::putIfAbsent); + if (!otherTypes.isEmpty()) { + computeBoostFieldsFromLenses(otherTypes).forEach(boostFields::putIfAbsent); + } } return boostFields.entrySet() @@ -326,18 +327,21 @@ private Map computeCardPropertyBoosts(String prop) { Map dfn = jsonLd.vocabIndex.get(prop); // Follow the object property range to append chip properties to the boosted path. - if (dfn != null && "ObjectProperty".equals(dfn.get(TYPE_KEY))) { - Optional rangeKey = Optional.ofNullable(dfn.get(RANGE)) - .map(r -> r instanceof List ? ((List) r).getFirst() : r) - .map(Map.class::cast) - .map(r -> (String) r.get(ID_KEY)) - .map(jsonLd::toTermKey); - - if (rangeKey.isPresent() && jsonLd.isSubClassOf(rangeKey.get(), "QualifiedRole")) { - var rangeChipLens = chipLenses.getLensForType(rangeKey.get()); - collectBoostFields(rangeChipLens, CARD_BOOST).forEach((k, v) -> boostFields.put(prop + "." + k, v)); - } else { + if (dfn != null) { + if ("ObjectProperty".equals(dfn.get(TYPE_KEY))) { + Optional rangeKey = Optional.ofNullable(dfn.get(RANGE)) + .map(r -> r instanceof List ? ((List) r).getFirst() : r) + .map(Map.class::cast) + .map(r -> (String) r.get(ID_KEY)) + .map(jsonLd::toTermKey); + if (rangeKey.isPresent() && jsonLd.isSubClassOf(rangeKey.get(), "QualifiedRole")) { + var rangeChipLens = chipLenses.getLensForType(rangeKey.get()); + collectBoostFields(rangeChipLens, CARD_BOOST).forEach((k, v) -> boostFields.put(prop + "." + k, v)); + return boostFields; + } boostFields.put(prop + "." + SEARCH_KEY, CARD_BOOST); + } else { + boostFields.put(prop, CARD_BOOST); } } else if (jsonLd.isLangContainer(jsonLd.context.get(prop))) { boostFields.put(prop + "." + jsonLd.locales.getFirst(), CARD_BOOST); From 2453dfaf432595c92407b8a3f3a689280ee6feab Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 2 Dec 2024 09:01:36 +0100 Subject: [PATCH 11/12] Add unit test --- .../groovy/whelk/search2/EsBoostSpec.groovy | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 whelk-core/src/test/groovy/whelk/search2/EsBoostSpec.groovy diff --git a/whelk-core/src/test/groovy/whelk/search2/EsBoostSpec.groovy b/whelk-core/src/test/groovy/whelk/search2/EsBoostSpec.groovy new file mode 100644 index 0000000000..0c03db9dae --- /dev/null +++ b/whelk-core/src/test/groovy/whelk/search2/EsBoostSpec.groovy @@ -0,0 +1,88 @@ +package whelk.search2 + +import spock.lang.Specification +import whelk.JsonLd + +class EsBoostSpec extends Specification { + def "should compute boost fields from lenses"() { + given: + def ns = 'http://example.org/ns/' + + def context = [ + '@context': [ + '@vocab': ns + ] + ] + + def display = [ + "lensGroups": [ + "chips": [ + "lenses": [ + "Instance" : [ + "classLensDomain": "Instance", + "showProperties" : ["hasTitle", "comment"] + ], + "Publication": [ + "classLensDomain": "Publication", + "showProperties" : ["agent"] + ] + ] + ], + "cards": [ + "lenses": [ + "Instance": [ + "classLensDomain": "Instance", + "showProperties" : [ + [ + "alternateProperties": [ + [ + "subPropertyOf": "hasTitle" + ], + [ + "subPropertyOf": "value" + ], + [ + "noise": "should be ignored" + ], + "hasTitle", + "value" + ] + ], + "publication" + ] + ] + ] + ] + ] + ] + + def vocab = [ + "@graph": [ + ["@id": ns + "QualifiedRole", "@type": "Class"], + ["@id" : ns + "Publication", "@type": "Class", + "subClassOf": ["@id": ns + "QualifiedRole"]], + ["@id" : ns + "Title", "@type": "Class", + "subClassOf": ["@id": ns + "StructuredValue"]], + ["@id" : ns + "hasTitle", "@type": "ObjectProperty", + "range": [["@id": ns + "Title"]]], + ["@id" : ns + "publication", "@type": "ObjectProperty", + "range": [["@id": ns + "Publication"]]], + ["@id" : ns + "agent", "@type": "ObjectProperty", + "range": [["@id": ns + "Agent"]]], + ["@id": ns + "value", "@type": "DatatypeProperty"], + ["@id": ns + "comment", "@type": "DatatypeProperty"] + ] + ] + + def jsonld = new JsonLd(context, display, vocab) + def lensBoost = new EsBoost(jsonld) + + when: + def boostFields = lensBoost.getBoostFields(["Instance"]) + + then: + boostFields == [ + 'comment^200', 'hasTitle._str^200', '_str^100', 'publication.agent._str^10', 'value^10' + ] + } +} From eec3cb9dc869f99059b766b9f2b8000b9026ae4f Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 11 Dec 2024 13:55:27 +0100 Subject: [PATCH 12/12] Don't try to collect non-existing score --- whelk-core/src/main/groovy/whelk/search2/QueryResult.java | 1 + 1 file changed, 1 insertion(+) diff --git a/whelk-core/src/main/groovy/whelk/search2/QueryResult.java b/whelk-core/src/main/groovy/whelk/search2/QueryResult.java index 7110fa1704..89a5c1b167 100644 --- a/whelk-core/src/main/groovy/whelk/search2/QueryResult.java +++ b/whelk-core/src/main/groovy/whelk/search2/QueryResult.java @@ -58,6 +58,7 @@ private static List collectEsItems(Map esResponse) { private static List> collectScores(Map esResponse) { return ((List) getAtPath(esResponse, List.of("hits", "hits"), Collections.emptyList())) .stream() + .filter(m -> ((Map) m).get("_score") != null) .map(QueryUtil::castToStringObjectMap) .filter(m -> m.keySet().retainAll(List.of("_id", "_score", "_explanation"))) .toList();