diff --git a/.travis.yml b/.travis.yml index da8a207dc..91a065566 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,10 +33,12 @@ install: - pip install -r requirements.txt - python -m textblob.download_corpora - cp config.json.sample config.json - - export SIMPLIFIED_CONFIGURATION_FILE="$TRAVIS_BUILD_DIR/config.json" + - export SIMPLIFIED_CONFIGURATION_FILE="${TRAVIS_BUILD_DIR}/config.json" - wget ${ES_DOWNLOAD} - - tar -xzf elasticsearch-${ES_VERSION}.tar.gz - - ./elasticsearch-${ES_VERSION}/bin/elasticsearch & + - export ES_PATH=elasticsearch-${ES_VERSION} + - tar -xzf ${ES_PATH}.tar.gz + - cp elasticsearch.yml.testing ./${ES_PATH}/config/elasticsearch.yml + - ./${ES_PATH}/bin/elasticsearch & before_script: - psql -c 'create user simplified_test;' -U postgres diff --git a/elasticsearch.yml.testing b/elasticsearch.yml.testing new file mode 100644 index 000000000..0b1514d2a --- /dev/null +++ b/elasticsearch.yml.testing @@ -0,0 +1,5 @@ +# Scores for search results are calculated on a per-shard basis. +# Keeping everything in one shard while running unit testsensures that +# we have complete control over how the scores are calculated. +index.number_of_shards: 1 +index.number_of_replicas: 0 diff --git a/external_search.py b/external_search.py index 67b8817f3..7b8fcbeed 100644 --- a/external_search.py +++ b/external_search.py @@ -30,20 +30,22 @@ class ExternalSearchIndex(object): NAME = ExternalIntegration.ELASTICSEARCH - WORKS_INDEX_KEY = u'works_index' - WORKS_ALIAS_KEY = u'works_alias' + WORKS_INDEX_PREFIX_KEY = u'works_index_prefix' - DEFAULT_WORKS_INDEX = u'works' + DEFAULT_WORKS_INDEX_PREFIX = u'circulation-works' work_document_type = 'work-type' __client = None - CURRENT_ALIAS_SUFFIX = '-current' + CURRENT_ALIAS_SUFFIX = 'current' VERSION_RE = re.compile('-v([0-9]+)$') SETTINGS = [ { "key": ExternalIntegration.URL, "label": _("URL") }, - { "key": WORKS_INDEX_KEY, "label": _("Works index"), "default": DEFAULT_WORKS_INDEX }, + { "key": WORKS_INDEX_PREFIX_KEY, "label": _("Index prefix"), + "default": DEFAULT_WORKS_INDEX_PREFIX, + "description": _("Any Elasticsearch indexes needed for this application will be created with this unique prefix. In most cases, the default will work fine. You may need to change this if you have multiple application servers using a single Elasticsearch server.") + }, ] SITEWIDE = True @@ -66,13 +68,35 @@ def search_integration(cls, _db): ) @classmethod - def works_index_name(cls, _db): - """Look up the name of the search index.""" + def works_prefixed(cls, _db, value): + """Prefix the given value with the prefix to use when generating index + and alias names. + + :return: A string "{prefix}-{value}", or None if no prefix is + configured. + """ integration = cls.search_integration(_db) if not integration: return None - setting = integration.setting(cls.WORKS_INDEX_KEY) - return setting.value_or_default(cls.DEFAULT_WORKS_INDEX) + setting = integration.setting(cls.WORKS_INDEX_PREFIX_KEY) + prefix = setting.value_or_default(cls.DEFAULT_WORKS_INDEX_PREFIX) + return prefix + '-' + value + + @classmethod + def works_index_name(cls, _db): + """Look up the name of the search index. + + It's possible, but unlikely, that the search index alias will + point to some other index. But if there were no indexes, and a + new one needed to be created, this would be the name of that + index. + """ + return cls.works_prefixed(_db, ExternalSearchIndexVersions.latest()) + + @classmethod + def works_alias_name(cls, _db): + """Look up the name of the search index alias.""" + return cls.works_prefixed(_db, cls.CURRENT_ALIAS_SUFFIX) def __init__(self, _db, url=None, works_index=None): @@ -118,87 +142,36 @@ def __init__(self, _db, url=None, works_index=None): # Document upload runs against the works_index. # Search queries run against works_alias. if works_index and integration: - self.set_works_index_and_alias(works_index) - self.update_integration_settings(integration) + self.set_works_index_and_alias(_db) def bulk(docs, **kwargs): return elasticsearch_bulk(self.__client, docs, **kwargs) self.bulk = bulk - - def update_integration_settings(self, integration, force=False): - """Updates the integration with an appropriate index and alias - setting if the index and alias have been updated. - """ - if not integration or not (self.works_index and self.works_alias): - return - - if self.works_index==self.works_alias: - # An index is being used as the alias. There is no alias - # to update with. - return - - if integration.setting(self.WORKS_ALIAS_KEY).value and not force: - # This integration already has an alias and we don't want to - # force an update. - return - - index_or_alias = [self.works_index, self.works_alias] - if (integration.setting(self.WORKS_INDEX_KEY).value not in index_or_alias - and not force - ): - # This ExternalSearchIndex was created for a different index and - # alias, and we don't want to force an update. - return - integration.setting(self.WORKS_INDEX_KEY).value = unicode(self.works_index) - integration.setting(self.WORKS_ALIAS_KEY).value = unicode(self.works_alias) - - def set_works_index_and_alias(self, current_alias): + def set_works_index_and_alias(self, _db): """Finds or creates the works_index and works_alias based on - provided configuration. + the current configuration. """ - if current_alias: - index_details = self.indices.get_alias(name=current_alias, ignore=[404]) - found = bool(index_details) and not (index_details.get('status')==404 or 'error' in index_details) - else: - found = False - - def _set_works_index(name): - self.works_index = self.__client.works_index = name - - if found: - # We found an index for the alias in configuration. Assume - # there is only one. - _set_works_index(index_details.keys()[0]) - else: - if current_alias.endswith(self.CURRENT_ALIAS_SUFFIX): - # The alias culled from configuration is intended to be - # a current alias, but an index with that alias wasn't - # found. Find or create an appropriate index. - base_index_name = self.base_index_name(current_alias) - new_index = base_index_name+'-'+ExternalSearchIndexVersions.latest() - _set_works_index(new_index) - else: - # Without the CURRENT_ALIAS_SUFFIX, assume the index string - # from config is the index itself and needs to be swapped. - _set_works_index(current_alias) - + # The index name to use is the one known to be right for this + # version. + self.works_index = self.__client.works_index = self.works_index_name(_db) if not self.indices.exists(self.works_index): + # That index doesn't actually exist. Set it up. self.setup_index() - self.setup_current_alias() - def setup_current_alias(self): - """Finds or creates a works_alias based on the base works_index - name and ending in the expected CURRENT_ALIAS_SUFFIX. + # Make sure the alias points to the most recent index. + self.setup_current_alias(_db) + + def setup_current_alias(self, _db): + """Finds or creates the works_alias as named by the current site + settings. If the resulting alias exists and is affixed to a different index or if it can't be generated for any reason, the alias will not be created or moved. Instead, the search client will use the the works_index directly for search queries. """ - - base_works_index = self.base_index_name(self.works_index) - alias_name = base_works_index+self.CURRENT_ALIAS_SUFFIX + alias_name = self.works_alias_name(_db) exists = self.indices.exists_alias(name=alias_name) def _set_works_alias(name): @@ -244,7 +217,7 @@ def setup_index(self, new_index=None): body = ExternalSearchIndexVersions.latest_body() self.indices.create(index=index, body=body) - def transfer_current_alias(self, new_index): + def transfer_current_alias(self, _db, new_index): """Force -current alias onto a new index""" if not self.indices.exists(index=new_index): raise ValueError( @@ -260,11 +233,11 @@ def transfer_current_alias(self, new_index): "is the same.") % (new_index, self.works_index)) self.works_index = self.__client.works_index = new_index - alias_name = self.base_index_name(new_index)+self.CURRENT_ALIAS_SUFFIX + alias_name = self.works_alias_name(_db) exists = self.indices.exists_alias(name=alias_name) if not exists: - self.setup_current_alias() + self.setup_current_alias(_db) return exists_on_works_index = self.indices.get_alias( @@ -324,9 +297,10 @@ def query_works(self, library, query_string, media, languages, fiction, audience ) if fields is not None: search_args['fields'] = fields - #print "Args looks like: %r" % args + # search_args['explain'] = True + # print "Args looks like: %r" % search_args results = self.search(**search_args) - #print "Results: %r" % results + # print "Results: %r" % results return results def make_query(self, query_string): @@ -339,7 +313,7 @@ def make_query_string_query(query_string, fields): } } - def make_phrase_query(query_string, fields): + def make_phrase_query(query_string, fields, boost=100): field_queries = [] for field in fields: field_query = { @@ -352,7 +326,7 @@ def make_phrase_query(query_string, fields): 'bool': { 'should': field_queries, 'minimum_should_match': 1, - 'boost': 100, + 'boost': boost, } } @@ -453,6 +427,13 @@ def make_target_age_query(target_age): match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal']) must_match_options.append(match_phrase) + # An exact title or author match outweighs a match that is split + # across fields. + match_title = make_phrase_query(query_string, ['title.standard'], 200) + must_match_options.append(match_title) + match_author = make_phrase_query(query_string, ['author.standard'], 200) + must_match_options.append(match_author) + if not fuzzy_blacklist_re.search(query_string): fuzzy_query = make_fuzzy_query(query_string, fuzzy_fields) must_match_options.append(fuzzy_query) @@ -730,7 +711,7 @@ def remove_work(self, work): class ExternalSearchIndexVersions(object): - VERSIONS = ['v2'] + VERSIONS = ['v2', 'v3'] @classmethod def latest(cls): @@ -751,6 +732,64 @@ def map_fields(cls, fields, field_description): mapping["properties"][field] = field_description return mapping + @classmethod + def v3_body(cls): + """The v3 body is the same as the v2 except for the inclusion of the + '.standard' version of fields, analyzed using the standard + analyzer for near-exact matches. + """ + settings = { + "analysis": { + "filter": { + "en_stop_filter": { + "type": "stop", + "stopwords": ["_english_"] + }, + "en_stem_filter": { + "type": "stemmer", + "name": "english" + }, + "en_stem_minimal_filter": { + "type": "stemmer", + "name": "english" + }, + }, + "analyzer" : { + "en_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_filter"] + }, + "en_minimal_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_minimal_filter"] + }, + } + } + } + + mapping = cls.map_fields( + fields=["title", "series", "subtitle", "summary", "classifications.term"], + field_description={ + "type": "string", + "analyzer": "en_analyzer", + "fields": { + "minimal": { + "type": "string", + "analyzer": "en_minimal_analyzer"}, + "standard": { + "type": "string", + "analyzer": "standard" + } + }} + ) + mappings = { ExternalSearchIndex.work_document_type : mapping } + + return dict(settings=settings, mappings=mappings) + @classmethod def v2_body(cls): @@ -873,7 +912,7 @@ class SearchIndexMonitor(WorkSweepMonitor): def __init__(self, _db, collection, index_name=None, index_client=None, **kwargs): super(SearchIndexMonitor, self).__init__(_db, collection, **kwargs) - + if index_client: # This would only happen during a test. self.search_index_client = index_client diff --git a/migration/20171201-rebuild-search-index.py b/migration/20171201-rebuild-search-index.py new file mode 100755 index 000000000..3646796b8 --- /dev/null +++ b/migration/20171201-rebuild-search-index.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +"""Running the search index updater script will create the new +circulation-works-v3 index and change the circulation-works-current +alias to point to it. +""" +import os +import sys +bin_dir = os.path.split(__file__)[0] +package_dir = os.path.join(bin_dir, "..") +sys.path.append(os.path.abspath(package_dir)) +from scripts import UpdateSearchIndexScript +UpdateSearchIndexScript().run() diff --git a/tests/test_external_search.py b/tests/test_external_search.py index e24d5debf..a7f835043 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -44,7 +44,7 @@ def setup(self): ExternalIntegration.ELASTICSEARCH, goal=ExternalIntegration.SEARCH_GOAL, url=u'http://localhost:9200', - settings={ExternalSearchIndex.WORKS_INDEX_KEY : u'test_index-v0'} + settings={ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY : u'test_index'} ) try: @@ -63,13 +63,28 @@ def teardown(self): ExternalSearchIndex.reset() super(ExternalSearchTest, self).teardown() + def default_work(self, *args, **kwargs): + """Convenience method to create a work with a license pool + in the default collection. + """ + work = self._work( + *args, with_license_pool=True, + collection=self._default_collection, **kwargs + ) + work.set_presentation_ready() + return work + class TestExternalSearch(ExternalSearchTest): def test_works_index_name(self): + """The name of the search index is the prefix (defined in + ExternalSearchTest.setup) plus a version number associated + with this version of the core code. + """ if not self.search: return - eq_("test_index-v0", self.search.works_index_name(self._db)) + eq_("test_index-v3", self.search.works_index_name(self._db)) def test_setup_index_creates_new_index(self): if not self.search: @@ -86,7 +101,7 @@ def test_setup_index_creates_new_index(self): eq_(current_index, self.search.works_index) # The alias hasn't been passed over to the new index. - alias = 'test_index' + self.search.CURRENT_ALIAS_SUFFIX + alias = 'test_index-' + self.search.CURRENT_ALIAS_SUFFIX eq_(alias, self.search.works_alias) eq_(True, self.search.indices.exists_alias(current_index, alias)) eq_(False, self.search.indices.exists_alias('the_other_index', alias)) @@ -95,25 +110,34 @@ def test_set_works_index_and_alias(self): if not self.search: return - # If -current alias is given but doesn't exist, the appropriate - # index and alias will be created. - self.search.set_works_index_and_alias('banana-current') + # If the index or alias don't exist, set_works_index_and_alias + # will create them. + self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY, u'banana') + self.search.set_works_index_and_alias(self._db) expected_index = 'banana-' + ExternalSearchIndexVersions.latest() + expected_alias = 'banana-' + self.search.CURRENT_ALIAS_SUFFIX + eq_(expected_index, self.search.works_index) + eq_(expected_alias, self.search.works_alias) + + # If the index and alias already exist, set_works_index_and_alias + # does nothing. + self.search.set_works_index_and_alias(self._db) eq_(expected_index, self.search.works_index) - eq_('banana-current', self.search.works_alias) + eq_(expected_alias, self.search.works_alias) def test_setup_current_alias(self): if not self.search: return # The index was generated from the string in configuration. - index_name = 'test_index-v0' + version = ExternalSearchIndexVersions.VERSIONS[-1] + index_name = 'test_index-' + version eq_(index_name, self.search.works_index) eq_(True, self.search.indices.exists(index_name)) # The alias is also created from the configuration. - alias = 'test_index' + self.search.CURRENT_ALIAS_SUFFIX + alias = 'test_index-' + self.search.CURRENT_ALIAS_SUFFIX eq_(alias, self.search.works_alias) eq_(True, self.search.indices.exists_alias(index_name, alias)) @@ -121,19 +145,22 @@ def test_setup_current_alias(self): # won't be reassigned. Instead, search will occur against the # index itself. ExternalSearchIndex.reset() - self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_KEY, u'test_index-v100') + self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY, u'my-app') self.search = ExternalSearchIndex(self._db) - eq_('test_index-v100', self.search.works_index) - eq_('test_index-v100', self.search.works_alias) + eq_('my-app-%s' % version, self.search.works_index) + eq_('my-app-' + self.search.CURRENT_ALIAS_SUFFIX, self.search.works_alias) def test_transfer_current_alias(self): if not self.search: return - # If the index doesn't exist, an error is raised. + # An error is raised if you try to set the alias to point to + # an index that doesn't already exist. assert_raises( - ValueError, self.search.transfer_current_alias, 'test_index-v3') + ValueError, self.search.transfer_current_alias, self._db, + 'no-such-index' + ) original_index = self.search.works_index @@ -142,34 +169,37 @@ def test_transfer_current_alias(self): self.search.indices.delete_alias( index=original_index, name='test_index-current' ) - self.search.setup_index(new_index='test_index-v100') - self.search.transfer_current_alias('test_index-v100') - eq_('test_index-v100', self.search.works_index) + self.search.setup_index(new_index='test_index-v9999') + self.search.transfer_current_alias(self._db, 'test_index-v9999') + eq_('test_index-v9999', self.search.works_index) eq_('test_index-current', self.search.works_alias) # If the -current alias already exists on the index, # it's used without a problem. - self.search.transfer_current_alias('test_index-v100') - eq_('test_index-v100', self.search.works_index) + self.search.transfer_current_alias(self._db, 'test_index-v9999') + eq_('test_index-v9999', self.search.works_index) eq_('test_index-current', self.search.works_alias) # If the -current alias is being used on a different version of the # index, it's deleted from that index and placed on the new one. self.search.setup_index(original_index) - self.search.transfer_current_alias(original_index) + self.search.transfer_current_alias(self._db, original_index) eq_(original_index, self.search.works_index) eq_('test_index-current', self.search.works_alias) # It has been removed from other index. eq_(False, self.search.indices.exists_alias( - index='test_index-v100', name='test_index-current')) + index='test_index-v9999', name='test_index-current')) + # And only exists on the new index. alias_indices = self.search.indices.get_alias(name='test_index-current').keys() - eq_(['test_index-v0'], alias_indices) + eq_([original_index], alias_indices) # If the index doesn't have the same base name, an error is raised. assert_raises( - ValueError, self.search.transfer_current_alias, 'banana-v10') + ValueError, self.search.transfer_current_alias, self._db, + 'banana-v10' + ) class TestExternalSearchWithWorks(ExternalSearchTest): """These tests run against a real search index with works in it. @@ -180,15 +210,7 @@ class TestExternalSearchWithWorks(ExternalSearchTest): def setup(self): super(TestExternalSearchWithWorks, self).setup() - - def _work(*args, **kwargs): - """Convenience method to create a work with a license pool - in the default collection. - """ - return self._work( - *args, with_license_pool=True, - collection=self._default_collection, **kwargs - ) + _work = self.default_work if self.search: @@ -230,6 +252,10 @@ def _work(*args, **kwargs): self.les_mis.presentation_edition.title = u"Les Mis\u00E9rables" self.les_mis.set_presentation_ready() + self.modern_romance = _work() + self.modern_romance.presentation_edition.title = u"Modern Romance" + self.modern_romance.set_presentation_ready() + self.lincoln = _work(genre="Biography & Memoir", title="Abraham Lincoln") self.lincoln.set_presentation_ready() @@ -248,7 +274,13 @@ def _work(*args, **kwargs): self.adult_work = _work(title="Still Alice", audience=Classifier.AUDIENCE_ADULT) self.adult_work.set_presentation_ready() - self.ya_romance = _work(audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance") + self.ya_romance = _work( + title="Gumby In Love", + audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" + ) + self.ya_romance.presentation_edition.subtitle = ( + "Modern Fairytale Series, Volume 7" + ) self.ya_romance.set_presentation_ready() self.no_age = _work() @@ -467,11 +499,28 @@ def query(*args, **kwargs): # Matches genre - results = query("romance", None, None, None, None, None, None, None) - hits = results["hits"]["hits"] - eq_(1, len(hits)) - eq_(unicode(self.ya_romance.id), hits[0]['_id']) + def expect_ids(works, *query_args): + original_query_args = list(query_args) + query_args = list(original_query_args) + while len(query_args) < 8: + query_args.append(None) + results = query(*query_args) + hits = results["hits"]["hits"] + expect = [unicode(x.id) for x in works] + actual = [x['_id'] for x in hits] + expect_titles = ", ".join([x.title for x in works]) + actual_titles = ", ".join([x['_source']['title'] for x in hits]) + eq_( + expect, actual, + "Query args %r did not find %d works (%s), instead found %d (%s)" % ( + original_query_args, len(expect), expect_titles, + len(actual), actual_titles + ) + ) + # Search by genre. The name of the genre also shows up in the + # title of a book, but the genre comes up first. + expect_ids([self.ya_romance, self.modern_romance], "romance") # Matches audience @@ -528,9 +577,14 @@ def query(*args, **kwargs): results = query("young adult romance", None, None, None, None, None, None, None) hits = results["hits"]["hits"] - eq_(1, len(hits)) - eq_(unicode(self.ya_romance.id), hits[0]['_id']) + # The book with 'Romance' in the title also shows up, but it + # shows up after the book whose audience matches 'young adult' + # and whose genre matches 'romance'. + eq_( + map(unicode, [self.ya_romance.id, self.modern_romance.id]), + [x['_id'] for x in hits] + ) # Matches age + fiction @@ -748,6 +802,10 @@ def query(*args, **kwargs): hits = results["hits"]["hits"] eq_(2, len(hits)) + # + # Test searching across collections. + # + # If we add the missing collection to the default library, "A # Tiny Book" starts showing up in searches against that # library. @@ -778,6 +836,141 @@ def query(*args, **kwargs): ] eq_(set(collections), set(expect_collections)) +class TestExactMatches(ExternalSearchTest): + """Verify that exact or near-exact title and author matches are + privileged over matches that span fields. + """ + + def setup(self): + super(TestExactMatches, self).setup() + _work = self.default_work + + # Here the title is 'Modern Romance' + self.modern_romance = _work( + title="Modern Romance", + authors=["Aziz Ansari", "Eric Klinenberg"], + ) + + # Here 'Modern' is in the subtitle and 'Romance' is the genre. + self.ya_romance = _work( + title="Gumby In Love", + authors="Pokey", + audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" + ) + self.ya_romance.presentation_edition.subtitle = ( + "Modern Fairytale Series, Book 3" + ) + + self.parent_book = _work( + title="Our Son Aziz", + authors=["Fatima Ansari", "Shoukath Ansari"], + genre="Biography & Memoir", + ) + + self.behind_the_scenes = _work( + title="The Making of Biography With Peter Graves", + genre="Entertainment", + ) + + self.biography_of_peter_graves = _work( + "He Is Peter Graves", + authors="Kelly Ghostwriter", + genre="Biography & Memoir", + ) + + self.book_by_peter_graves = _work( + title="My Experience At The University of Minnesota", + authors="Peter Graves", + genre="Entertainment", + ) + + self.book_by_someone_else = _work( + title="The Deadly Graves", + authors="Peter Ansari", + genre="Mystery" + ) + + # Add all the works created in the setup to the search index. + SearchIndexCoverageProvider( + self._db, search_index_client=self.search + ).run_once_and_update_timestamp() + + # Sleep to give the index time to catch up. + time.sleep(2) + + def test_exact_matches(self): + + # Convenience method to query the default library. + def query(*args, **kwargs): + return self.search.query_works( + self._default_library, *args, **kwargs + ) + + def expect_ids(works, *query_args): + original_query_args = list(query_args) + query_args = list(original_query_args) + while len(query_args) < 8: + query_args.append(None) + results = query(*query_args) + hits = results["hits"]["hits"] + expect = [unicode(x.id) for x in works] + actual = [x['_id'] for x in hits] + expect_titles = ", ".join([x.title for x in works]) + actual_titles = ", ".join([x['_source']['title'] for x in hits]) + eq_( + expect, actual, + "Query args %r did not find %d works (%s), instead found %d (%s)" % ( + original_query_args, len(expect), expect_titles, + len(actual), actual_titles + ) + ) + + # A full title match takes precedence over a match that's + # split across genre and subtitle. + expect_ids( + [ + self.modern_romance, # "modern romance" in title + self.ya_romance # "modern" in subtitle, genre "romance" + ], + "modern romance" + ) + + # A full author match takes precedence over a partial author + # match. A partial author match that matches the entire search + # string takes precedence over a partial author match that doesn't. + expect_ids( + [ + self.modern_romance, # "Aziz Ansari" in author + self.parent_book, # "Aziz" in title, "Ansari" in author + self.book_by_someone_else # "Ansari" in author + ], + "aziz ansari" + ) + + # When a string exactly matches both a title and an author, + # the books that match exactly are promoted. + expect_ids( + [self.biography_of_peter_graves, self.behind_the_scenes, + self.book_by_peter_graves, self.book_by_someone_else], + "peter graves" + ) + + # 'The Making of Biography With Peter Graves' does worse in a + # search for 'peter graves biography' than a biography whose + # title includes the phrase 'peter graves'. Although the title + # contains all three search terms, it's not an exact token + # match. But "The Making of..." still does better than + # books that match 'peter graves' (or 'peter' and 'graves'), + # but not 'biography'. + expect_ids( + [self.biography_of_peter_graves, # title + genre 'biography' + self.behind_the_scenes, # all words match in title + self.book_by_peter_graves, # author (no 'biography') + self.book_by_someone_else # title + author (no 'biography') + ], + "peter graves biography" + ) + class TestSearchQuery(DatabaseTest): def test_make_query(self): @@ -787,39 +980,85 @@ def test_make_query(self): # Basic query query = search.make_query("test") + # The query runs a number of matching techniques on a search + # result and picks the best one. must = query['dis_max']['queries'] - eq_(3, len(must)) - stemmed_query = must[0]['simple_query_string'] + # Here are the matching techniques. + stemmed, minimal, standard_title, standard_author, fuzzy = must + + # The search string is stemmed and matched against a number + # of fields such as title and publisher. + stemmed_query = stemmed['simple_query_string'] eq_("test", stemmed_query['query']) assert "title^4" in stemmed_query['fields'] assert 'publisher' in stemmed_query['fields'] - phrase_queries = must[1]['bool']['should'] - eq_(3, len(phrase_queries)) - title_phrase_query = phrase_queries[0]['match_phrase'] - assert 'title.minimal' in title_phrase_query - eq_("test", title_phrase_query['title.minimal']) + def assert_field_names(query, *expect): + """Validate that a query is set up to match the string 'test' + against one or more of a number of fields. + """ + actual_keys = set() + + # For this part of the query to be considered successful, + # it's sufficient for a single field to match. + eq_(1, query['bool']['minimum_should_match']) + + for possibility in query['bool']['should']: + [(key, value)] = possibility['match_phrase'].items() + actual_keys.add(key) + eq_(value, 'test') + eq_(set(actual_keys), set(expect)) + + # The search string is matched against a number of + # minimally processed fields. + assert_field_names( + minimal, + 'title.minimal', 'author', 'series.minimal' + ) - # Query with fuzzy blacklist keyword - query = search.make_query("basketball") + # The search string is matched more or less as-is against + # the title alone... + assert_field_names(standard_title, 'title.standard') + + # ... and the author alone. + assert_field_names(standard_author, 'author.standard') + + # The search string is matched fuzzily against a number of + # minimally-processed fields such as title and publisher. + fuzzy_query = fuzzy['multi_match'] + eq_('AUTO', fuzzy_query['fuzziness']) + fuzzy_fields = fuzzy_query['fields'] + assert 'title.minimal^4' in fuzzy_fields + assert 'author^4' in fuzzy_fields + assert 'publisher' in fuzzy_fields + + # Of those fields, the single one that best matches the search + # request is chosen to represent this match technique's score. + # https://www.elastic.co/guide/en/elasticsearch/guide/current/_best_fields.html + eq_('best_fields', fuzzy_query['type']) + + # If we create a query using a fuzzy blacklist keyword... + query = search.make_query("basketball") must = query['dis_max']['queries'] - eq_(2, len(must)) + # ... the fuzzy match technique is not used because it's too + # unreliable. + eq_(4, len(must)) # Query with genre query = search.make_query("test romance") must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test romance", full_query['query']) assert "title^4" in full_query['fields'] assert 'publisher' in full_query['fields'] - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) genre_query = classification_query[0]['match'] assert 'genres.name' in genre_query @@ -835,9 +1074,9 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) fiction_query = classification_query[0]['match'] assert 'fiction' in fiction_query @@ -853,9 +1092,9 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(3, len(classification_query)) genre_query = classification_query[0]['match'] assert 'genres.name' in genre_query @@ -874,11 +1113,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test young adult", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) audience_query = classification_query[0]['match'] assert 'audience' in audience_query @@ -892,11 +1131,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test grade 6", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) grade_query = classification_query[0]['bool'] assert 'must' in grade_query @@ -915,11 +1154,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test 5-10 years", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) grade_query = classification_query[0]['bool'] assert 'must' in grade_query