From 2d88fc1a24dd8c1d06131c8fc991d804d4fc1459 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 28 Nov 2017 09:37:02 -0500 Subject: [PATCH 01/20] Added a (failing) test describing the failure condition people are seeing. --- tests/test_external_search.py | 45 +++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/tests/test_external_search.py b/tests/test_external_search.py index 9090ae91b..af0c6175b 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -230,6 +230,10 @@ def _work(*args, **kwargs): self.les_mis.presentation_edition.title = u"Les Mis\u00E9rables" self.les_mis.set_presentation_ready() + self.modern_romance = _work() + self.modern_romance.presentation_edition.title = u"Modern Romance" + self.modern_romance.set_presentation_ready() + self.lincoln = _work(genre="Biography & Memoir", title="Abraham Lincoln") self.lincoln.set_presentation_ready() @@ -248,7 +252,13 @@ def _work(*args, **kwargs): self.adult_work = _work(title="Still Alice", audience=Classifier.AUDIENCE_ADULT) self.adult_work.set_presentation_ready() - self.ya_romance = _work(audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance") + self.ya_romance = _work( + title="Gumby In Love", + audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" + ) + self.ya_romance.presentation_edition.subtitle = ( + "Modern Fairytale Series, Book 3" + ) self.ya_romance.set_presentation_ready() self.no_age = _work() @@ -467,11 +477,32 @@ def query(*args, **kwargs): # Matches genre - results = query("romance", None, None, None, None, None, None, None) - hits = results["hits"]["hits"] - eq_(1, len(hits)) - eq_(unicode(self.ya_romance.id), hits[0]['_id']) + def expect_ids(works, *query_args): + original_query_args = list(query_args) + query_args = list(original_query_args) + while len(query_args) < 8: + query_args.append(None) + results = query(*query_args) + hits = results["hits"]["hits"] + expect = [unicode(x.id) for x in works] + actual = [x['_id'] for x in hits] + expect_titles = ", ".join([x.title for x in works]) + actual_titles = ", ".join([x['_source']['title'] for x in hits]) + eq_( + expect, actual, + "Query args %r did not find %d works (%s), instead found %d (%s)" % ( + original_query_args, len(expect), expect_titles, + len(actual), actual_titles + ) + ) + # Search by genre. The name of the genre also shows up in the + # title of a book, but the genre comes up first. + expect_ids([self.ya_romance, self.modern_romance], "romance") + + # A full title match takes precedence over a match that's + # split across genre and subtitle. + expect_ids([self.modern_romance, self.ya_romance], "modern romance") # Matches audience @@ -753,6 +784,10 @@ def query(*args, **kwargs): hits = results["hits"]["hits"] eq_(2, len(hits)) + # + # Test searching across collections. + # + # If we add the missing collection to the default library, "A # Tiny Book" starts showing up in searches against that # library. From b7f66838c12037b1ec0596df1dd42ee5636cde71 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 28 Nov 2017 12:41:46 -0500 Subject: [PATCH 02/20] Added a standalone test that runs in less time. --- external_search.py | 8 ++--- tests/test_external_search.py | 66 +++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 4 deletions(-) diff --git a/external_search.py b/external_search.py index faee1a947..c912183b3 100644 --- a/external_search.py +++ b/external_search.py @@ -324,9 +324,9 @@ def query_works(self, library, query_string, media, languages, exclude_languages ) if fields is not None: search_args['fields'] = fields - #print "Args looks like: %r" % args + print "Args looks like: %r" % search_args results = self.search(**search_args) - #print "Results: %r" % results + print "Results: %r" % results return results def make_query(self, query_string): @@ -527,7 +527,7 @@ def without_match(original_string, match): # However, it's possible that they're searching for a subject that's not # mentioned in the summary (eg, a person's name in a biography). So title # is a possible match, but is less important than author, subtitle, and summary. - match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^3", "summary^5", "title^1", "series^1"]) + match_rest_of_query = make_query_string_query(remaining_string, ["author^3", "subtitle^2", "summary^4", "title^1", "series^1"]) classification_queries.append(match_rest_of_query) # If classification queries and the remaining string all match, the result will @@ -536,7 +536,7 @@ def without_match(original_string, match): match_classification_and_rest_of_query = { 'bool': { 'must': classification_queries, - 'boost': 200.0 + 'boost': 100 } } diff --git a/tests/test_external_search.py b/tests/test_external_search.py index af0c6175b..6643e86a8 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -818,6 +818,72 @@ def expect_ids(works, *query_args): ] eq_(set(collections), set(expect_collections)) +class TestModernRomance(ExternalSearchTest): + + def setup(self): + super(TestModernRomance, self).setup() + def _work(*args, **kwargs): + """Convenience method to create a work with a license pool + in the default collection. + """ + return self._work( + *args, with_license_pool=True, + collection=self._default_collection, **kwargs + ) + + self.modern_romance = _work() + self.modern_romance.presentation_edition.title = u"Modern Romance" + self.modern_romance.set_presentation_ready() + + self.ya_romance = _work( + title="Gumby In Love", + audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" + ) + self.ya_romance.presentation_edition.subtitle = ( + "Modern Fairytale Series, Book 3" + ) + self.ya_romance.set_presentation_ready() + + # Add all the works created in the setup to the search index. + SearchIndexCoverageProvider( + self._db, search_index_client=self.search + ).run_once_and_update_timestamp() + + # Sleep to give the index time to catch up. + time.sleep(2) + + def test_modern_romance(self): + + # Convenience method to query the default library. + def query(*args, **kwargs): + return self.search.query_works( + self._default_library, *args, **kwargs + ) + + def expect_ids(works, *query_args): + original_query_args = list(query_args) + query_args = list(original_query_args) + while len(query_args) < 8: + query_args.append(None) + results = query(*query_args) + hits = results["hits"]["hits"] + expect = [unicode(x.id) for x in works] + actual = [x['_id'] for x in hits] + expect_titles = ", ".join([x.title for x in works]) + actual_titles = ", ".join([x['_source']['title'] for x in hits]) + eq_( + expect, actual, + "Query args %r did not find %d works (%s), instead found %d (%s)" % ( + original_query_args, len(expect), expect_titles, + len(actual), actual_titles + ) + ) + + # A full title match takes precedence over a match that's + # split across genre and subtitle. + expect_ids([self.modern_romance, self.ya_romance], "modern romance") + + class TestSearchQuery(DatabaseTest): def test_make_query(self): From 19eb0477e7dbd25da4a09c389f112d0b9cb84202 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 Nov 2017 15:33:33 -0500 Subject: [PATCH 03/20] This might work. --- external_search.py | 9 +++++++-- tests/test_external_search.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/external_search.py b/external_search.py index cd3ef83ef..cce8ba1ce 100644 --- a/external_search.py +++ b/external_search.py @@ -339,7 +339,7 @@ def make_query_string_query(query_string, fields): } } - def make_phrase_query(query_string, fields): + def make_phrase_query(query_string, fields, boost=100): field_queries = [] for field in fields: field_query = { @@ -352,7 +352,7 @@ def make_phrase_query(query_string, fields): 'bool': { 'should': field_queries, 'minimum_should_match': 1, - 'boost': 100, + 'boost': boost, } } @@ -453,6 +453,11 @@ def make_target_age_query(target_age): match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal']) must_match_options.append(match_phrase) + match_title = make_phrase_query(query_string, ['title.minimal']) + must_match_options.append(match_title) + match_author = make_phrase_query(query_string, ['author.minimal']) + must_match_options.append(match_author) + if not fuzzy_blacklist_re.search(query_string): fuzzy_query = make_fuzzy_query(query_string, fuzzy_fields) must_match_options.append(fuzzy_query) diff --git a/tests/test_external_search.py b/tests/test_external_search.py index 38c7aa1cf..d0228f665 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -878,6 +878,8 @@ def expect_ids(works, *query_args): # split across genre and subtitle. expect_ids([self.modern_romance, self.ya_romance], "modern romance") + expect_ids([self.modern_romance, self.ya_romance], "modern romance") + class TestSearchQuery(DatabaseTest): From eb08061189b9fdaccd11c59981e4b3037d22672e Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 Nov 2017 16:30:09 -0500 Subject: [PATCH 04/20] Got a test that I'm more or less happy with. --- external_search.py | 6 +- tests/test_external_search.py | 105 ++++++++++++++++++++++++++-------- 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/external_search.py b/external_search.py index cce8ba1ce..4ec01cab2 100644 --- a/external_search.py +++ b/external_search.py @@ -453,9 +453,11 @@ def make_target_age_query(target_age): match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal']) must_match_options.append(match_phrase) - match_title = make_phrase_query(query_string, ['title.minimal']) + # An exact title or author match outweighs a match that is split + # across fields. + match_title = make_phrase_query(query_string, ['title.minimal'], 150) must_match_options.append(match_title) - match_author = make_phrase_query(query_string, ['author.minimal']) + match_author = make_phrase_query(query_string, ['author.minimal'], 150) must_match_options.append(match_author) if not fuzzy_blacklist_re.search(query_string): diff --git a/tests/test_external_search.py b/tests/test_external_search.py index d0228f665..039edaddd 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -63,6 +63,17 @@ def teardown(self): ExternalSearchIndex.reset() super(ExternalSearchTest, self).teardown() + def default_work(self, *args, **kwargs): + """Convenience method to create a work with a license pool + in the default collection. + """ + work = self._work( + *args, with_license_pool=True, + collection=self._default_collection, **kwargs + ) + work.set_presentation_ready() + return work + class TestExternalSearch(ExternalSearchTest): @@ -180,15 +191,7 @@ class TestExternalSearchWithWorks(ExternalSearchTest): def setup(self): super(TestExternalSearchWithWorks, self).setup() - - def _work(*args, **kwargs): - """Convenience method to create a work with a license pool - in the default collection. - """ - return self._work( - *args, with_license_pool=True, - collection=self._default_collection, **kwargs - ) + _work = self.default_work if self.search: @@ -813,31 +816,61 @@ def expect_ids(works, *query_args): ] eq_(set(collections), set(expect_collections)) -class TestModernRomance(ExternalSearchTest): +class TestExactMatches(ExternalSearchTest): + """Verify that exact or near-exact title and author matches are + privileged over matches that span fields. + """ def setup(self): - super(TestModernRomance, self).setup() - def _work(*args, **kwargs): - """Convenience method to create a work with a license pool - in the default collection. - """ - return self._work( - *args, with_license_pool=True, - collection=self._default_collection, **kwargs - ) + super(TestExactMatches, self).setup() + _work = self.default_work - self.modern_romance = _work() - self.modern_romance.presentation_edition.title = u"Modern Romance" - self.modern_romance.set_presentation_ready() + # Here the title is 'Modern Romance' + self.modern_romance = _work( + title="Modern Romance", + authors=["Aziz Ansari", "Eric Klinenberg"], + ) + # Here 'Modern' is in the subtitle and 'Romance' is the genre. self.ya_romance = _work( title="Gumby In Love", + authors="Pokey", audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" ) self.ya_romance.presentation_edition.subtitle = ( "Modern Fairytale Series, Book 3" ) - self.ya_romance.set_presentation_ready() + + # TODO: Uncomment these lines and the 'modern romance' + # test fails for some reason. + # self.parent_book = _work( + # title="Our Son Aziz", + # authors=["Fatima Ansari", "Shoukath Ansari"], + # genre="Biography & Memoir", + # ) + + self.behind_the_scenes = _work( + title="The Making of Biography With Peter Graves", + genre="Entertainment", + ) + + self.biography_of_peter_graves = _work( + "He Is Peter Graves", + authors="Kelly Ghostwriter", + genre="Biography & Memoir", + ) + + self.book_by_peter_graves = _work( + title="My Experience At The University of Minnesota", + authors="Peter Graves", + genre="Entertainment", + ) + + self.book_by_someone_else = _work( + title="The Deadly Graves", + authors="Peter Ansari", + genre="Mystery" + ) # Add all the works created in the setup to the search index. SearchIndexCoverageProvider( @@ -847,7 +880,7 @@ def _work(*args, **kwargs): # Sleep to give the index time to catch up. time.sleep(2) - def test_modern_romance(self): + def test_exact_matches(self): # Convenience method to query the default library. def query(*args, **kwargs): @@ -878,8 +911,30 @@ def expect_ids(works, *query_args): # split across genre and subtitle. expect_ids([self.modern_romance, self.ya_romance], "modern romance") - expect_ids([self.modern_romance, self.ya_romance], "modern romance") + # A full author match takes precedence over a partial author + # match. + expect_ids([self.modern_romance, self.book_by_someone_else], + "aziz ansari") + + # When a string exactly matches both a title and an author, + # the books that match exactly are promoted. + expect_ids( + [self.biography_of_peter_graves, self.behind_the_scenes, + self.book_by_peter_graves, self.book_by_someone_else], + "peter graves" + ) + # 'The Making of Biography With Peter Graves' does worse in a + # search for 'peter graves biography' than a biography whose + # title includes the phrase 'peter graves'. Although the title + # contains all three search terms, it's not an exact token + # match. But "The Making of..." still does better than book + # that matches the query string against two different fields. + expect_ids( + [self.biography_of_peter_graves, self.book_by_peter_graves, + self.behind_the_scenes, self.book_by_someone_else], + "peter graves biography" + ) class TestSearchQuery(DatabaseTest): From 68f93d0da7bfca1c728adc0bfbaf5e8e3a1f689c Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 09:17:41 -0500 Subject: [PATCH 05/20] Got the tests working in the case where the tests are running on one shard. --- external_search.py | 16 +++++++++---- tests/test_external_search.py | 43 ++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/external_search.py b/external_search.py index 4ec01cab2..845bf239c 100644 --- a/external_search.py +++ b/external_search.py @@ -324,9 +324,10 @@ def query_works(self, library, query_string, media, languages, fiction, audience ) if fields is not None: search_args['fields'] = fields - print "Args looks like: %r" % search_args + # search_args['explain'] = True + # print "Args looks like: %r" % search_args results = self.search(**search_args) - print "Results: %r" % results + # print "Results: %r" % results return results def make_query(self, query_string): @@ -455,9 +456,9 @@ def make_target_age_query(target_age): # An exact title or author match outweighs a match that is split # across fields. - match_title = make_phrase_query(query_string, ['title.minimal'], 150) + match_title = make_phrase_query(query_string, ['title.standard'], 200) must_match_options.append(match_title) - match_author = make_phrase_query(query_string, ['author.minimal'], 150) + match_author = make_phrase_query(query_string, ['author.standard'], 200) must_match_options.append(match_author) if not fuzzy_blacklist_re.search(query_string): @@ -802,7 +803,12 @@ def v2_body(cls): "fields": { "minimal": { "type": "string", - "analyzer": "en_minimal_analyzer"}}} + "analyzer": "en_minimal_analyzer"}, + "standard": { + "type": "string", + "analyzer": "standard" + } + }} ) mappings = { ExternalSearchIndex.work_document_type : mapping } diff --git a/tests/test_external_search.py b/tests/test_external_search.py index 039edaddd..f6327e75d 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -843,11 +843,11 @@ def setup(self): # TODO: Uncomment these lines and the 'modern romance' # test fails for some reason. - # self.parent_book = _work( - # title="Our Son Aziz", - # authors=["Fatima Ansari", "Shoukath Ansari"], - # genre="Biography & Memoir", - # ) + self.parent_book = _work( + title="Our Son Aziz", + authors=["Fatima Ansari", "Shoukath Ansari"], + genre="Biography & Memoir", + ) self.behind_the_scenes = _work( title="The Making of Biography With Peter Graves", @@ -909,12 +909,25 @@ def expect_ids(works, *query_args): # A full title match takes precedence over a match that's # split across genre and subtitle. - expect_ids([self.modern_romance, self.ya_romance], "modern romance") + expect_ids( + [ + self.modern_romance, # "modern romance" in title + self.ya_romance # "modern" in subtitle, genre "romance" + ], + "modern romance" + ) # A full author match takes precedence over a partial author - # match. - expect_ids([self.modern_romance, self.book_by_someone_else], - "aziz ansari") + # match. A partial author match that matches the entire search + # string takes precedence over a partial author match that doesn't. + expect_ids( + [ + self.modern_romance, # "Aziz Ansari" in author + self.parent_book, # "Aziz" in title, "Ansari" in author + self.book_by_someone_else # "Ansari" in author + ], + "aziz ansari" + ) # When a string exactly matches both a title and an author, # the books that match exactly are promoted. @@ -928,11 +941,15 @@ def expect_ids(works, *query_args): # search for 'peter graves biography' than a biography whose # title includes the phrase 'peter graves'. Although the title # contains all three search terms, it's not an exact token - # match. But "The Making of..." still does better than book - # that matches the query string against two different fields. + # match. But "The Making of..." still does better than + # books that match 'peter graves' (or 'peter' and 'graves'), + # but not 'biography'. expect_ids( - [self.biography_of_peter_graves, self.book_by_peter_graves, - self.behind_the_scenes, self.book_by_someone_else], + [self.biography_of_peter_graves, # title + genre 'biography' + self.behind_the_scenes, # all words match in title + self.book_by_peter_graves, # author (no 'biography') + self.book_by_someone_else # title + author (no 'biography') + ], "peter graves biography" ) From e9eed6fe4637ca8dab8125bbbd3760f43f2ee05e Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 09:41:39 -0500 Subject: [PATCH 06/20] First attempt at configuring Elasticsearch within Travis. --- .travis.yml | 8 +++++--- elasticsearch.yml.testing | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 elasticsearch.yml.testing diff --git a/.travis.yml b/.travis.yml index da8a207dc..8cb95007f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,10 +33,12 @@ install: - pip install -r requirements.txt - python -m textblob.download_corpora - cp config.json.sample config.json - - export SIMPLIFIED_CONFIGURATION_FILE="$TRAVIS_BUILD_DIR/config.json" + - export SIMPLIFIED_CONFIGURATION_FILE="${TRAVIS_BUILD_DIR}/config.json" - wget ${ES_DOWNLOAD} - - tar -xzf elasticsearch-${ES_VERSION}.tar.gz - - ./elasticsearch-${ES_VERSION}/bin/elasticsearch & + - export ES_PATH=elasticsearch-${ES_VERSION} + - tar -xzf ${ES_PATH}.tar.gz + - cp elasticsearch.yml.testing ./${ES_PATH}/config + - ./${ES_PATH}/bin/elasticsearch & before_script: - psql -c 'create user simplified_test;' -U postgres diff --git a/elasticsearch.yml.testing b/elasticsearch.yml.testing new file mode 100644 index 000000000..0b1514d2a --- /dev/null +++ b/elasticsearch.yml.testing @@ -0,0 +1,5 @@ +# Scores for search results are calculated on a per-shard basis. +# Keeping everything in one shard while running unit testsensures that +# we have complete control over how the scores are calculated. +index.number_of_shards: 1 +index.number_of_replicas: 0 From 5e19ffb47f167959a8f881b3a2ec1d0c7d5fa63d Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 09:44:18 -0500 Subject: [PATCH 07/20] When copying the Elasticsearch configuration, give it the right extension. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8cb95007f..91a065566 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,7 +37,7 @@ install: - wget ${ES_DOWNLOAD} - export ES_PATH=elasticsearch-${ES_VERSION} - tar -xzf ${ES_PATH}.tar.gz - - cp elasticsearch.yml.testing ./${ES_PATH}/config + - cp elasticsearch.yml.testing ./${ES_PATH}/config/elasticsearch.yml - ./${ES_PATH}/bin/elasticsearch & before_script: From 88efecb3653647c9680512232e1992519f59c470 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 10:48:41 -0500 Subject: [PATCH 08/20] Fix, improve and document the test that explains how a search query object is created. --- external_search.py | 6 +-- tests/test_external_search.py | 96 +++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/external_search.py b/external_search.py index 845bf239c..7ea83f620 100644 --- a/external_search.py +++ b/external_search.py @@ -456,9 +456,9 @@ def make_target_age_query(target_age): # An exact title or author match outweighs a match that is split # across fields. - match_title = make_phrase_query(query_string, ['title.standard'], 200) + match_title = make_phrase_query(query_string, ['title.standard'], 100) must_match_options.append(match_title) - match_author = make_phrase_query(query_string, ['author.standard'], 200) + match_author = make_phrase_query(query_string, ['author.standard'], 100) must_match_options.append(match_author) if not fuzzy_blacklist_re.search(query_string): @@ -535,7 +535,7 @@ def without_match(original_string, match): # However, it's possible that they're searching for a subject that's not # mentioned in the summary (eg, a person's name in a biography). So title # is a possible match, but is less important than author, subtitle, and summary. - match_rest_of_query = make_query_string_query(remaining_string, ["author^3", "subtitle^2", "summary^4", "title^1", "series^1"]) + match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^2", "summary^4", "title^1", "series^1"]) classification_queries.append(match_rest_of_query) # If classification queries and the remaining string all match, the result will diff --git a/tests/test_external_search.py b/tests/test_external_search.py index f6327e75d..37de03be8 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -260,7 +260,7 @@ def setup(self): audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" ) self.ya_romance.presentation_edition.subtitle = ( - "Modern Fairytale Series, Book 3" + "Modern Fairytale Series, Book 7" ) self.ya_romance.set_presentation_ready() @@ -503,10 +503,6 @@ def expect_ids(works, *query_args): # title of a book, but the genre comes up first. expect_ids([self.ya_romance, self.modern_romance], "romance") - # A full title match takes precedence over a match that's - # split across genre and subtitle. - expect_ids([self.modern_romance, self.ya_romance], "modern romance") - # Matches audience results = query("children's", None, None, None, None, None, None, None) @@ -962,39 +958,85 @@ def test_make_query(self): # Basic query query = search.make_query("test") + # The query runs a number of matching techniques on a search + # result and picks the best one. must = query['dis_max']['queries'] - eq_(3, len(must)) - stemmed_query = must[0]['simple_query_string'] + # Here are the matching techniques. + stemmed, minimal, standard_title, standard_author, fuzzy = must + + # The search string is stemmed and matched against a number + # of fields such as title and publisher. + stemmed_query = stemmed['simple_query_string'] eq_("test", stemmed_query['query']) assert "title^4" in stemmed_query['fields'] assert 'publisher' in stemmed_query['fields'] - phrase_queries = must[1]['bool']['should'] - eq_(3, len(phrase_queries)) - title_phrase_query = phrase_queries[0]['match_phrase'] - assert 'title.minimal' in title_phrase_query - eq_("test", title_phrase_query['title.minimal']) + def assert_field_names(query, *expect): + """Validate that a query is set up to match the string 'test' + against one or more of a number of fields. + """ + actual_keys = set() + + # For this part of the query to be considered successful, + # it's sufficient for a single field to match. + eq_(1, query['bool']['minimum_should_match']) + + for possibility in query['bool']['should']: + [(key, value)] = possibility['match_phrase'].items() + actual_keys.add(key) + eq_(value, 'test') + eq_(set(actual_keys), set(expect)) + + # The search string is matched against a number of + # minimally processed fields. + assert_field_names( + minimal, + 'title.minimal', 'author', 'series.minimal' + ) - # Query with fuzzy blacklist keyword - query = search.make_query("basketball") + # The search string is matched more or less as-is against + # the title alone... + assert_field_names(standard_title, 'title.standard') + + # ... and the author alone. + assert_field_names(standard_author, 'author.standard') + + # The search string is matched fuzzily against a number of + # minimally-processed fields such as title and publisher. + fuzzy_query = fuzzy['multi_match'] + eq_('AUTO', fuzzy_query['fuzziness']) + + fuzzy_fields = fuzzy_query['fields'] + assert 'title.minimal^4' in fuzzy_fields + assert 'author^4' in fuzzy_fields + assert 'publisher' in fuzzy_fields + # Of those fields, the single one that best matches the search + # request is chosen to represent this match technique's score. + # https://www.elastic.co/guide/en/elasticsearch/guide/current/_best_fields.html + eq_('best_fields', fuzzy_query['type']) + + # If we create a query using a fuzzy blacklist keyword... + query = search.make_query("basketball") must = query['dis_max']['queries'] - eq_(2, len(must)) + # ... the fuzzy match technique is not used because it's too + # unreliable. + eq_(4, len(must)) # Query with genre query = search.make_query("test romance") must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test romance", full_query['query']) assert "title^4" in full_query['fields'] assert 'publisher' in full_query['fields'] - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) genre_query = classification_query[0]['match'] assert 'genres.name' in genre_query @@ -1010,9 +1052,9 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) fiction_query = classification_query[0]['match'] assert 'fiction' in fiction_query @@ -1028,9 +1070,9 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(3, len(classification_query)) genre_query = classification_query[0]['match'] assert 'genres.name' in genre_query @@ -1049,11 +1091,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test young adult", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) audience_query = classification_query[0]['match'] assert 'audience' in audience_query @@ -1067,11 +1109,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test grade 6", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) grade_query = classification_query[0]['bool'] assert 'must' in grade_query @@ -1090,11 +1132,11 @@ def test_make_query(self): must = query['dis_max']['queries'] - eq_(4, len(must)) + eq_(6, len(must)) full_query = must[0]['simple_query_string'] eq_("test 5-10 years", full_query['query']) - classification_query = must[3]['bool']['must'] + classification_query = must[5]['bool']['must'] eq_(2, len(classification_query)) grade_query = classification_query[0]['bool'] assert 'must' in grade_query From 40fefd69412f0e8c8df31904ecf1bb2e09bb2c55 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 10:53:56 -0500 Subject: [PATCH 09/20] Got the rest of the tests to pass. --- tests/test_external_search.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_external_search.py b/tests/test_external_search.py index 37de03be8..c3dc3369c 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -260,7 +260,7 @@ def setup(self): audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" ) self.ya_romance.presentation_edition.subtitle = ( - "Modern Fairytale Series, Book 7" + "Modern Fairytale Series, Volume 7" ) self.ya_romance.set_presentation_ready() @@ -558,9 +558,14 @@ def expect_ids(works, *query_args): results = query("young adult romance", None, None, None, None, None, None, None) hits = results["hits"]["hits"] - eq_(1, len(hits)) - eq_(unicode(self.ya_romance.id), hits[0]['_id']) + # The book with 'Romance' in the title also shows up, but it + # shows up after the book whose audience matches 'young adult' + # and whose genre matches 'romance'. + eq_( + map(unicode, [self.ya_romance.id, self.modern_romance.id]), + [x['_id'] for x in hits] + ) # Matches age + fiction From 9627d5d4922a4bfcd8dd7148401dbdd7bcb6fe26 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 11:37:55 -0500 Subject: [PATCH 10/20] Separated out the v2 and v3 versions of the body. --- external_search.py | 62 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/external_search.py b/external_search.py index 7ea83f620..a1f47aba8 100644 --- a/external_search.py +++ b/external_search.py @@ -738,7 +738,7 @@ def remove_work(self, work): class ExternalSearchIndexVersions(object): - VERSIONS = ['v2'] + VERSIONS = ['v2', 'v3'] @classmethod def latest(cls): @@ -760,8 +760,11 @@ def map_fields(cls, fields, field_description): return mapping @classmethod - def v2_body(cls): - + def v3_body(cls): + """The v3 body is the same as the v2 except for the inclusion of the + '.standard' version of fields, analyzed using the standard + analyzer for near-exact matches. + """ settings = { "analysis": { "filter": { @@ -814,6 +817,57 @@ def v2_body(cls): return dict(settings=settings, mappings=mappings) + @classmethod + def v2_body(cls): + + settings = { + "analysis": { + "filter": { + "en_stop_filter": { + "type": "stop", + "stopwords": ["_english_"] + }, + "en_stem_filter": { + "type": "stemmer", + "name": "english" + }, + "en_stem_minimal_filter": { + "type": "stemmer", + "name": "english" + }, + }, + "analyzer" : { + "en_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_filter"] + }, + "en_minimal_analyzer": { + "type": "custom", + "char_filter": ["html_strip"], + "tokenizer": "standard", + "filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_minimal_filter"] + }, + } + } + } + + mapping = cls.map_fields( + fields=["title", "series", "subtitle", "summary", "classifications.term"], + field_description={ + "type": "string", + "analyzer": "en_analyzer", + "fields": { + "minimal": { + "type": "string", + "analyzer": "en_minimal_analyzer"}, + }} + ) + mappings = { ExternalSearchIndex.work_document_type : mapping } + + return dict(settings=settings, mappings=mappings) + @classmethod def create_new_version(cls, search_client, base_index_name, version=None): """Creates an index for a new version @@ -886,7 +940,7 @@ class SearchIndexMonitor(WorkSweepMonitor): def __init__(self, _db, collection, index_name=None, index_client=None, **kwargs): super(SearchIndexMonitor, self).__init__(_db, collection, **kwargs) - + if index_client: # This would only happen during a test. self.search_index_client = index_client From 70ef0754c299058f6ee5ce6d4db5870507be3df9 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 11:55:09 -0500 Subject: [PATCH 11/20] Separated out the v2 and v3 versions of the body. --- external_search.py | 71 ++++++++++++----------------------- tests/test_external_search.py | 4 +- 2 files changed, 25 insertions(+), 50 deletions(-) diff --git a/external_search.py b/external_search.py index a1f47aba8..c2e3a0502 100644 --- a/external_search.py +++ b/external_search.py @@ -30,10 +30,9 @@ class ExternalSearchIndex(object): NAME = ExternalIntegration.ELASTICSEARCH - WORKS_INDEX_KEY = u'works_index' - WORKS_ALIAS_KEY = u'works_alias' + WORKS_INDEX_PREFIX_KEY = u'works_index_prefix' - DEFAULT_WORKS_INDEX = u'works' + DEFAULT_WORKS_INDEX_PREFIX = u'works' work_document_type = 'work-type' __client = None @@ -43,7 +42,10 @@ class ExternalSearchIndex(object): SETTINGS = [ { "key": ExternalIntegration.URL, "label": _("URL") }, - { "key": WORKS_INDEX_KEY, "label": _("Works index"), "default": DEFAULT_WORKS_INDEX }, + { "key": WORKS_INDEX_PREFIX_KEY, "label": _("Index prefix"), + "default": DEFAULT_WORKS_INDEX_PREFIX, + "description": _("Any Elasticsearch indexes needed for this application will be created with this unique prefix. In most cases, the default will work fine. You may need to change this if you have multiple application servers using a single Elasticsearch server.") + }, ] SITEWIDE = True @@ -67,12 +69,19 @@ def search_integration(cls, _db): @classmethod def works_index_name(cls, _db): - """Look up the name of the search index.""" + """Look up the name of the search index. + + It's possible, but unlikely, that the search index alias will + point to some other index. But if there were no indexes, and a + new one needed to be created, this would be the name of that + index. + """ integration = cls.search_integration(_db) if not integration: return None - setting = integration.setting(cls.WORKS_INDEX_KEY) - return setting.value_or_default(cls.DEFAULT_WORKS_INDEX) + setting = integration.setting(cls.WORKS_INDEX_PREFIX_KEY) + prefix = setting.value_or_default(cls.DEFAULT_WORKS_INDEX_PREFIX) + return prefix + '-' + ExternalSearchIndexVersions.latest() def __init__(self, _db, url=None, works_index=None): @@ -118,42 +127,14 @@ def __init__(self, _db, url=None, works_index=None): # Document upload runs against the works_index. # Search queries run against works_alias. if works_index and integration: - self.set_works_index_and_alias(works_index) + self.set_works_index_and_alias(_db, works_index) self.update_integration_settings(integration) def bulk(docs, **kwargs): return elasticsearch_bulk(self.__client, docs, **kwargs) self.bulk = bulk - - def update_integration_settings(self, integration, force=False): - """Updates the integration with an appropriate index and alias - setting if the index and alias have been updated. - """ - if not integration or not (self.works_index and self.works_alias): - return - - if self.works_index==self.works_alias: - # An index is being used as the alias. There is no alias - # to update with. - return - if integration.setting(self.WORKS_ALIAS_KEY).value and not force: - # This integration already has an alias and we don't want to - # force an update. - return - - index_or_alias = [self.works_index, self.works_alias] - if (integration.setting(self.WORKS_INDEX_KEY).value not in index_or_alias - and not force - ): - # This ExternalSearchIndex was created for a different index and - # alias, and we don't want to force an update. - return - - integration.setting(self.WORKS_INDEX_KEY).value = unicode(self.works_index) - integration.setting(self.WORKS_ALIAS_KEY).value = unicode(self.works_alias) - - def set_works_index_and_alias(self, current_alias): + def set_works_index_and_alias(self, _db, current_alias): """Finds or creates the works_index and works_alias based on provided configuration. """ @@ -171,17 +152,11 @@ def _set_works_index(name): # there is only one. _set_works_index(index_details.keys()[0]) else: - if current_alias.endswith(self.CURRENT_ALIAS_SUFFIX): - # The alias culled from configuration is intended to be - # a current alias, but an index with that alias wasn't - # found. Find or create an appropriate index. - base_index_name = self.base_index_name(current_alias) - new_index = base_index_name+'-'+ExternalSearchIndexVersions.latest() - _set_works_index(new_index) - else: - # Without the CURRENT_ALIAS_SUFFIX, assume the index string - # from config is the index itself and needs to be swapped. - _set_works_index(current_alias) + # The alias culled from configuration is intended to be + # a current alias, but an index with that alias wasn't + # found. Find or create an index with the name known to + # be right for this version. + _set_works_index(self.works_index_name(_db)) if not self.indices.exists(self.works_index): self.setup_index() diff --git a/tests/test_external_search.py b/tests/test_external_search.py index c3dc3369c..c91efaef6 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -44,7 +44,7 @@ def setup(self): ExternalIntegration.ELASTICSEARCH, goal=ExternalIntegration.SEARCH_GOAL, url=u'http://localhost:9200', - settings={ExternalSearchIndex.WORKS_INDEX_KEY : u'test_index-v0'} + settings={ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY : u'test_index'} ) try: @@ -108,7 +108,7 @@ def test_set_works_index_and_alias(self): # If -current alias is given but doesn't exist, the appropriate # index and alias will be created. - self.search.set_works_index_and_alias('banana-current') + self.search.set_works_index_and_alias(self._db, 'banana-current') expected_index = 'banana-' + ExternalSearchIndexVersions.latest() eq_(expected_index, self.search.works_index) From 4307f95311cd468f1f69f6e017e1d3d9958db605 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 12:30:20 -0500 Subject: [PATCH 12/20] Changed from having separate configuration settings for index and alias to having a single setting for the prefix. Index and alias are always calculated from the prefix. --- external_search.py | 83 +++++++++++++++++++++-------------- tests/test_external_search.py | 63 ++++++++++++++++---------- 2 files changed, 92 insertions(+), 54 deletions(-) diff --git a/external_search.py b/external_search.py index c2e3a0502..21c82993d 100644 --- a/external_search.py +++ b/external_search.py @@ -37,7 +37,7 @@ class ExternalSearchIndex(object): work_document_type = 'work-type' __client = None - CURRENT_ALIAS_SUFFIX = '-current' + CURRENT_ALIAS_SUFFIX = 'current' VERSION_RE = re.compile('-v([0-9]+)$') SETTINGS = [ @@ -67,6 +67,21 @@ def search_integration(cls, _db): goal=ExternalIntegration.SEARCH_GOAL ) + @classmethod + def works_prefixed(cls, _db, value): + """Prefix the given value with the prefix to use when generating index + and alias names. + + :return: A string "{prefix}-{value}", or None if no prefix is + configured. + """ + integration = cls.search_integration(_db) + if not integration: + return None + setting = integration.setting(cls.WORKS_INDEX_PREFIX_KEY) + prefix = setting.value_or_default(cls.DEFAULT_WORKS_INDEX_PREFIX) + return prefix + '-' + value + @classmethod def works_index_name(cls, _db): """Look up the name of the search index. @@ -76,12 +91,12 @@ def works_index_name(cls, _db): new one needed to be created, this would be the name of that index. """ - integration = cls.search_integration(_db) - if not integration: - return None - setting = integration.setting(cls.WORKS_INDEX_PREFIX_KEY) - prefix = setting.value_or_default(cls.DEFAULT_WORKS_INDEX_PREFIX) - return prefix + '-' + ExternalSearchIndexVersions.latest() + return cls.works_prefixed(_db, ExternalSearchIndexVersions.latest()) + + @classmethod + def works_alias_name(cls, _db): + """Look up the name of the search index alias.""" + return cls.works_prefixed(_db, cls.CURRENT_ALIAS_SUFFIX) def __init__(self, _db, url=None, works_index=None): @@ -127,53 +142,57 @@ def __init__(self, _db, url=None, works_index=None): # Document upload runs against the works_index. # Search queries run against works_alias. if works_index and integration: - self.set_works_index_and_alias(_db, works_index) - self.update_integration_settings(integration) + self.set_works_index_and_alias(_db) def bulk(docs, **kwargs): return elasticsearch_bulk(self.__client, docs, **kwargs) self.bulk = bulk - def set_works_index_and_alias(self, _db, current_alias): + def set_works_index_and_alias(self, _db): """Finds or creates the works_index and works_alias based on - provided configuration. + the current configuration. """ - if current_alias: - index_details = self.indices.get_alias(name=current_alias, ignore=[404]) - found = bool(index_details) and not (index_details.get('status')==404 or 'error' in index_details) - else: - found = False + + # We always know what the name of the alias _should_ be. + alias_name = self.works_alias_name(_db) + index_details = self.indices.get_alias(name=alias_name, ignore=[404]) + + # But the alias may not exist in Elasticsearch. + found = ( + bool(index_details) and not ( + index_details.get('status')==404 or 'error' in index_details + ) + ) def _set_works_index(name): self.works_index = self.__client.works_index = name if found: - # We found an index for the alias in configuration. Assume - # there is only one. + # The alias exists and points to a specific index. Assume + # there is only one such index. _set_works_index(index_details.keys()[0]) else: - # The alias culled from configuration is intended to be - # a current alias, but an index with that alias wasn't - # found. Find or create an index with the name known to - # be right for this version. + # The alias doesn't point to anything. The index name to + # use is the one known to be right for this version. _set_works_index(self.works_index_name(_db)) if not self.indices.exists(self.works_index): + # The index doesn't actually exist. Set it up. self.setup_index() - self.setup_current_alias() - def setup_current_alias(self): - """Finds or creates a works_alias based on the base works_index - name and ending in the expected CURRENT_ALIAS_SUFFIX. + # Make sure the alias points to the index we're using. + self.setup_current_alias(_db) + + def setup_current_alias(self, _db): + """Finds or creates the works_alias as named by the current site + settings. If the resulting alias exists and is affixed to a different index or if it can't be generated for any reason, the alias will not be created or moved. Instead, the search client will use the the works_index directly for search queries. """ - - base_works_index = self.base_index_name(self.works_index) - alias_name = base_works_index+self.CURRENT_ALIAS_SUFFIX + alias_name = self.works_alias_name(_db) exists = self.indices.exists_alias(name=alias_name) def _set_works_alias(name): @@ -219,7 +238,7 @@ def setup_index(self, new_index=None): body = ExternalSearchIndexVersions.latest_body() self.indices.create(index=index, body=body) - def transfer_current_alias(self, new_index): + def transfer_current_alias(self, _db, new_index): """Force -current alias onto a new index""" if not self.indices.exists(index=new_index): raise ValueError( @@ -235,11 +254,11 @@ def transfer_current_alias(self, new_index): "is the same.") % (new_index, self.works_index)) self.works_index = self.__client.works_index = new_index - alias_name = self.base_index_name(new_index)+self.CURRENT_ALIAS_SUFFIX + alias_name = self.works_alias_name(_db) exists = self.indices.exists_alias(name=alias_name) if not exists: - self.setup_current_alias() + self.setup_current_alias(_db) return exists_on_works_index = self.indices.get_alias( diff --git a/tests/test_external_search.py b/tests/test_external_search.py index c91efaef6..bb14f673d 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -78,9 +78,13 @@ def default_work(self, *args, **kwargs): class TestExternalSearch(ExternalSearchTest): def test_works_index_name(self): + """The name of the search index is the prefix (defined in + ExternalSearchTest.setup) plus a version number associated + with this version of the core code. + """ if not self.search: return - eq_("test_index-v0", self.search.works_index_name(self._db)) + eq_("test_index-v3", self.search.works_index_name(self._db)) def test_setup_index_creates_new_index(self): if not self.search: @@ -97,7 +101,7 @@ def test_setup_index_creates_new_index(self): eq_(current_index, self.search.works_index) # The alias hasn't been passed over to the new index. - alias = 'test_index' + self.search.CURRENT_ALIAS_SUFFIX + alias = 'test_index-' + self.search.CURRENT_ALIAS_SUFFIX eq_(alias, self.search.works_alias) eq_(True, self.search.indices.exists_alias(current_index, alias)) eq_(False, self.search.indices.exists_alias('the_other_index', alias)) @@ -106,25 +110,34 @@ def test_set_works_index_and_alias(self): if not self.search: return - # If -current alias is given but doesn't exist, the appropriate - # index and alias will be created. - self.search.set_works_index_and_alias(self._db, 'banana-current') + # If the index or alias don't exist, set_works_index_and_alias + # will create them. + self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY, u'banana') + self.search.set_works_index_and_alias(self._db) expected_index = 'banana-' + ExternalSearchIndexVersions.latest() + expected_alias = 'banana-' + self.search.CURRENT_ALIAS_SUFFIX + eq_(expected_index, self.search.works_index) + eq_(expected_alias, self.search.works_alias) + + # If the index and alias already exist, set_works_index_and_alias + # does nothing. + self.search.set_works_index_and_alias(self._db) eq_(expected_index, self.search.works_index) - eq_('banana-current', self.search.works_alias) + eq_(expected_alias, self.search.works_alias) def test_setup_current_alias(self): if not self.search: return # The index was generated from the string in configuration. - index_name = 'test_index-v0' + version = ExternalSearchIndexVersions.VERSIONS[-1] + index_name = 'test_index-' + version eq_(index_name, self.search.works_index) eq_(True, self.search.indices.exists(index_name)) # The alias is also created from the configuration. - alias = 'test_index' + self.search.CURRENT_ALIAS_SUFFIX + alias = 'test_index-' + self.search.CURRENT_ALIAS_SUFFIX eq_(alias, self.search.works_alias) eq_(True, self.search.indices.exists_alias(index_name, alias)) @@ -132,19 +145,22 @@ def test_setup_current_alias(self): # won't be reassigned. Instead, search will occur against the # index itself. ExternalSearchIndex.reset() - self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_KEY, u'test_index-v100') + self.integration.set_setting(ExternalSearchIndex.WORKS_INDEX_PREFIX_KEY, u'my-app') self.search = ExternalSearchIndex(self._db) - eq_('test_index-v100', self.search.works_index) - eq_('test_index-v100', self.search.works_alias) + eq_('my-app-%s' % version, self.search.works_index) + eq_('my-app-' + self.search.CURRENT_ALIAS_SUFFIX, self.search.works_alias) def test_transfer_current_alias(self): if not self.search: return - # If the index doesn't exist, an error is raised. + # An error is raised if you try to set the alias to point to + # an index that doesn't already exist. assert_raises( - ValueError, self.search.transfer_current_alias, 'test_index-v3') + ValueError, self.search.transfer_current_alias, self._db, + 'no-such-index' + ) original_index = self.search.works_index @@ -153,34 +169,37 @@ def test_transfer_current_alias(self): self.search.indices.delete_alias( index=original_index, name='test_index-current' ) - self.search.setup_index(new_index='test_index-v100') - self.search.transfer_current_alias('test_index-v100') - eq_('test_index-v100', self.search.works_index) + self.search.setup_index(new_index='test_index-v9999') + self.search.transfer_current_alias(self._db, 'test_index-v9999') + eq_('test_index-v9999', self.search.works_index) eq_('test_index-current', self.search.works_alias) # If the -current alias already exists on the index, # it's used without a problem. - self.search.transfer_current_alias('test_index-v100') - eq_('test_index-v100', self.search.works_index) + self.search.transfer_current_alias(self._db, 'test_index-v9999') + eq_('test_index-v9999', self.search.works_index) eq_('test_index-current', self.search.works_alias) # If the -current alias is being used on a different version of the # index, it's deleted from that index and placed on the new one. self.search.setup_index(original_index) - self.search.transfer_current_alias(original_index) + self.search.transfer_current_alias(self._db, original_index) eq_(original_index, self.search.works_index) eq_('test_index-current', self.search.works_alias) # It has been removed from other index. eq_(False, self.search.indices.exists_alias( - index='test_index-v100', name='test_index-current')) + index='test_index-v9999', name='test_index-current')) + # And only exists on the new index. alias_indices = self.search.indices.get_alias(name='test_index-current').keys() - eq_(['test_index-v0'], alias_indices) + eq_([original_index], alias_indices) # If the index doesn't have the same base name, an error is raised. assert_raises( - ValueError, self.search.transfer_current_alias, 'banana-v10') + ValueError, self.search.transfer_current_alias, self._db, + 'banana-v10' + ) class TestExternalSearchWithWorks(ExternalSearchTest): """These tests run against a real search index with works in it. From fb2e04081c753f72511ff535d9873df0c7bcc4c3 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 12:31:48 -0500 Subject: [PATCH 13/20] Renamed the default works index to match the most common current value. --- external_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_search.py b/external_search.py index 21c82993d..7da5e2fe4 100644 --- a/external_search.py +++ b/external_search.py @@ -32,7 +32,7 @@ class ExternalSearchIndex(object): WORKS_INDEX_PREFIX_KEY = u'works_index_prefix' - DEFAULT_WORKS_INDEX_PREFIX = u'works' + DEFAULT_WORKS_INDEX_PREFIX = u'circulation-works' work_document_type = 'work-type' __client = None From 5281775b8e47c9d0c923993035078e1803a62dba Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 12:39:22 -0500 Subject: [PATCH 14/20] Restored the classification query boost to its old value -- so long as the other queries get the same boost it works out. --- external_search.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external_search.py b/external_search.py index 7da5e2fe4..f5acbdc76 100644 --- a/external_search.py +++ b/external_search.py @@ -450,9 +450,9 @@ def make_target_age_query(target_age): # An exact title or author match outweighs a match that is split # across fields. - match_title = make_phrase_query(query_string, ['title.standard'], 100) + match_title = make_phrase_query(query_string, ['title.standard'], 200) must_match_options.append(match_title) - match_author = make_phrase_query(query_string, ['author.standard'], 100) + match_author = make_phrase_query(query_string, ['author.standard'], 200) must_match_options.append(match_author) if not fuzzy_blacklist_re.search(query_string): @@ -538,7 +538,7 @@ def without_match(original_string, match): match_classification_and_rest_of_query = { 'bool': { 'must': classification_queries, - 'boost': 100 + 'boost': 200 } } From 085cd9743fd84ab642c1d32442cf9a5c26e7d960 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 13:59:37 -0500 Subject: [PATCH 15/20] Added missing migration script. --- migration/20171201-rebuild-search-index.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 migration/20171201-rebuild-search-index.py diff --git a/migration/20171201-rebuild-search-index.py b/migration/20171201-rebuild-search-index.py new file mode 100755 index 000000000..3646796b8 --- /dev/null +++ b/migration/20171201-rebuild-search-index.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +"""Running the search index updater script will create the new +circulation-works-v3 index and change the circulation-works-current +alias to point to it. +""" +import os +import sys +bin_dir = os.path.split(__file__)[0] +package_dir = os.path.join(bin_dir, "..") +sys.path.append(os.path.abspath(package_dir)) +from scripts import UpdateSearchIndexScript +UpdateSearchIndexScript().run() From b76bdb775bcbf5ddbd4ee54fad0dd763428f5117 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 15:27:21 -0500 Subject: [PATCH 16/20] Removed reference to problems that have been fixed. --- tests/test_external_search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_external_search.py b/tests/test_external_search.py index bb14f673d..a7f835043 100644 --- a/tests/test_external_search.py +++ b/tests/test_external_search.py @@ -861,8 +861,6 @@ def setup(self): "Modern Fairytale Series, Book 3" ) - # TODO: Uncomment these lines and the 'modern romance' - # test fails for some reason. self.parent_book = _work( title="Our Son Aziz", authors=["Fatima Ansari", "Shoukath Ansari"], From c3eb29afe4b02a9d22ac1e22423766bf1b3de674 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 15:35:33 -0500 Subject: [PATCH 17/20] Restore old field weights -- tweaking them didn't result in any relevant changes. --- external_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_search.py b/external_search.py index f5acbdc76..a849383f9 100644 --- a/external_search.py +++ b/external_search.py @@ -529,7 +529,7 @@ def without_match(original_string, match): # However, it's possible that they're searching for a subject that's not # mentioned in the summary (eg, a person's name in a biography). So title # is a possible match, but is less important than author, subtitle, and summary. - match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^2", "summary^4", "title^1", "series^1"]) + match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^3", "summary^5", "title^1", "series^1"]) classification_queries.append(match_rest_of_query) # If classification queries and the remaining string all match, the result will From f41e44a7efba7405ce6d883585e69bca31a7c8e6 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 15:36:44 -0500 Subject: [PATCH 18/20] Restore boost as a float. --- external_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external_search.py b/external_search.py index a849383f9..e30ba4bd3 100644 --- a/external_search.py +++ b/external_search.py @@ -538,7 +538,7 @@ def without_match(original_string, match): match_classification_and_rest_of_query = { 'bool': { 'must': classification_queries, - 'boost': 200 + 'boost': 200.0 } } From 6b57d680b5eb76fb87ae01094873a4abd8d57f64 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 15:37:52 -0500 Subject: [PATCH 19/20] Removed cosmetic change that clutters diff. --- external_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/external_search.py b/external_search.py index e30ba4bd3..90a1d9e40 100644 --- a/external_search.py +++ b/external_search.py @@ -855,8 +855,7 @@ def v2_body(cls): "fields": { "minimal": { "type": "string", - "analyzer": "en_minimal_analyzer"}, - }} + "analyzer": "en_minimal_analyzer"}}} ) mappings = { ExternalSearchIndex.work_document_type : mapping } From f712b1a5be92b604461ed47fbb8386f598efa5f8 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 1 Dec 2017 15:53:28 -0500 Subject: [PATCH 20/20] For now, always make the alias point to the index named after the current version. --- external_search.py | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/external_search.py b/external_search.py index 90a1d9e40..7b8fcbeed 100644 --- a/external_search.py +++ b/external_search.py @@ -152,35 +152,14 @@ def set_works_index_and_alias(self, _db): """Finds or creates the works_index and works_alias based on the current configuration. """ - - # We always know what the name of the alias _should_ be. - alias_name = self.works_alias_name(_db) - index_details = self.indices.get_alias(name=alias_name, ignore=[404]) - - # But the alias may not exist in Elasticsearch. - found = ( - bool(index_details) and not ( - index_details.get('status')==404 or 'error' in index_details - ) - ) - - def _set_works_index(name): - self.works_index = self.__client.works_index = name - - if found: - # The alias exists and points to a specific index. Assume - # there is only one such index. - _set_works_index(index_details.keys()[0]) - else: - # The alias doesn't point to anything. The index name to - # use is the one known to be right for this version. - _set_works_index(self.works_index_name(_db)) - + # The index name to use is the one known to be right for this + # version. + self.works_index = self.__client.works_index = self.works_index_name(_db) if not self.indices.exists(self.works_index): - # The index doesn't actually exist. Set it up. + # That index doesn't actually exist. Set it up. self.setup_index() - # Make sure the alias points to the index we're using. + # Make sure the alias points to the most recent index. self.setup_current_alias(_db) def setup_current_alias(self, _db):