NYPL-Simplified · leonardr · Dec 4, 2017 · Nov 28, 2017 · Nov 28, 2017 · Nov 30, 2017
diff --git a/external_search.py b/external_search.py
@@ -324,9 +324,9 @@ def query_works(self, library, query_string, media, languages, fiction, audience
         )
         if fields is not None:
             search_args['fields'] = fields
-        #print "Args looks like: %r" % args
+        print "Args looks like: %r" % search_args
         results = self.search(**search_args)
-        #print "Results: %r" % results
+        print "Results: %r" % results
         return results
 
     def make_query(self, query_string):
@@ -339,7 +339,7 @@ def make_query_string_query(query_string, fields):
                 }
             }
 
-        def make_phrase_query(query_string, fields):
+        def make_phrase_query(query_string, fields, boost=100):
             field_queries = []
             for field in fields:
                 field_query = {
@@ -352,7 +352,7 @@ def make_phrase_query(query_string, fields):
                 'bool': {
                   'should': field_queries,
                   'minimum_should_match': 1,
-                  'boost': 100,
+                  'boost': boost,
                 }
               }
 
@@ -453,6 +453,13 @@ def make_target_age_query(target_age):
         match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal'])
         must_match_options.append(match_phrase)
 
+        # An exact title or author match outweighs a match that is split
+        # across fields.
+        match_title = make_phrase_query(query_string, ['title.minimal'], 150)
+        must_match_options.append(match_title)
+        match_author = make_phrase_query(query_string, ['author.minimal'], 150)
+        must_match_options.append(match_author)
+
         if not fuzzy_blacklist_re.search(query_string):
             fuzzy_query = make_fuzzy_query(query_string, fuzzy_fields)
             must_match_options.append(fuzzy_query)
@@ -527,7 +534,7 @@ def without_match(original_string, match):
                 # However, it's possible that they're searching for a subject that's not
                 # mentioned in the summary (eg, a person's name in a biography). So title
                 # is a possible match, but is less important than author, subtitle, and summary.
-                match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^3", "summary^5", "title^1", "series^1"])
+                match_rest_of_query = make_query_string_query(remaining_string, ["author^3", "subtitle^2", "summary^4", "title^1", "series^1"])
                 classification_queries.append(match_rest_of_query)
 
             # If classification queries and the remaining string all match, the result will
@@ -536,7 +543,7 @@ def without_match(original_string, match):
             match_classification_and_rest_of_query = {
                 'bool': {
                     'must': classification_queries,
-                    'boost': 200.0
+                    'boost': 100
                 }
             }
 

diff --git a/tests/test_external_search.py b/tests/test_external_search.py
@@ -63,6 +63,17 @@ def teardown(self):
             ExternalSearchIndex.reset()
         super(ExternalSearchTest, self).teardown()
 
+    def default_work(self, *args, **kwargs):
+        """Convenience method to create a presentation-ready work
+        with a license pool in the default collection.
+        """
+        work = self._work(
+            *args, with_license_pool=True, 
+            collection=self._default_collection, **kwargs
+        )
+        work.set_presentation_ready()
+        return work
+
 
 class TestExternalSearch(ExternalSearchTest):
 
@@ -180,15 +191,7 @@ class TestExternalSearchWithWorks(ExternalSearchTest):
 
     def setup(self):
         super(TestExternalSearchWithWorks, self).setup()
-
-        def _work(*args, **kwargs):
-            """Convenience method to create a work with a license pool
-            in the default collection.
-            """
-            return self._work(
-                *args, with_license_pool=True, 
-                collection=self._default_collection, **kwargs
-            )
+        _work = self.default_work
 
         if self.search:
 
@@ -230,6 +233,10 @@ def _work(*args, **kwargs):
             self.les_mis.presentation_edition.title = u"Les Mis\u00E9rables"
             self.les_mis.set_presentation_ready()
 
+            self.modern_romance = _work()
+            self.modern_romance.presentation_edition.title = u"Modern Romance"
+            self.modern_romance.set_presentation_ready()
+
             self.lincoln = _work(genre="Biography & Memoir", title="Abraham Lincoln")
             self.lincoln.set_presentation_ready()
 
@@ -248,7 +255,13 @@ def _work(*args, **kwargs):
             self.adult_work = _work(title="Still Alice", audience=Classifier.AUDIENCE_ADULT)
             self.adult_work.set_presentation_ready()
 
-            self.ya_romance = _work(audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance")
+            self.ya_romance = _work(
+                title="Gumby In Love",
+                audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance"
+            )
+            self.ya_romance.presentation_edition.subtitle = (
+                "Modern Fairytale Series, Book 3"
+            )
             self.ya_romance.set_presentation_ready()
 
             self.no_age = _work()
@@ -467,11 +480,32 @@ def query(*args, **kwargs):
 
         # Matches genre
 
-        results = query("romance", None, None, None, None, None, None, None)
-        hits = results["hits"]["hits"]
-        eq_(1, len(hits))
-        eq_(unicode(self.ya_romance.id), hits[0]['_id'])
+        def expect_ids(works, *query_args):
+            original_query_args = list(query_args)
+            query_args = list(original_query_args)
+            while len(query_args) < 8:
+                query_args.append(None)
+            results = query(*query_args)
+            hits = results["hits"]["hits"]
+            expect = [unicode(x.id) for x in works]
+            actual = [x['_id'] for x in hits]
+            expect_titles = ", ".join([x.title for x in works])
+            actual_titles = ", ".join([x['_source']['title'] for x in hits])
+            eq_(
+                expect, actual,
+                "Query args %r did not find %d works (%s), instead found %d (%s)" % (
+                    original_query_args, len(expect), expect_titles,
+                    len(actual), actual_titles
+                )
+            )
+
+        # Search by genre. The name of the genre also shows up in the
+        # title of a book, but the genre match comes up first.
+        expect_ids([self.ya_romance, self.modern_romance], "romance")
 
+        # A full title match takes precedence over a match that's
+        # split across genre and subtitle.
+        expect_ids([self.modern_romance, self.ya_romance], "modern romance")
 
         # Matches audience
 
@@ -748,6 +782,10 @@ def query(*args, **kwargs):
         hits = results["hits"]["hits"]
         eq_(2, len(hits))        
 
+        #
+        # Test searching across collections.
+        #
+
         # If we add the missing collection to the default library, "A
         # Tiny Book" starts showing up in searches against that
         # library.
@@ -778,6 +816,126 @@ def query(*args, **kwargs):
         ]
         eq_(set(collections), set(expect_collections))
 
+class TestExactMatches(ExternalSearchTest):
+    """Verify that exact or near-exact title and author matches are
+    privileged over matches that span fields.
+    """
+
+    def setup(self):
+        super(TestExactMatches, self).setup()
+        _work = self.default_work
+
+        # Here the title is 'Modern Romance'
+        self.modern_romance = _work(
+            title="Modern Romance",
+            authors=["Aziz Ansari", "Eric Klinenberg"],
+        )
+
+        # Here 'Modern' is in the subtitle and 'Romance' is the genre.
+        self.ya_romance = _work(
+            title="Gumby In Love",
+            authors="Pokey",
+            audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance"
+        )
+        self.ya_romance.presentation_edition.subtitle = (
+            "Modern Fairytale Series, Book 3"
+        )
+
+        # TODO: If these lines are uncommented, the 'modern romance'
+        # test fails for an unknown reason. Investigate why.
+        # self.parent_book = _work(
+        #     title="Our Son Aziz",
+        #     authors=["Fatima Ansari", "Shoukath Ansari"],
+        #     genre="Biography & Memoir",
+        # )
+
+        self.behind_the_scenes = _work(
+            title="The Making of Biography With Peter Graves",
+            genre="Entertainment",
+        )
+
+        self.biography_of_peter_graves = _work(
+            "He Is Peter Graves",
+            authors="Kelly Ghostwriter",
+            genre="Biography & Memoir",
+        )
+
+        self.book_by_peter_graves = _work(
+            title="My Experience At The University of Minnesota",
+            authors="Peter Graves",
+            genre="Entertainment",
+        )
+
+        self.book_by_someone_else = _work(
+            title="The Deadly Graves",
+            authors="Peter Ansari",
+            genre="Mystery"
+        )
+
+        # Add all the works created in the setup to the search index.
+        SearchIndexCoverageProvider(
+            self._db, search_index_client=self.search
+        ).run_once_and_update_timestamp()
+
+        # Sleep to give the index time to catch up.
+        time.sleep(2)
+
+    def test_exact_matches(self):
+
+        # Convenience method to query the default library.
+        def query(*args, **kwargs):
+            return self.search.query_works(
+                self._default_library, *args, **kwargs
+            )
+
+        def expect_ids(works, *query_args):
+            original_query_args = list(query_args)
+            query_args = list(original_query_args)
+            while len(query_args) < 8:
+                query_args.append(None)
+            results = query(*query_args)
+            hits = results["hits"]["hits"]
+            expect = [unicode(x.id) for x in works]
+            actual = [x['_id'] for x in hits]
+            expect_titles = ", ".join([x.title for x in works])
+            actual_titles = ", ".join([x['_source']['title'] for x in hits])
+            eq_(
+                expect, actual,
+                "Query args %r did not find %d works (%s), instead found %d (%s)" % (
+                    original_query_args, len(expect), expect_titles,
+                    len(actual), actual_titles
+                )
+            )
+
+        # A full title match takes precedence over a match that's
+        # split across genre and subtitle.
+        expect_ids([self.modern_romance, self.ya_romance], "modern romance")
+
+        # A full author match takes precedence over a partial author
+        # match.
+        expect_ids([self.modern_romance, self.book_by_someone_else],
+                   "aziz ansari")
+
+        # When a string exactly matches both a title and an author,
+        # the books that match exactly are promoted.
+        expect_ids(
+            [self.biography_of_peter_graves, self.behind_the_scenes,
+             self.book_by_peter_graves, self.book_by_someone_else],
+            "peter graves"
+        )
+
+        # 'The Making of Biography With Peter Graves' does worse in a
+        # search for 'peter graves biography' than a biography whose
+        # title includes the phrase 'peter graves'. Although the title
+        # contains all three search terms, it's not an exact token
+        # match. But "The Making of..." still does better than a book
+        # that matches the query string against two different fields.
+        expect_ids(
+            [self.biography_of_peter_graves, self.book_by_peter_graves,
+             self.behind_the_scenes, self.book_by_someone_else],
+            "peter graves biography"
+        )
+
 
 class TestSearchQuery(DatabaseTest):
     def test_make_query(self):