This repository has been archived by the owner on Apr 11, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 11
Full title match is better than split match #739
Merged
Merged
Changes from 5 commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
2d88fc1
Added a (failing) test describing the failure condition people are se…
leonardr b7f6683
Added a standalone test that runs in less time.
leonardr 209ee1c
Merge branch 'master' into full-title-match-is-better-than-split-match
leonardr 19eb047
This might work.
leonardr eb08061
Got a test that I'm more or less happy with.
leonardr 68f93d0
Got the tests working in the case where the tests are running on one …
leonardr e9eed6f
First attempt at configuring Elasticsearch within Travis.
leonardr 5e19ffb
When copying the Elasticsearch configuration, give it the right exten…
leonardr 88efecb
Fix, improve and document the test that explains how a search query o…
leonardr 40fefd6
Got the rest of the tests to pass.
leonardr 9627d5d
Separated out the v2 and v3 versions of the body.
leonardr 70ef075
Separated out the v2 and v3 versions of the body.
leonardr 4307f95
Changed from having separate configuration settings for index and ali…
leonardr fb2e040
Renamed the default works index to match the most common current value.
leonardr 5281775
Restored the classification query boost to its old value -- so long a…
leonardr 085cd97
Added missing migration script.
leonardr b76bdb7
Removed reference to problems that have been fixed.
leonardr c3eb29a
Restore old field weights -- tweaking them didn't result in any relev…
leonardr f41e44a
Restore boost as a float.
leonardr 6b57d68
Removed cosmetic change that clutters diff.
leonardr f712b1a
For now, always make the alias point to the index named after the cur…
leonardr File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,6 +63,17 @@ def teardown(self): | |
ExternalSearchIndex.reset() | ||
super(ExternalSearchTest, self).teardown() | ||
|
||
def default_work(self, *args, **kwargs): | ||
"""Convenience method to create a work with a license pool | ||
in the default collection. | ||
""" | ||
work = self._work( | ||
*args, with_license_pool=True, | ||
collection=self._default_collection, **kwargs | ||
) | ||
work.set_presentation_ready() | ||
return work | ||
|
||
|
||
class TestExternalSearch(ExternalSearchTest): | ||
|
||
|
@@ -180,15 +191,7 @@ class TestExternalSearchWithWorks(ExternalSearchTest): | |
|
||
def setup(self): | ||
super(TestExternalSearchWithWorks, self).setup() | ||
|
||
def _work(*args, **kwargs): | ||
"""Convenience method to create a work with a license pool | ||
in the default collection. | ||
""" | ||
return self._work( | ||
*args, with_license_pool=True, | ||
collection=self._default_collection, **kwargs | ||
) | ||
_work = self.default_work | ||
|
||
if self.search: | ||
|
||
|
@@ -230,6 +233,10 @@ def _work(*args, **kwargs): | |
self.les_mis.presentation_edition.title = u"Les Mis\u00E9rables" | ||
self.les_mis.set_presentation_ready() | ||
|
||
self.modern_romance = _work() | ||
self.modern_romance.presentation_edition.title = u"Modern Romance" | ||
self.modern_romance.set_presentation_ready() | ||
|
||
self.lincoln = _work(genre="Biography & Memoir", title="Abraham Lincoln") | ||
self.lincoln.set_presentation_ready() | ||
|
||
|
@@ -248,7 +255,13 @@ def _work(*args, **kwargs): | |
self.adult_work = _work(title="Still Alice", audience=Classifier.AUDIENCE_ADULT) | ||
self.adult_work.set_presentation_ready() | ||
|
||
self.ya_romance = _work(audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance") | ||
self.ya_romance = _work( | ||
title="Gumby In Love", | ||
audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" | ||
) | ||
self.ya_romance.presentation_edition.subtitle = ( | ||
"Modern Fairytale Series, Book 3" | ||
) | ||
self.ya_romance.set_presentation_ready() | ||
|
||
self.no_age = _work() | ||
|
@@ -467,11 +480,32 @@ def query(*args, **kwargs): | |
|
||
# Matches genre | ||
|
||
results = query("romance", None, None, None, None, None, None, None) | ||
hits = results["hits"]["hits"] | ||
eq_(1, len(hits)) | ||
eq_(unicode(self.ya_romance.id), hits[0]['_id']) | ||
def expect_ids(works, *query_args): | ||
original_query_args = list(query_args) | ||
query_args = list(original_query_args) | ||
while len(query_args) < 8: | ||
query_args.append(None) | ||
results = query(*query_args) | ||
hits = results["hits"]["hits"] | ||
expect = [unicode(x.id) for x in works] | ||
actual = [x['_id'] for x in hits] | ||
expect_titles = ", ".join([x.title for x in works]) | ||
actual_titles = ", ".join([x['_source']['title'] for x in hits]) | ||
eq_( | ||
expect, actual, | ||
"Query args %r did not find %d works (%s), instead found %d (%s)" % ( | ||
original_query_args, len(expect), expect_titles, | ||
len(actual), actual_titles | ||
) | ||
) | ||
|
||
# Search by genre. The name of the genre also shows up in the | ||
# title of a book, but the genre comes up first. | ||
expect_ids([self.ya_romance, self.modern_romance], "romance") | ||
|
||
# A full title match takes precedence over a match that's | ||
# split across genre and subtitle. | ||
expect_ids([self.modern_romance, self.ya_romance], "modern romance") | ||
|
||
# Matches audience | ||
|
||
|
@@ -748,6 +782,10 @@ def query(*args, **kwargs): | |
hits = results["hits"]["hits"] | ||
eq_(2, len(hits)) | ||
|
||
# | ||
# Test searching across collections. | ||
# | ||
|
||
# If we add the missing collection to the default library, "A | ||
# Tiny Book" starts showing up in searches against that | ||
# library. | ||
|
@@ -778,6 +816,126 @@ def query(*args, **kwargs): | |
] | ||
eq_(set(collections), set(expect_collections)) | ||
|
||
class TestExactMatches(ExternalSearchTest): | ||
"""Verify that exact or near-exact title and author matches are | ||
privileged over matches that span fields. | ||
""" | ||
|
||
def setup(self): | ||
super(TestExactMatches, self).setup() | ||
_work = self.default_work | ||
|
||
# Here the title is 'Modern Romance' | ||
self.modern_romance = _work( | ||
title="Modern Romance", | ||
authors=["Aziz Ansari", "Eric Klinenberg"], | ||
) | ||
|
||
# Here 'Modern' is in the subtitle and 'Romance' is the genre. | ||
self.ya_romance = _work( | ||
title="Gumby In Love", | ||
authors="Pokey", | ||
audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance" | ||
) | ||
self.ya_romance.presentation_edition.subtitle = ( | ||
"Modern Fairytale Series, Book 3" | ||
) | ||
|
||
# TODO: Uncomment these lines and the 'modern romance' | ||
# test fails for some reason. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you fixed this! |
||
# self.parent_book = _work( | ||
# title="Our Son Aziz", | ||
# authors=["Fatima Ansari", "Shoukath Ansari"], | ||
# genre="Biography & Memoir", | ||
# ) | ||
|
||
self.behind_the_scenes = _work( | ||
title="The Making of Biography With Peter Graves", | ||
genre="Entertainment", | ||
) | ||
|
||
self.biography_of_peter_graves = _work( | ||
"He Is Peter Graves", | ||
authors="Kelly Ghostwriter", | ||
genre="Biography & Memoir", | ||
) | ||
|
||
self.book_by_peter_graves = _work( | ||
title="My Experience At The University of Minnesota", | ||
authors="Peter Graves", | ||
genre="Entertainment", | ||
) | ||
|
||
self.book_by_someone_else = _work( | ||
title="The Deadly Graves", | ||
authors="Peter Ansari", | ||
genre="Mystery" | ||
) | ||
|
||
# Add all the works created in the setup to the search index. | ||
SearchIndexCoverageProvider( | ||
self._db, search_index_client=self.search | ||
).run_once_and_update_timestamp() | ||
|
||
# Sleep to give the index time to catch up. | ||
time.sleep(2) | ||
|
||
def test_exact_matches(self): | ||
|
||
# Convenience method to query the default library. | ||
def query(*args, **kwargs): | ||
return self.search.query_works( | ||
self._default_library, *args, **kwargs | ||
) | ||
|
||
def expect_ids(works, *query_args): | ||
original_query_args = list(query_args) | ||
query_args = list(original_query_args) | ||
while len(query_args) < 8: | ||
query_args.append(None) | ||
results = query(*query_args) | ||
hits = results["hits"]["hits"] | ||
expect = [unicode(x.id) for x in works] | ||
actual = [x['_id'] for x in hits] | ||
expect_titles = ", ".join([x.title for x in works]) | ||
actual_titles = ", ".join([x['_source']['title'] for x in hits]) | ||
eq_( | ||
expect, actual, | ||
"Query args %r did not find %d works (%s), instead found %d (%s)" % ( | ||
original_query_args, len(expect), expect_titles, | ||
len(actual), actual_titles | ||
) | ||
) | ||
|
||
# A full title match takes precedence over a match that's | ||
# split across genre and subtitle. | ||
expect_ids([self.modern_romance, self.ya_romance], "modern romance") | ||
|
||
# A full author match takes precedence over a partial author | ||
# match. | ||
expect_ids([self.modern_romance, self.book_by_someone_else], | ||
"aziz ansari") | ||
|
||
# When a string exactly matches both a title and an author, | ||
# the books that match exactly are promoted. | ||
expect_ids( | ||
[self.biography_of_peter_graves, self.behind_the_scenes, | ||
self.book_by_peter_graves, self.book_by_someone_else], | ||
"peter graves" | ||
) | ||
|
||
# 'The Making of Biography With Peter Graves' does worse in a | ||
# search for 'peter graves biography' than a biography whose | ||
# title includes the phrase 'peter graves'. Although the title | ||
# contains all three search terms, it's not an exact token | ||
# match. But "The Making of..." still does better than book | ||
# that matches the query string against two different fields. | ||
expect_ids( | ||
[self.biography_of_peter_graves, self.book_by_peter_graves, | ||
self.behind_the_scenes, self.book_by_someone_else], | ||
"peter graves biography" | ||
) | ||
|
||
|
||
class TestSearchQuery(DatabaseTest): | ||
def test_make_query(self): | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happened to
title.standard
?