Skip to content
This repository has been archived by the owner on Apr 11, 2022. It is now read-only.

Full title match is better than split match #739

Merged
merged 21 commits into from
Dec 4, 2017
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2d88fc1
Added a (failing) test describing the failure condition people are se…
leonardr Nov 28, 2017
b7f6683
Added a standalone test that runs in less time.
leonardr Nov 28, 2017
209ee1c
Merge branch 'master' into full-title-match-is-better-than-split-match
leonardr Nov 30, 2017
19eb047
This might work.
leonardr Nov 30, 2017
eb08061
Got a test that I'm more or less happy with.
leonardr Nov 30, 2017
68f93d0
Got the tests working in the case where the tests are running on one …
leonardr Dec 1, 2017
e9eed6f
First attempt at configuring Elasticsearch within Travis.
leonardr Dec 1, 2017
5e19ffb
When copying the Elasticsearch configuration, give it the right exten…
leonardr Dec 1, 2017
88efecb
Fix, improve and document the test that explains how a search query o…
leonardr Dec 1, 2017
40fefd6
Got the rest of the tests to pass.
leonardr Dec 1, 2017
9627d5d
Separated out the v2 and v3 versions of the body.
leonardr Dec 1, 2017
70ef075
Separated out the v2 and v3 versions of the body.
leonardr Dec 1, 2017
4307f95
Changed from having separate configuration settings for index and ali…
leonardr Dec 1, 2017
fb2e040
Renamed the default works index to match the most common current value.
leonardr Dec 1, 2017
5281775
Restored the classification query boost to its old value -- so long a…
leonardr Dec 1, 2017
085cd97
Added missing migration script.
leonardr Dec 1, 2017
b76bdb7
Removed reference to problems that have been fixed.
leonardr Dec 1, 2017
c3eb29a
Restore old field weights -- tweaking them didn't result in any relev…
leonardr Dec 1, 2017
f41e44a
Restore boost as a float.
leonardr Dec 1, 2017
6b57d68
Removed cosmetic change that clutters diff.
leonardr Dec 1, 2017
f712b1a
For now, always make the alias point to the index named after the cur…
leonardr Dec 1, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions external_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,9 +324,9 @@ def query_works(self, library, query_string, media, languages, fiction, audience
)
if fields is not None:
search_args['fields'] = fields
#print "Args looks like: %r" % args
print "Args looks like: %r" % search_args
results = self.search(**search_args)
#print "Results: %r" % results
print "Results: %r" % results
return results

def make_query(self, query_string):
Expand All @@ -339,7 +339,7 @@ def make_query_string_query(query_string, fields):
}
}

def make_phrase_query(query_string, fields):
def make_phrase_query(query_string, fields, boost=100):
field_queries = []
for field in fields:
field_query = {
Expand All @@ -352,7 +352,7 @@ def make_phrase_query(query_string, fields):
'bool': {
'should': field_queries,
'minimum_should_match': 1,
'boost': 100,
'boost': boost,
}
}

Expand Down Expand Up @@ -453,6 +453,13 @@ def make_target_age_query(target_age):
match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal'])
must_match_options.append(match_phrase)

# An exact title or author match outweighs a match that is split
# across fields.
match_title = make_phrase_query(query_string, ['title.minimal'], 150)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happened to title.standard?

must_match_options.append(match_title)
match_author = make_phrase_query(query_string, ['author.minimal'], 150)
must_match_options.append(match_author)

if not fuzzy_blacklist_re.search(query_string):
fuzzy_query = make_fuzzy_query(query_string, fuzzy_fields)
must_match_options.append(fuzzy_query)
Expand Down Expand Up @@ -527,7 +534,7 @@ def without_match(original_string, match):
# However, it's possible that they're searching for a subject that's not
# mentioned in the summary (eg, a person's name in a biography). So title
# is a possible match, but is less important than author, subtitle, and summary.
match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^3", "summary^5", "title^1", "series^1"])
match_rest_of_query = make_query_string_query(remaining_string, ["author^3", "subtitle^2", "summary^4", "title^1", "series^1"])
classification_queries.append(match_rest_of_query)

# If classification queries and the remaining string all match, the result will
Expand All @@ -536,7 +543,7 @@ def without_match(original_string, match):
match_classification_and_rest_of_query = {
'bool': {
'must': classification_queries,
'boost': 200.0
'boost': 100
}
}

Expand Down
186 changes: 172 additions & 14 deletions tests/test_external_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ def teardown(self):
ExternalSearchIndex.reset()
super(ExternalSearchTest, self).teardown()

def default_work(self, *args, **kwargs):
"""Convenience method to create a work with a license pool
in the default collection.
"""
work = self._work(
*args, with_license_pool=True,
collection=self._default_collection, **kwargs
)
work.set_presentation_ready()
return work


class TestExternalSearch(ExternalSearchTest):

Expand Down Expand Up @@ -180,15 +191,7 @@ class TestExternalSearchWithWorks(ExternalSearchTest):

def setup(self):
super(TestExternalSearchWithWorks, self).setup()

def _work(*args, **kwargs):
"""Convenience method to create a work with a license pool
in the default collection.
"""
return self._work(
*args, with_license_pool=True,
collection=self._default_collection, **kwargs
)
_work = self.default_work

if self.search:

Expand Down Expand Up @@ -230,6 +233,10 @@ def _work(*args, **kwargs):
self.les_mis.presentation_edition.title = u"Les Mis\u00E9rables"
self.les_mis.set_presentation_ready()

self.modern_romance = _work()
self.modern_romance.presentation_edition.title = u"Modern Romance"
self.modern_romance.set_presentation_ready()

self.lincoln = _work(genre="Biography & Memoir", title="Abraham Lincoln")
self.lincoln.set_presentation_ready()

Expand All @@ -248,7 +255,13 @@ def _work(*args, **kwargs):
self.adult_work = _work(title="Still Alice", audience=Classifier.AUDIENCE_ADULT)
self.adult_work.set_presentation_ready()

self.ya_romance = _work(audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance")
self.ya_romance = _work(
title="Gumby In Love",
audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance"
)
self.ya_romance.presentation_edition.subtitle = (
"Modern Fairytale Series, Book 3"
)
self.ya_romance.set_presentation_ready()

self.no_age = _work()
Expand Down Expand Up @@ -467,11 +480,32 @@ def query(*args, **kwargs):

# Matches genre

results = query("romance", None, None, None, None, None, None, None)
hits = results["hits"]["hits"]
eq_(1, len(hits))
eq_(unicode(self.ya_romance.id), hits[0]['_id'])
def expect_ids(works, *query_args):
original_query_args = list(query_args)
query_args = list(original_query_args)
while len(query_args) < 8:
query_args.append(None)
results = query(*query_args)
hits = results["hits"]["hits"]
expect = [unicode(x.id) for x in works]
actual = [x['_id'] for x in hits]
expect_titles = ", ".join([x.title for x in works])
actual_titles = ", ".join([x['_source']['title'] for x in hits])
eq_(
expect, actual,
"Query args %r did not find %d works (%s), instead found %d (%s)" % (
original_query_args, len(expect), expect_titles,
len(actual), actual_titles
)
)

# Search by genre. The name of the genre also shows up in the
# title of a book, but the genre comes up first.
expect_ids([self.ya_romance, self.modern_romance], "romance")

# A full title match takes precedence over a match that's
# split across genre and subtitle.
expect_ids([self.modern_romance, self.ya_romance], "modern romance")

# Matches audience

Expand Down Expand Up @@ -748,6 +782,10 @@ def query(*args, **kwargs):
hits = results["hits"]["hits"]
eq_(2, len(hits))

#
# Test searching across collections.
#

# If we add the missing collection to the default library, "A
# Tiny Book" starts showing up in searches against that
# library.
Expand Down Expand Up @@ -778,6 +816,126 @@ def query(*args, **kwargs):
]
eq_(set(collections), set(expect_collections))

class TestExactMatches(ExternalSearchTest):
"""Verify that exact or near-exact title and author matches are
privileged over matches that span fields.
"""

def setup(self):
super(TestExactMatches, self).setup()
_work = self.default_work

# Here the title is 'Modern Romance'
self.modern_romance = _work(
title="Modern Romance",
authors=["Aziz Ansari", "Eric Klinenberg"],
)

# Here 'Modern' is in the subtitle and 'Romance' is the genre.
self.ya_romance = _work(
title="Gumby In Love",
authors="Pokey",
audience=Classifier.AUDIENCE_YOUNG_ADULT, genre="Romance"
)
self.ya_romance.presentation_edition.subtitle = (
"Modern Fairytale Series, Book 3"
)

# TODO: Uncomment these lines and the 'modern romance'
# test fails for some reason.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you fixed this!

# self.parent_book = _work(
# title="Our Son Aziz",
# authors=["Fatima Ansari", "Shoukath Ansari"],
# genre="Biography & Memoir",
# )

self.behind_the_scenes = _work(
title="The Making of Biography With Peter Graves",
genre="Entertainment",
)

self.biography_of_peter_graves = _work(
"He Is Peter Graves",
authors="Kelly Ghostwriter",
genre="Biography & Memoir",
)

self.book_by_peter_graves = _work(
title="My Experience At The University of Minnesota",
authors="Peter Graves",
genre="Entertainment",
)

self.book_by_someone_else = _work(
title="The Deadly Graves",
authors="Peter Ansari",
genre="Mystery"
)

# Add all the works created in the setup to the search index.
SearchIndexCoverageProvider(
self._db, search_index_client=self.search
).run_once_and_update_timestamp()

# Sleep to give the index time to catch up.
time.sleep(2)

def test_exact_matches(self):

# Convenience method to query the default library.
def query(*args, **kwargs):
return self.search.query_works(
self._default_library, *args, **kwargs
)

def expect_ids(works, *query_args):
original_query_args = list(query_args)
query_args = list(original_query_args)
while len(query_args) < 8:
query_args.append(None)
results = query(*query_args)
hits = results["hits"]["hits"]
expect = [unicode(x.id) for x in works]
actual = [x['_id'] for x in hits]
expect_titles = ", ".join([x.title for x in works])
actual_titles = ", ".join([x['_source']['title'] for x in hits])
eq_(
expect, actual,
"Query args %r did not find %d works (%s), instead found %d (%s)" % (
original_query_args, len(expect), expect_titles,
len(actual), actual_titles
)
)

# A full title match takes precedence over a match that's
# split across genre and subtitle.
expect_ids([self.modern_romance, self.ya_romance], "modern romance")

# A full author match takes precedence over a partial author
# match.
expect_ids([self.modern_romance, self.book_by_someone_else],
"aziz ansari")

# When a string exactly matches both a title and an author,
# the books that match exactly are promoted.
expect_ids(
[self.biography_of_peter_graves, self.behind_the_scenes,
self.book_by_peter_graves, self.book_by_someone_else],
"peter graves"
)

# 'The Making of Biography With Peter Graves' does worse in a
# search for 'peter graves biography' than a biography whose
# title includes the phrase 'peter graves'. Although the title
# contains all three search terms, it's not an exact token
# match. But "The Making of..." still does better than book
# that matches the query string against two different fields.
expect_ids(
[self.biography_of_peter_graves, self.book_by_peter_graves,
self.behind_the_scenes, self.book_by_someone_else],
"peter graves biography"
)


class TestSearchQuery(DatabaseTest):
def test_make_query(self):
Expand Down