Skip to content
This repository has been archived by the owner on Apr 11, 2022. It is now read-only.

Full title match is better than split match #739

Merged
merged 21 commits into from
Dec 4, 2017
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2d88fc1
Added a (failing) test describing the failure condition people are se…
leonardr Nov 28, 2017
b7f6683
Added a standalone test that runs in less time.
leonardr Nov 28, 2017
209ee1c
Merge branch 'master' into full-title-match-is-better-than-split-match
leonardr Nov 30, 2017
19eb047
This might work.
leonardr Nov 30, 2017
eb08061
Got a test that I'm more or less happy with.
leonardr Nov 30, 2017
68f93d0
Got the tests working in the case where the tests are running on one …
leonardr Dec 1, 2017
e9eed6f
First attempt at configuring Elasticsearch within Travis.
leonardr Dec 1, 2017
5e19ffb
When copying the Elasticsearch configuration, give it the right exten…
leonardr Dec 1, 2017
88efecb
Fix, improve and document the test that explains how a search query o…
leonardr Dec 1, 2017
40fefd6
Got the rest of the tests to pass.
leonardr Dec 1, 2017
9627d5d
Separated out the v2 and v3 versions of the body.
leonardr Dec 1, 2017
70ef075
Separated out the v2 and v3 versions of the body.
leonardr Dec 1, 2017
4307f95
Changed from having separate configuration settings for index and ali…
leonardr Dec 1, 2017
fb2e040
Renamed the default works index to match the most common current value.
leonardr Dec 1, 2017
5281775
Restored the classification query boost to its old value -- so long a…
leonardr Dec 1, 2017
085cd97
Added missing migration script.
leonardr Dec 1, 2017
b76bdb7
Removed reference to problems that have been fixed.
leonardr Dec 1, 2017
c3eb29a
Restore old field weights -- tweaking them didn't result in any relev…
leonardr Dec 1, 2017
f41e44a
Restore boost as a float.
leonardr Dec 1, 2017
6b57d68
Removed cosmetic change that clutters diff.
leonardr Dec 1, 2017
f712b1a
For now, always make the alias point to the index named after the cur…
leonardr Dec 1, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ install:
- pip install -r requirements.txt
- python -m textblob.download_corpora
- cp config.json.sample config.json
- export SIMPLIFIED_CONFIGURATION_FILE="$TRAVIS_BUILD_DIR/config.json"
- export SIMPLIFIED_CONFIGURATION_FILE="${TRAVIS_BUILD_DIR}/config.json"
- wget ${ES_DOWNLOAD}
- tar -xzf elasticsearch-${ES_VERSION}.tar.gz
- ./elasticsearch-${ES_VERSION}/bin/elasticsearch &
- export ES_PATH=elasticsearch-${ES_VERSION}
- tar -xzf ${ES_PATH}.tar.gz
- cp elasticsearch.yml.testing ./${ES_PATH}/config/elasticsearch.yml
- ./${ES_PATH}/bin/elasticsearch &

before_script:
- psql -c 'create user simplified_test;' -U postgres
Expand Down
5 changes: 5 additions & 0 deletions elasticsearch.yml.testing
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Scores for search results are calculated on a per-shard basis.
# Keeping everything in one shard while running unit testsensures that
# we have complete control over how the scores are calculated.
index.number_of_shards: 1
index.number_of_replicas: 0
217 changes: 139 additions & 78 deletions external_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ class ExternalSearchIndex(object):

NAME = ExternalIntegration.ELASTICSEARCH

WORKS_INDEX_KEY = u'works_index'
WORKS_ALIAS_KEY = u'works_alias'
WORKS_INDEX_PREFIX_KEY = u'works_index_prefix'

DEFAULT_WORKS_INDEX = u'works'
DEFAULT_WORKS_INDEX_PREFIX = u'circulation-works'

work_document_type = 'work-type'
__client = None

CURRENT_ALIAS_SUFFIX = '-current'
CURRENT_ALIAS_SUFFIX = 'current'
VERSION_RE = re.compile('-v([0-9]+)$')

SETTINGS = [
{ "key": ExternalIntegration.URL, "label": _("URL") },
{ "key": WORKS_INDEX_KEY, "label": _("Works index"), "default": DEFAULT_WORKS_INDEX },
{ "key": WORKS_INDEX_PREFIX_KEY, "label": _("Index prefix"),
"default": DEFAULT_WORKS_INDEX_PREFIX,
"description": _("Any Elasticsearch indexes needed for this application will be created with this unique prefix. In most cases, the default will work fine. You may need to change this if you have multiple application servers using a single Elasticsearch server.")
},
]

SITEWIDE = True
Expand All @@ -66,13 +68,35 @@ def search_integration(cls, _db):
)

@classmethod
def works_index_name(cls, _db):
"""Look up the name of the search index."""
def works_prefixed(cls, _db, value):
"""Prefix the given value with the prefix to use when generating index
and alias names.

:return: A string "{prefix}-{value}", or None if no prefix is
configured.
"""
integration = cls.search_integration(_db)
if not integration:
return None
setting = integration.setting(cls.WORKS_INDEX_KEY)
return setting.value_or_default(cls.DEFAULT_WORKS_INDEX)
setting = integration.setting(cls.WORKS_INDEX_PREFIX_KEY)
prefix = setting.value_or_default(cls.DEFAULT_WORKS_INDEX_PREFIX)
return prefix + '-' + value

@classmethod
def works_index_name(cls, _db):
"""Look up the name of the search index.

It's possible, but unlikely, that the search index alias will
point to some other index. But if there were no indexes, and a
new one needed to be created, this would be the name of that
index.
"""
return cls.works_prefixed(_db, ExternalSearchIndexVersions.latest())

@classmethod
def works_alias_name(cls, _db):
"""Look up the name of the search index alias."""
return cls.works_prefixed(_db, cls.CURRENT_ALIAS_SUFFIX)

def __init__(self, _db, url=None, works_index=None):

Expand Down Expand Up @@ -118,87 +142,57 @@ def __init__(self, _db, url=None, works_index=None):
# Document upload runs against the works_index.
# Search queries run against works_alias.
if works_index and integration:
self.set_works_index_and_alias(works_index)
self.update_integration_settings(integration)
self.set_works_index_and_alias(_db)

def bulk(docs, **kwargs):
return elasticsearch_bulk(self.__client, docs, **kwargs)
self.bulk = bulk

def update_integration_settings(self, integration, force=False):
"""Updates the integration with an appropriate index and alias
setting if the index and alias have been updated.
"""
if not integration or not (self.works_index and self.works_alias):
return

if self.works_index==self.works_alias:
# An index is being used as the alias. There is no alias
# to update with.
return

if integration.setting(self.WORKS_ALIAS_KEY).value and not force:
# This integration already has an alias and we don't want to
# force an update.
return

index_or_alias = [self.works_index, self.works_alias]
if (integration.setting(self.WORKS_INDEX_KEY).value not in index_or_alias
and not force
):
# This ExternalSearchIndex was created for a different index and
# alias, and we don't want to force an update.
return

integration.setting(self.WORKS_INDEX_KEY).value = unicode(self.works_index)
integration.setting(self.WORKS_ALIAS_KEY).value = unicode(self.works_alias)

def set_works_index_and_alias(self, current_alias):
def set_works_index_and_alias(self, _db):
"""Finds or creates the works_index and works_alias based on
provided configuration.
the current configuration.
"""
if current_alias:
index_details = self.indices.get_alias(name=current_alias, ignore=[404])
found = bool(index_details) and not (index_details.get('status')==404 or 'error' in index_details)
else:
found = False

# We always know what the name of the alias _should_ be.
alias_name = self.works_alias_name(_db)
index_details = self.indices.get_alias(name=alias_name, ignore=[404])

# But the alias may not exist in Elasticsearch.
found = (
bool(index_details) and not (
index_details.get('status')==404 or 'error' in index_details
)
)

def _set_works_index(name):
self.works_index = self.__client.works_index = name

if found:
# We found an index for the alias in configuration. Assume
# there is only one.
# The alias exists and points to a specific index. Assume
# there is only one such index.
_set_works_index(index_details.keys()[0])
else:
if current_alias.endswith(self.CURRENT_ALIAS_SUFFIX):
# The alias culled from configuration is intended to be
# a current alias, but an index with that alias wasn't
# found. Find or create an appropriate index.
base_index_name = self.base_index_name(current_alias)
new_index = base_index_name+'-'+ExternalSearchIndexVersions.latest()
_set_works_index(new_index)
else:
# Without the CURRENT_ALIAS_SUFFIX, assume the index string
# from config is the index itself and needs to be swapped.
_set_works_index(current_alias)
# The alias doesn't point to anything. The index name to
# use is the one known to be right for this version.
_set_works_index(self.works_index_name(_db))

if not self.indices.exists(self.works_index):
# The index doesn't actually exist. Set it up.
self.setup_index()
self.setup_current_alias()

def setup_current_alias(self):
"""Finds or creates a works_alias based on the base works_index
name and ending in the expected CURRENT_ALIAS_SUFFIX.
# Make sure the alias points to the index we're using.
self.setup_current_alias(_db)

def setup_current_alias(self, _db):
"""Finds or creates the works_alias as named by the current site
settings.

If the resulting alias exists and is affixed to a different
index or if it can't be generated for any reason, the alias will
not be created or moved. Instead, the search client will use the
the works_index directly for search queries.
"""

base_works_index = self.base_index_name(self.works_index)
alias_name = base_works_index+self.CURRENT_ALIAS_SUFFIX
alias_name = self.works_alias_name(_db)
exists = self.indices.exists_alias(name=alias_name)

def _set_works_alias(name):
Expand Down Expand Up @@ -244,7 +238,7 @@ def setup_index(self, new_index=None):
body = ExternalSearchIndexVersions.latest_body()
self.indices.create(index=index, body=body)

def transfer_current_alias(self, new_index):
def transfer_current_alias(self, _db, new_index):
"""Force -current alias onto a new index"""
if not self.indices.exists(index=new_index):
raise ValueError(
Expand All @@ -260,11 +254,11 @@ def transfer_current_alias(self, new_index):
"is the same.") % (new_index, self.works_index))

self.works_index = self.__client.works_index = new_index
alias_name = self.base_index_name(new_index)+self.CURRENT_ALIAS_SUFFIX
alias_name = self.works_alias_name(_db)

exists = self.indices.exists_alias(name=alias_name)
if not exists:
self.setup_current_alias()
self.setup_current_alias(_db)
return

exists_on_works_index = self.indices.get_alias(
Expand Down Expand Up @@ -324,9 +318,10 @@ def query_works(self, library, query_string, media, languages, fiction, audience
)
if fields is not None:
search_args['fields'] = fields
#print "Args looks like: %r" % args
# search_args['explain'] = True
# print "Args looks like: %r" % search_args
results = self.search(**search_args)
#print "Results: %r" % results
# print "Results: %r" % results
return results

def make_query(self, query_string):
Expand All @@ -339,7 +334,7 @@ def make_query_string_query(query_string, fields):
}
}

def make_phrase_query(query_string, fields):
def make_phrase_query(query_string, fields, boost=100):
field_queries = []
for field in fields:
field_query = {
Expand All @@ -352,7 +347,7 @@ def make_phrase_query(query_string, fields):
'bool': {
'should': field_queries,
'minimum_should_match': 1,
'boost': 100,
'boost': boost,
}
}

Expand Down Expand Up @@ -453,6 +448,13 @@ def make_target_age_query(target_age):
match_phrase = make_phrase_query(query_string, ['title.minimal', 'author', 'series.minimal'])
must_match_options.append(match_phrase)

# An exact title or author match outweighs a match that is split
# across fields.
match_title = make_phrase_query(query_string, ['title.standard'], 200)
must_match_options.append(match_title)
match_author = make_phrase_query(query_string, ['author.standard'], 200)
must_match_options.append(match_author)

if not fuzzy_blacklist_re.search(query_string):
fuzzy_query = make_fuzzy_query(query_string, fuzzy_fields)
must_match_options.append(fuzzy_query)
Expand Down Expand Up @@ -527,7 +529,7 @@ def without_match(original_string, match):
# However, it's possible that they're searching for a subject that's not
# mentioned in the summary (eg, a person's name in a biography). So title
# is a possible match, but is less important than author, subtitle, and summary.
match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^3", "summary^5", "title^1", "series^1"])
match_rest_of_query = make_query_string_query(remaining_string, ["author^4", "subtitle^2", "summary^4", "title^1", "series^1"])
classification_queries.append(match_rest_of_query)

# If classification queries and the remaining string all match, the result will
Expand All @@ -536,7 +538,7 @@ def without_match(original_string, match):
match_classification_and_rest_of_query = {
'bool': {
'must': classification_queries,
'boost': 200.0
'boost': 200
}
}

Expand Down Expand Up @@ -730,7 +732,7 @@ def remove_work(self, work):

class ExternalSearchIndexVersions(object):

VERSIONS = ['v2']
VERSIONS = ['v2', 'v3']

@classmethod
def latest(cls):
Expand All @@ -751,6 +753,64 @@ def map_fields(cls, fields, field_description):
mapping["properties"][field] = field_description
return mapping

@classmethod
def v3_body(cls):
"""The v3 body is the same as the v2 except for the inclusion of the
'.standard' version of fields, analyzed using the standard
analyzer for near-exact matches.
"""
settings = {
"analysis": {
"filter": {
"en_stop_filter": {
"type": "stop",
"stopwords": ["_english_"]
},
"en_stem_filter": {
"type": "stemmer",
"name": "english"
},
"en_stem_minimal_filter": {
"type": "stemmer",
"name": "english"
},
},
"analyzer" : {
"en_analyzer": {
"type": "custom",
"char_filter": ["html_strip"],
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_filter"]
},
"en_minimal_analyzer": {
"type": "custom",
"char_filter": ["html_strip"],
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "en_stop_filter", "en_stem_minimal_filter"]
},
}
}
}

mapping = cls.map_fields(
fields=["title", "series", "subtitle", "summary", "classifications.term"],
field_description={
"type": "string",
"analyzer": "en_analyzer",
"fields": {
"minimal": {
"type": "string",
"analyzer": "en_minimal_analyzer"},
"standard": {
"type": "string",
"analyzer": "standard"
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the part that's different.

}}
)
mappings = { ExternalSearchIndex.work_document_type : mapping }

return dict(settings=settings, mappings=mappings)

@classmethod
def v2_body(cls):

Expand Down Expand Up @@ -795,7 +855,8 @@ def v2_body(cls):
"fields": {
"minimal": {
"type": "string",
"analyzer": "en_minimal_analyzer"}}}
"analyzer": "en_minimal_analyzer"},
}}
)
mappings = { ExternalSearchIndex.work_document_type : mapping }

Expand Down Expand Up @@ -873,7 +934,7 @@ class SearchIndexMonitor(WorkSweepMonitor):
def __init__(self, _db, collection, index_name=None, index_client=None,
**kwargs):
super(SearchIndexMonitor, self).__init__(_db, collection, **kwargs)

if index_client:
# This would only happen during a test.
self.search_index_client = index_client
Expand Down
12 changes: 12 additions & 0 deletions migration/20171201-rebuild-search-index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
"""Running the search index updater script will create the new
circulation-works-v3 index and change the circulation-works-current
alias to point to it.
"""
import os
import sys
bin_dir = os.path.split(__file__)[0]
package_dir = os.path.join(bin_dir, "..")
sys.path.append(os.path.abspath(package_dir))
from scripts import UpdateSearchIndexScript
UpdateSearchIndexScript().run()
Loading