Skip to content

Commit

Permalink
Merge pull request #819 from HEPData/search-range
Browse files Browse the repository at this point in the history
Add support for range-based ID searching
  • Loading branch information
GraemeWatt authored Dec 4, 2024
2 parents 50a2498 + 16d3e8c commit b6e9df9
Show file tree
Hide file tree
Showing 8 changed files with 471 additions and 29 deletions.
1 change: 1 addition & 0 deletions hepdata/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def _(x):
CFG_DATA_TYPE = 'datatable'
CFG_SUBMISSIONS_TYPE = 'submission'
CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"] # Query terms that may be used in OpenSearch API range searches

CFG_CONVERTER_URL = 'https://converter.hepdata.net'
CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
Expand Down
59 changes: 40 additions & 19 deletions hepdata/ext/opensearch/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ def search(query,
('collaboration', collaboration_name), ('date', date)
:param size: [int] max number of hits that should be returned
:param offset: [int] offset for the results (used for pagination)
:param sort_by: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance"
:param sort_field: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance",
"recid", "inspire_id"
:param sort_order: [string] order of the sorting either original
(for a particular field) or reversed. Supported:
'' or 'rev'
Expand All @@ -108,23 +109,41 @@ def search(query,
if query == '' and not sort_field:
sort_field = 'date'

query = HEPDataQueryParser.parse_query(query)
# Create search with preference param to ensure consistency of results across shards
search = RecordsSearch(using=os, index=index).with_preference_param()

# Determine if the query is range-based, and get it, or the default search order
range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)

# We passed the newly range-parsed query to be parsed
query = HEPDataQueryParser.parse_query(parsed_query)
fuzzy_query = QueryString(query=query, fuzziness='AUTO')

if query:
fuzzy_query = QueryString(query=query, fuzziness='AUTO')
if exclude_tables:
search.query = fuzzy_query

if query and not exclude_tables:
search.query = fuzzy_query | \
Q('has_child', type="child_datatable", query=fuzzy_query)

# Add filter to search for only "publication" objects
search = search.filter("term", doc_type=CFG_PUB_TYPE)
search = QueryBuilder.add_filters(search, filters)


if range_terms and not sort_field and not sort_order:
# Set default search keyword, and set default sort to desc
sort_field = 'recid'
sort_order = 'desc'

try:
mapped_sort_field = sort_fields_mapping(sort_field)
except ValueError as ve:
return {'error': str(ve)}

search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})

search = add_default_aggregations(search, filters)

if post_filter:
Expand All @@ -135,23 +154,25 @@ def search(query,

try:
pub_result = search.execute().to_dict()

parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
data_result = {}
if not exclude_tables:
parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
}
}
}

data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)
if query:
data_search = data_search.query(QueryString(query=query))
data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)

if query:
data_search = data_search.query(QueryString(query=query))

data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()
data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()

merged_results = merge_results(pub_result, data_result)
return map_result(merged_results, filters)
Expand All @@ -165,7 +186,7 @@ def search(query,
else:
log.error(f'An unexpected error occurred when searching: {e}')
reason = f'An unexpected error occurred: {e.error}'
return { 'error': reason }
return {'error': reason}


@author_index
Expand Down
6 changes: 6 additions & 0 deletions hepdata/ext/opensearch/config/os_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
return 'creation_date'
elif sort_by == 'latest':
return 'last_updated'
elif sort_by == 'recid':
return 'recid' # No change required
elif sort_by == 'publication_recid':
return 'publication_recid' # No change required
elif sort_by == 'inspire_id':
return 'inspire_id' # No change required
elif not sort_by or sort_by == 'relevance':
return '_score'
else:
Expand Down
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/config/record_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@
}
},
"inspire_id": {
"type": "text"
"type": "integer"
},
"keywords": {
"properties": {
Expand Down
22 changes: 19 additions & 3 deletions hepdata/ext/opensearch/process_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,26 @@
from hepdata.utils.miscellaneous import splitter


def merge_results(pub_result, data_result):
def merge_results(pub_result, data_result=None):
"""
Merge results dictionaries of publication and data table
search result data.
Data result does not exist in publication-only searches,
so defaults to None.
:param pub_result: Publication search data.
:param data_result: Data table search data.
:return: Merged search results dictionary.
"""
merge_dict = dict()
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']

# We don't need to merge if there is no data.
if data_result:
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']
else:
merge_dict['hits'] = pub_result['hits']['hits']

merge_dict['total'] = pub_result['hits']['total']['value']
merge_dict['aggregations'] = pub_result.get('aggregations', {})
return merge_dict
Expand Down
40 changes: 39 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import re
from opensearch_dsl import Q

from hepdata.config import CFG_SEARCH_RANGE_TERMS


class QueryBuilder:

Expand Down Expand Up @@ -52,7 +54,8 @@ def parse_query(query_string):
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
"resources": "resources.description", # Add shorthand for resource description
"publication_recid": "recid" # Shorthand for HEPData record ID
}
}

Expand Down Expand Up @@ -81,3 +84,38 @@ def _quote_phrase(phrase):
if '"' not in phrase and pattern.fullmatch(phrase):
return f'"{phrase}"'
return phrase

@staticmethod
def parse_range_query(query):
    """
    Detect range-based search terms in a query string.

    A range expression has the form ``<term>:[<int> TO <int>]``, where
    ``<term>`` is one of the names in CFG_SEARCH_RANGE_TERMS and appears
    either at the very start of the query or directly after whitespace.
    Examples: publication_recid:[1 TO 10] inspire_id:[123 TO 456]

    :param query: The full query string
    :return: A tuple ``(range_terms, exclude_tables, query)`` where
        ``range_terms`` lists the terms found in a range expression
        (in CFG_SEARCH_RANGE_TERMS order), ``exclude_tables`` is True when
        a range on publication_recid or inspire_id is present and no range
        on recid is, and ``query`` is the input with every occurrence of
        'publication_recid' replaced by 'recid'.
    """
    # Template for a single range expression; %s receives the term name.
    template = r"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"

    # Keep each configured term that occurs at least once as a range.
    found_terms = [
        name for name in CFG_SEARCH_RANGE_TERMS
        if re.search(template % name, query)
    ]

    # Tables are dropped only when the range targets publication-level IDs
    # and the query does not also contain a range on the plain recid term.
    targets_publications = (
        "publication_recid" in found_terms or "inspire_id" in found_terms
    )
    tables_excluded = targets_publications and "recid" not in found_terms

    # publication_recid is rewritten to recid before the query is returned.
    rewritten = query.replace("publication_recid", "recid")

    return found_terms, tables_excluded, rewritten
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,32 @@ <h4>Searching via Inspire</h4>
</ul>
</div>

<div class="well well-small">
<h4>Range-based Searching</h4>
<p>
We support searching for a range of records using their HEPData record ID or Inspire ID.
</p>
<ul>
<li>Range searching by HEPData record ID:
<ul>
<li>
<a href='/search?q=publication_recid:[1 TO 10]'
target="_new">publication_recid:[1 TO 10]</a>
</li>
</ul>
</li>
<br/>
<li>Range searching by Inspire ID:
<ul>
<li>
<a href='/search?q=inspire_id:[1 TO 10000]'
target="_new">inspire_id:[1 TO 10000]</a>
</li>
</ul>
</li>
</ul>
</div>

</div>
</div>
</div>
Expand Down
Loading

0 comments on commit b6e9df9

Please sign in to comment.