Merge pull request #819 from HEPData/search-range

Add support for range-based ID searching
HEPData · Dec 4, 2024 · b6e9df9 · b6e9df9
2 parents 50a2498 + 16d3e8c
commit b6e9df9
Show file tree

Hide file tree

Showing 8 changed files with 471 additions and 29 deletions.
diff --git a/hepdata/config.py b/hepdata/config.py
@@ -193,6 +193,7 @@ def _(x):
 CFG_DATA_TYPE = 'datatable'
 CFG_SUBMISSIONS_TYPE = 'submission'
 CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
+CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"]  # Terms that can be used in OpenSearch API range searches
 
 CFG_CONVERTER_URL = 'https://converter.hepdata.net'
 CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']

diff --git a/hepdata/ext/opensearch/api.py b/hepdata/ext/opensearch/api.py
@@ -96,8 +96,9 @@ def search(query,
                     ('collaboration', collaboration_name), ('date', date)
     :param size: [int] max number of hits that should be returned
     :param offset: [int] offset for the results (used for pagination)
-    :param sort_by: [string] sorting field. Currently supported fields:
-                    "title", "collaboration", "date", "relevance"
+    :param sort_field: [string] sorting field. Currently supported fields:
+                    "title", "collaboration", "date", "relevance",
+                    "recid", "inspire_id"
     :param sort_order: [string] order of the sorting either original
                     (for a particular field) or reversed. Supported:
                     '' or 'rev'
@@ -108,23 +109,41 @@ def search(query,
     if query == '' and not sort_field:
         sort_field = 'date'
 
-    query = HEPDataQueryParser.parse_query(query)
     # Create search with preference param to ensure consistency of results across shards
     search = RecordsSearch(using=os, index=index).with_preference_param()
 
+    # Find any range-search terms in the query, whether data tables should be excluded, and the rewritten query
+    range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)
+
+    # Pass the query returned by the range parser on to the standard query parser
+    query = HEPDataQueryParser.parse_query(parsed_query)
+    fuzzy_query = QueryString(query=query, fuzziness='AUTO')
+
     if query:
-        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
+        if exclude_tables:
+            search.query = fuzzy_query
+
+    if query and not exclude_tables:
         search.query = fuzzy_query | \
                        Q('has_child', type="child_datatable", query=fuzzy_query)
 
+    # Add filter to search for only "publication" objects
     search = search.filter("term", doc_type=CFG_PUB_TYPE)
     search = QueryBuilder.add_filters(search, filters)
 
+
+    if range_terms and not sort_field and not sort_order:
+        # Range searches with no explicit sort field or order default to sorting by 'recid' with order 'desc'
+        sort_field = 'recid'
+        sort_order = 'desc'
+
     try:
         mapped_sort_field = sort_fields_mapping(sort_field)
     except ValueError as ve:
         return {'error': str(ve)}
+
     search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})
+
     search = add_default_aggregations(search, filters)
 
     if post_filter:
@@ -135,23 +154,25 @@ def search(query,
 
     try:
         pub_result = search.execute().to_dict()
-
-        parent_filter = {
-            "terms": {
-                        "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
+        data_result = {}
+        if not exclude_tables:
+            parent_filter = {
+                "terms": {
+                            "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
+                }
             }
-        }
 
-        data_search = RecordsSearch(using=os, index=index)
-        data_search = data_search.query('has_parent',
-                                        parent_type="parent_publication",
-                                        query=parent_filter)
-        if query:
-            data_search = data_search.query(QueryString(query=query))
+            data_search = RecordsSearch(using=os, index=index)
+            data_search = data_search.query('has_parent',
+                                                parent_type="parent_publication",
+                                                query=parent_filter)
+
+            if query:
+                data_search = data_search.query(QueryString(query=query))
 
-        data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
-        data_search = data_search[0:data_search_size]
-        data_result = data_search.execute().to_dict()
+            data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
+            data_search = data_search[0:data_search_size]
+            data_result = data_search.execute().to_dict()
 
         merged_results = merge_results(pub_result, data_result)
         return map_result(merged_results, filters)
@@ -165,7 +186,7 @@ def search(query,
         else:
             log.error(f'An unexpected error occurred when searching: {e}')
             reason = f'An unexpected error occurred: {e.error}'
-        return { 'error': reason }
+        return {'error': reason}
 
 
 @author_index

diff --git a/hepdata/ext/opensearch/config/os_config.py b/hepdata/ext/opensearch/config/os_config.py
@@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
         return 'creation_date'
     elif sort_by == 'latest':
         return 'last_updated'
+    elif sort_by == 'recid':
+        return 'recid'  # No change required
+    elif sort_by == 'publication_recid':
+        return 'publication_recid'  # No change required
+    elif sort_by == 'inspire_id':
+        return 'inspire_id'  # No change required
     elif not sort_by or sort_by == 'relevance':
         return '_score'
     else:

diff --git a/hepdata/ext/opensearch/config/record_mapping.py b/hepdata/ext/opensearch/config/record_mapping.py
@@ -171,7 +171,7 @@
         }
     },
     "inspire_id": {
-        "type": "text"
+        "type": "integer"
     },
     "keywords": {
         "properties": {

diff --git a/hepdata/ext/opensearch/process_results.py b/hepdata/ext/opensearch/process_results.py
@@ -27,10 +27,26 @@
 from hepdata.utils.miscellaneous import splitter
 
 
-def merge_results(pub_result, data_result):
+def merge_results(pub_result, data_result=None):
+    """
+    Merge results dictionaries of publication and data table
+      search result data.
+    Data result does not exist in publication-only searches,
+      so defaults to None.
+
+    :param pub_result: Publication search data.
+    :param data_result: Data table search data.
+    :return: Merged search results dictionary.
+    """
     merge_dict = dict()
-    merge_dict['hits'] = pub_result['hits']['hits'] + \
-        data_result['hits']['hits']
+
+    # We don't need to merge if there is no data.
+    if data_result:
+        merge_dict['hits'] = pub_result['hits']['hits'] + \
+            data_result['hits']['hits']
+    else:
+        merge_dict['hits'] = pub_result['hits']['hits']
+
     merge_dict['total'] = pub_result['hits']['total']['value']
     merge_dict['aggregations'] = pub_result.get('aggregations', {})
     return merge_dict

diff --git a/hepdata/ext/opensearch/query_builder.py b/hepdata/ext/opensearch/query_builder.py
@@ -23,6 +23,8 @@
 import re
 from opensearch_dsl import Q
 
+from hepdata.config import CFG_SEARCH_RANGE_TERMS
+
 
 class QueryBuilder:
 
@@ -52,7 +54,8 @@ def parse_query(query_string):
                 "phrases": "data_keywords.phrases",
                 "reactions": "data_keywords.reactions",
                 "analysis": "analyses.type",
-                "resources": "resources.description"  # Add shorthand for resource description
+                "resources": "resources.description",  # Add shorthand for resource description
+                "publication_recid": "recid"  # Shorthand for HEPData record ID
             }
         }
 
@@ -81,3 +84,38 @@ def _quote_phrase(phrase):
         if '"' not in phrase and pattern.fullmatch(phrase):
             return f'"{phrase}"'
         return phrase
+
+    @staticmethod
+    def parse_range_query(query):
+        """
+            Checks whether a query string contains range-based queries on any of the
+            terms listed in CFG_SEARCH_RANGE_TERMS, and collects the terms that match.
+            Also determines if the query is a publication-only search, where tables are excluded.
+            Returns the query with 'publication_recid' replaced with 'recid' for OpenSearch.
+
+            Examples: publication_recid:[321 TO 321] inspire_id:[123 TO 123]
+
+            :param query: The full query string
+            :return: A tuple containing a list of the range terms found in the query,
+                a boolean determining whether table exclusion should occur (if a range term is
+                publication_recid or inspire_id, and recid is not also a range term),
+                and the query with 'publication_recid' replaced by 'recid'.
+        """
+        # Regex matching "<term>:[N TO M]" as in the docstring examples; %s is a placeholder for the term name
+        pattern = rf"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"
+        range_terms = []
+        exclude_tables = False
+        # For all terms that can be range searched
+        for term in CFG_SEARCH_RANGE_TERMS:
+            result = re.findall(pattern % term, query)
+            if result:
+                range_terms.append(term)
+
+        # If we are doing a range search on non-table objects
+        if ("publication_recid" in range_terms or "inspire_id" in range_terms) and "recid" not in range_terms:
+            exclude_tables = True
+
+        # Finally, we replace publication_recid with the correct mapping for OpenSearch
+        query = query.replace("publication_recid", "recid")
+
+        return range_terms, exclude_tables, query
diff --git a/hepdata/modules/search/templates/hepdata_search/modals/search_help.html b/hepdata/modules/search/templates/hepdata_search/modals/search_help.html
@@ -279,6 +279,32 @@ <h4>Searching via Inspire</h4>
                     </ul>
                 </div>
 
+                <div class="well well-small">
+                    <h4>Range-based Searching</h4>
+                  <p>
+                    We support searching for a range of records using their HEPData record ID or Inspire ID.
+                  </p>
+                    <ul>
+                        <li>Range searching by HEPData record ID:
+                            <ul>
+                                <li>
+                                    <a href='/search?q=publication_recid:[1 TO 10]'
+                                       target="_new">publication_recid:[1 TO 10]</a>
+                                </li>
+                            </ul>
+                        </li>
+                        <br/>
+                        <li>Range searching by Inspire ID:
+                            <ul>
+                                <li>
+                                    <a href='/search?q=inspire_id:[1 TO 10000]'
+                                       target="_new">inspire_id:[1 TO 10000]</a>
+                                </li>
+                            </ul>
+                        </li>
+                    </ul>
+                </div>
+
             </div>
         </div>
     </div>