Merge pull request #62 from NatLibFi/simplye-345/scandinavian-sorting…

…-in-search Adjust search sort field analyzers to work with scandinavian letters
NatLibFi · Apr 25, 2024 · 6aea2ae · 6aea2ae
2 parents 9430f97 + 2b8ab5c
commit 6aea2ae
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 5 deletions.
diff --git a/core/search/document.py b/core/search/document.py
@@ -107,6 +107,14 @@ def sort_author_keyword() -> SearchMappingFieldTypeParameterized:
     return t
 
 
+# Finland: added for correct sorting of Scandinavian letters in titles
+def sort_title_keyword() -> SearchMappingFieldTypeParameterized:
+    t = SearchMappingFieldTypeParameterized("text")
+    t.parameters["analyzer"] = "sort_title_analyzer"
+    t.parameters["fielddata"] = "true"
+    return t
+
+
 class SearchMappingFieldTypeObject(SearchMappingFieldType):
     """See: https://opensearch.org/docs/latest/field-types/supported-field-types/object/"""
 

diff --git a/core/search/v5.py b/core/search/v5.py
@@ -7,10 +7,10 @@
     LONG,
     SearchMappingDocument,
     SearchMappingFieldType,
-    icu_collation_keyword,
     keyword,
     nested,
     sort_author_keyword,
+    sort_title_keyword,
 )
 from core.search.revision import SearchSchemaRevision
 
@@ -208,8 +208,9 @@ def __init__(self):
         # Here's a special filter used only by that analyzer. It
         # duplicates the filter used by the icu_collation_keyword data
         # type.
-        self._filters["en_sortable_filter"] = dict(
-            type="icu_collation", language="en", country="US"
+        # Finland: change language to fi to correctly handle Scandinavian letters
+        self._filters["sortable_filter"] = dict(
+            type="icu_collation", language="fi", country="FI"
         )
 
         # Here's the analyzer used by the 'sort_author' property.
@@ -221,10 +222,16 @@ def __init__(self):
         # fields can't specify char_filter.
         self._analyzers["en_sort_author_analyzer"] = dict(
             tokenizer="keyword",
-            filter=["en_sortable_filter"],
+            filter=["sortable_filter"],
             char_filter=self.AUTHOR_CHAR_FILTER_NAMES,
         )
 
+        # Finland: analyzer named by sort_title_keyword(); reuses the collation filter of the sort_author analyzer.
+        self._analyzers["sort_title_analyzer"] = dict(
+            tokenizer="keyword",
+            filter=["sortable_filter"],
+        )
+
         self._fields: dict[str, SearchMappingFieldType] = {
             "summary": BASIC_TEXT,
             "title": FILTERABLE_TEXT,
@@ -235,7 +242,7 @@ def __init__(self):
             "publisher": FILTERABLE_TEXT,
             "imprint": FILTERABLE_TEXT,
             "presentation_ready": BOOLEAN,
-            "sort_title": icu_collation_keyword(),
+            "sort_title": sort_title_keyword(),
             "sort_author": sort_author_keyword(),
             "series_position": INTEGER,
             "work_id": INTEGER,