Do not record sortedness in the StreamDescriptor when sorting on a no…

…n-ts index
man-group · Jan 20, 2025 · 268a601 · 268a601
1 parent cfdc854
commit 268a601
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 1 deletion.
diff --git a/cpp/arcticdb/stream/incompletes.cpp b/cpp/arcticdb/stream/incompletes.cpp
@@ -261,7 +261,9 @@ void do_sort(SegmentInMemory& mutable_seg, const std::vector<std::string> sort_c
     auto tsd = pack_timeseries_descriptor(frame->desc, frame->num_rows, std::nullopt, std::move(norm_meta));
     segment.set_timeseries_descriptor(tsd);
 
-    bool is_sorted = options.sort_on_index || (options.sort_columns && options.sort_columns->at(0) == segment.descriptor().field(0).name());
+    bool is_timestamp_index = std::holds_alternative<stream::TimeseriesIndex>(frame->index);
+    bool is_sorted = is_timestamp_index && (options.sort_on_index || (
+        is_timestamp_index && options.sort_columns && options.sort_columns->at(0) == segment.descriptor().field(0).name()));
 
     // Have to give each its own string pool for thread safety
     bool filter_down_stringpool{true};

diff --git a/cpp/arcticdb/stream/python_bindings.cpp b/cpp/arcticdb/stream/python_bindings.cpp
@@ -120,6 +120,7 @@ void register_types(py::module &m) {
         .def("fields", [](const StreamDescriptor& desc){
             return field_collection_to_ref_vector(desc.fields());
         })
+        .def("sorted", &StreamDescriptor::sorted)
     );
 
     py::class_<TimeseriesDescriptor>(m, "TimeseriesDescriptor")

diff --git a/python/tests/unit/arcticdb/version_store/test_sort.py b/python/tests/unit/arcticdb/version_store/test_sort.py
@@ -6,6 +6,7 @@
 import string
 
 from arcticdb_ext.storage import KeyType
+from arcticdb_ext.version_store import SortedValue
 
 
 def test_stage_finalize(arctic_library):
@@ -194,6 +195,7 @@ def test_stage_with_sort_index_chunking(lmdb_version_store_tiny_segment):
         df = lib_tool.read_to_dataframe(k)
         assert df.shape == (2, 3)  # we should apply row slicing but not column slicing
         assert df.index.is_monotonic_increasing
+        assert lib_tool.read_descriptor(k).sorted() == SortedValue.ASCENDING
 
     ref_keys = lib_tool.find_keys_for_symbol(KeyType.APPEND_REF, symbol)
     assert not ref_keys
@@ -204,6 +206,31 @@ def test_stage_with_sort_index_chunking(lmdb_version_store_tiny_segment):
     assert_frame_equal(df1, actual)
 
 
+def test_stage_with_sort_columns_not_ts(lmdb_version_store_v1):
+    symbol = "AAPL"
+    lib = lmdb_version_store_v1
+
+    df1 = pd.DataFrame({
+        "idx": np.arange(1, 51),
+        "col1": np.arange(1, 51),
+        "col2": [f"a{i:02d}" for i in range(1, 51)],
+        "col3": np.arange(1, 51)
+    }).set_index("idx")
+    df1_shuffled = df1.sample(frac=1)
+
+    lib.stage(symbol, df1_shuffled, validate_index=False, sort_on_index=False, sort_columns=["idx"])
+
+    lib_tool = lib.library_tool()
+    data_keys = lib_tool.find_keys_for_symbol(KeyType.APPEND_DATA, symbol)
+    assert len(data_keys) == 1
+    data_key = data_keys[0]
+    assert lib_tool.read_descriptor(data_key).sorted() == SortedValue.UNKNOWN
+
+    lib.compact_incomplete(symbol, append=False, convert_int_to_float=False)
+    actual = lib.read(symbol).data
+    assert_frame_equal(df1, actual)
+
+
 def test_stage_finalize_dynamic_with_chunking(arctic_client, lib_name):
     lib_opts = adb.LibraryOptions(dynamic_schema=True, rows_per_segment=2, columns_per_segment=2)
     lib = arctic_client.get_library(lib_name, create_if_missing=True, library_options=lib_opts)