From 60a27932460f0cbb3a3741d5ef605cd9679b5d5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20H=C3=B6ning?= <nicolas@seita.nl>
Date: Wed, 17 Jul 2024 19:24:26 +0200
Subject: [PATCH] Fast track strategy for searching single recent beliefs
 (#179)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add most_recent_event_only and most_recent_belief_only search filter strategy for fast access to single recent beliefs

Signed-off-by: Nicolas Höning <nicolas@seita.nl>

* add index for improving most_recent_event_only use case

Signed-off-by: Nicolas Höning <nicolas@seita.nl>

* Fix belief sorting order

Co-authored-by: Felix Claessen <30658763+Flix6x@users.noreply.github.com>

* docs/comment improvements from review

Signed-off-by: Nicolas Höning <nicolas@seita.nl>

* Use only most_recent_only filter, apply also in special case that has the same one-row outcome, and improve the docstring's section about speed

Signed-off-by: Nicolas Höning <nicolas@seita.nl>

* move sensor lookup further up, as error raising for no sources uses a sensor object

Signed-off-by: Nicolas Höning <nicolas@seita.nl>

---------

Signed-off-by: Nicolas Höning <nicolas@seita.nl>
Co-authored-by: Felix Claessen <30658763+Flix6x@users.noreply.github.com>
---
 timely_beliefs/beliefs/classes.py | 60 ++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 8 deletions(-)

diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py
index dfee939..3d4bdbf 100644
--- a/timely_beliefs/beliefs/classes.py
+++ b/timely_beliefs/beliefs/classes.py
@@ -191,9 +191,14 @@ def __table_args__(cls):
                 "sensor_id",
                 "source_id",
                 postgresql_include=[
-                    "belief_horizon",  # we use min() on this
+                    "belief_horizon",  # we use min() on this (most_recent_beliefs_only)
                 ],
             ),
+            Index(
+                f"{cls.__tablename__}_search_session_singleevent_idx",
+                "event_start",
+                "sensor_id",
+            ),
         )
 
     event_start = Column(DateTime(timezone=True), primary_key=True, index=True)
@@ -342,6 +347,7 @@ def search_session(  # noqa: C901
         source: BeliefSource | list[BeliefSource] | None = None,
         most_recent_beliefs_only: bool = False,
         most_recent_events_only: bool = False,
+        most_recent_only: bool = False,
         place_beliefs_in_sensor_timezone: bool = True,
         place_events_in_sensor_timezone: bool = True,
         custom_filter_criteria: list[BinaryExpression] | None = None,
@@ -352,6 +358,14 @@ def search_session(  # noqa: C901
         The optional arguments represent optional filters, with two exceptions:
         - sensor_class makes it possible to create a query on sensor subclasses
         - custom_join_targets makes it possible to add custom filters using other (incl. subclassed) targets
+
+        A word about query speed:
+        As data sets can become quite large (and get wrapped into a BeliefsDataFrame), we recommend to filter by sensor and source, as well as a time range, in order to limit processing time.
+        You should also consider selecting only the most recent beliefs (for each event, there might have been more than one).
+        Also, look into selecting only the most recent events (per source).
+        These two latter tips decrease the dataset and thus post-processing time. They use sub-queries, though, so be sure to use the main filtering, as well, like time range.
+        Finally, if you only need one most recent value (one result row), there is a fast-track (most_recent_only).
+
         :param session: the database session to use
         :param sensor: sensor to which the beliefs pertain, or its unique sensor id
         :param sensor_class: optionally pass the sensor (sub)class explicitly (only needed if you pass a sensor id instead of a sensor, and your sensor class is not DBSensor); the class should be mapped to a database table
@@ -365,9 +379,10 @@ def search_session(  # noqa: C901
         :param beliefs_before: only return beliefs formed before this datetime (inclusive)
         :param horizons_at_least: only return beliefs with a belief horizon equal or greater than this timedelta (for example, use timedelta(0) to get ante knowledge time beliefs)
         :param horizons_at_most: only return beliefs with a belief horizon equal or less than this timedelta (for example, use timedelta(0) to get post knowledge time beliefs)
-        :param source: only return beliefs formed by the given source or list of sources
+        :param source: only return beliefs formed by the given source or list of sources. This speeds up your query, so if you know the source, let the query know.
         :param most_recent_beliefs_only: only return the most recent beliefs for each event from each source (minimum belief horizon)
-        :param most_recent_events_only: only return (post knowledge time) beliefs for the most recent event (maximum event start)
+        :param most_recent_events_only: only return (post knowledge time) beliefs for the most recent event (maximum event start) for each source
+        :param most_recent_only: only looks up one most recent event and then only returns the most recent belief about that event that it finds. This is a considerable fast-track if only one most recent belief is all you need.
         :param place_beliefs_in_sensor_timezone: if True (the default), belief times are converted to the timezone of the sensor
         :param place_events_in_sensor_timezone: if True (the default), event starts are converted to the timezone of the sensor
         :param custom_filter_criteria: additional filters, such as ones that rely on subclasses
@@ -415,9 +430,13 @@ def search_session(  # noqa: C901
             if sensor is None:
                 raise ValueError("No such sensor")
 
-        # Fast-track empty list of sources
-        if source == []:
-            return BeliefsDataFrame(sensor=sensor, beliefs=[])
+        # Parse source parameter
+        sources: list = []
+        if source is not None:
+            sources = [source] if not isinstance(source, list) else source
+            # Fast-track empty list of sources
+            if sources == []:
+                return BeliefsDataFrame(sensor=sensor, beliefs=[])
 
         # Get bounds on the knowledge horizon (so we can already roughly filter by belief time)
         (
@@ -514,10 +533,24 @@ def apply_belief_timing_filters(q):
         q = apply_belief_timing_filters(q)
 
         # Apply source filter
-        if source is not None:
-            sources: list = [source] if not isinstance(source, list) else source
+        if len(sources) > 0:
             q = q.join(source_class).filter(cls.source_id.in_([s.id for s in sources]))
 
+        # Switch to fast-track if user wants both most recent events & beliefs and one source is requested.
+        # In this case, we know only one row will be returned. (If only one source exists in the to-be-returned dataset,
+        #  we'd get one row without filtering for source, but we cannot know if that happens before we query.)
+        if (most_recent_beliefs_only and most_recent_events_only) and len(sources) == 1:
+            most_recent_only = True
+            most_recent_events_only = False
+            most_recent_beliefs_only = False
+        # Otherwise, we should not allow to mix the two most-recent-X approaches, for clarity
+        elif (most_recent_beliefs_only or most_recent_events_only) and (
+            most_recent_only
+        ):
+            raise ValueError(
+                "most_recent_events|beliefs_only can not be used with most_recent_only."
+            )
+
         # Apply most recent beliefs filter as subquery
         most_recent_beliefs_only_incompatible_criteria = (
             beliefs_before is not None or beliefs_after is not None
@@ -575,8 +608,19 @@ def apply_belief_timing_filters(q):
                 ),
             )
 
+        # Apply fast-track most-recent-only approach
+        # Note that currently, this only works for a deterministic belief. A probabilistic belief would have multiple rows
+        # (sharing the same event start and belief horizon).
+        if most_recent_only:
+            q = q.order_by(cls.event_start.desc(), cls.belief_horizon.asc()).limit(1)
+
+        # Useful debugging code, let's keep it here
+        # from sqlalchemy.dialects import postgresql
+        # print(q.compile(dialect=postgresql.dialect()))
+
         # Build our DataFrame of beliefs
         df = pd.DataFrame(session.execute(q))
+
         if df.empty:
             return BeliefsDataFrame(sensor=sensor)
         df.columns = [