From 60a27932460f0cbb3a3741d5ef605cd9679b5d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20H=C3=B6ning?= Date: Wed, 17 Jul 2024 19:24:26 +0200 Subject: [PATCH] Fast track strategy for searching single recent beliefs (#179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add most_recent_event_only and most_recent_belief_only search filter strategy for fast access to single recent beliefs Signed-off-by: Nicolas Höning * add index for improving most_recent_event_only use case Signed-off-by: Nicolas Höning * Fix belief sorting order Co-authored-by: Felix Claessen <30658763+Flix6x@users.noreply.github.com> * docs/comment improvements from review Signed-off-by: Nicolas Höning * Use only most_recent_only filter, apply also in special case that has the same one-row outcome, and improve the docstring's section about speed Signed-off-by: Nicolas Höning * move sensor lookup further up, as error raising for no sources uses a sensor object Signed-off-by: Nicolas Höning --------- Signed-off-by: Nicolas Höning Co-authored-by: Felix Claessen <30658763+Flix6x@users.noreply.github.com> --- timely_beliefs/beliefs/classes.py | 60 ++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/timely_beliefs/beliefs/classes.py b/timely_beliefs/beliefs/classes.py index dfee939..3d4bdbf 100644 --- a/timely_beliefs/beliefs/classes.py +++ b/timely_beliefs/beliefs/classes.py @@ -191,9 +191,14 @@ def __table_args__(cls): "sensor_id", "source_id", postgresql_include=[ - "belief_horizon", # we use min() on this + "belief_horizon", # we use min() on this (most_recent_beliefs_only) ], ), + Index( + f"{cls.__tablename__}_search_session_singleevent_idx", + "event_start", + "sensor_id", + ), ) event_start = Column(DateTime(timezone=True), primary_key=True, index=True) @@ -342,6 +347,7 @@ def search_session( # noqa: C901 source: BeliefSource | list[BeliefSource] | None = None, most_recent_beliefs_only: bool = False, most_recent_events_only: bool = False, + most_recent_only: bool = False, place_beliefs_in_sensor_timezone: bool = True, place_events_in_sensor_timezone: bool = True, custom_filter_criteria: list[BinaryExpression] | None = None, @@ -352,6 +358,14 @@ def search_session( # noqa: C901 The optional arguments represent optional filters, with two exceptions: - sensor_class makes it possible to create a query on sensor subclasses - custom_join_targets makes it possible to add custom filters using other (incl. subclassed) targets + + A word about query speed: + As data sets can become quite large (and get wrapped into a BeliefsDataFrame), we recommend to filter by sensor and source, as well as a time range, in order to limit processing time. + You should also consider selecting only the most recent beliefs (for each event, there might have been more than one). + Also, look into selecting only the most recent events (per source). + These two latter tips decrease the dataset and thus post-processing time. They use sub-queries, though, so be sure to use the main filtering, as well, like time range. + Finally, if you only need one most recent value (one result row), there is a fast-track (most_recent_only). + :param session: the database session to use :param sensor: sensor to which the beliefs pertain, or its unique sensor id :param sensor_class: optionally pass the sensor (sub)class explicitly (only needed if you pass a sensor id instead of a sensor, and your sensor class is not DBSensor); the class should be mapped to a database table @@ -365,9 +379,10 @@ def search_session( # noqa: C901 :param beliefs_before: only return beliefs formed before this datetime (inclusive) :param horizons_at_least: only return beliefs with a belief horizon equal or greater than this timedelta (for example, use timedelta(0) to get ante knowledge time beliefs) :param horizons_at_most: only return beliefs with a belief horizon equal or less than this timedelta (for example, use timedelta(0) to get post knowledge time beliefs) - :param source: only return beliefs formed by the given source or list of sources + :param source: only return beliefs formed by the given source or list of sources. This speeds up your query, so if you know the source, let the query know. :param most_recent_beliefs_only: only return the most recent beliefs for each event from each source (minimum belief horizon) - :param most_recent_events_only: only return (post knowledge time) beliefs for the most recent event (maximum event start) + :param most_recent_events_only: only return (post knowledge time) beliefs for the most recent event (maximum event start) for each source + :param most_recent_only: only looks up one most recent event and then only returns the most recent belief about that event that it finds. This is a considerable fast-track if only one most recent belief is all you need. :param place_beliefs_in_sensor_timezone: if True (the default), belief times are converted to the timezone of the sensor :param place_events_in_sensor_timezone: if True (the default), event starts are converted to the timezone of the sensor :param custom_filter_criteria: additional filters, such as ones that rely on subclasses @@ -415,9 +430,13 @@ def search_session( # noqa: C901 if sensor is None: raise ValueError("No such sensor") - # Fast-track empty list of sources - if source == []: - return BeliefsDataFrame(sensor=sensor, beliefs=[]) + # Parse source parameter + sources: list = [] + if source is not None: + sources = [source] if not isinstance(source, list) else source + # Fast-track empty list of sources + if sources == []: + return BeliefsDataFrame(sensor=sensor, beliefs=[]) # Get bounds on the knowledge horizon (so we can already roughly filter by belief time) ( @@ -514,10 +533,24 @@ def apply_belief_timing_filters(q): q = apply_belief_timing_filters(q) # Apply source filter - if source is not None: - sources: list = [source] if not isinstance(source, list) else source + if len(sources) > 0: q = q.join(source_class).filter(cls.source_id.in_([s.id for s in sources])) + # Switch to fast-track if user wants both most recent events & beliefs and one source is requested. + # In this case, we know only one row will be returned. (If only one source exists in the to-be-returned dataset, + # we'd get one row without filtering for source, but we cannot know if that happens before we query.) + if (most_recent_beliefs_only and most_recent_events_only) and len(sources) == 1: + most_recent_only = True + most_recent_events_only = False + most_recent_beliefs_only = False + # Otherwise, we should not allow to mix the two most-recent-X approaches, for clarity + elif (most_recent_beliefs_only or most_recent_events_only) and ( + most_recent_only + ): + raise ValueError( + "most_recent_events|beliefs_only can not be used with most_recent_only." + ) + # Apply most recent beliefs filter as subquery most_recent_beliefs_only_incompatible_criteria = ( beliefs_before is not None or beliefs_after is not None @@ -575,8 +608,19 @@ def apply_belief_timing_filters(q): ), ) + # Apply fast-track most-recent-only approach + # Note that currently, this only works for a deterministic belief. A probabilistic belief would have multiple rows + # (sharing the same event start and belief horizon). + if most_recent_only: + q = q.order_by(cls.event_start.desc(), cls.belief_horizon.asc()).limit(1) + + # Useful debugging code, let's keep it here + # from sqlalchemy.dialects import postgresql + # print(q.compile(dialect=postgresql.dialect())) + # Build our DataFrame of beliefs df = pd.DataFrame(session.execute(q)) + if df.empty: return BeliefsDataFrame(sensor=sensor) df.columns = [