Skip to content

Commit

Permalink
refactor(lifecycle): simplify clickhouse sql logic
Browse files Browse the repository at this point in the history
This updates the SQL to be comprised of two queries, one for getting
new, returning, and resurrecting periods of activity, one for getting
dormant periods right after periods of activity.

Refers to #7382
  • Loading branch information
Harry Waye authored and macobo committed Jan 13, 2022
1 parent e717b57 commit 9fea145
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 262 deletions.
161 changes: 55 additions & 106 deletions ee/clickhouse/queries/test/__snapshots__/test_lifecycle.ambr
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# name: TestClickhouseLifecycle.test_test_account_filters_with_groups
'

WITH 'day' AS selected_period
SELECT groupArray(start_of_period) as date,
groupArray(counts) as data,
status
Expand All @@ -13,132 +13,81 @@
toUInt16(0) AS counts,
status
FROM
(SELECT toStartOfDay(toDateTime('2020-01-19 23:59:59') - number * 86400) as start_of_period
(SELECT dateTrunc(selected_period, toDateTime('2020-01-19 23:59:59') - number * 86400) as start_of_period
FROM numbers(8)
UNION ALL SELECT toStartOfDay(toDateTime('2020-01-12 00:00:00')) as start_of_period) as ticks
UNION ALL SELECT dateTrunc(selected_period, toDateTime('2020-01-12 00:00:00')) as start_of_period) as ticks
CROSS JOIN
(SELECT status
FROM
(SELECT ['new', 'returning', 'resurrecting', 'dormant'] as status) ARRAY
JOIN status) as sec
ORDER BY status,
start_of_period
UNION ALL SELECT next_period,
count(DISTINCT person_id) counts,
UNION ALL SELECT start_of_period,
count(DISTINCT group_id) counts,
status
FROM
(WITH person_activity_including_previous_period AS
(SELECT DISTINCT person_id,
toStartOfDay(events.timestamp) start_of_period
(WITH 444 AS current_team,
'day' AS selected_period, INTERVAL 1 DAY AS interval_type,
toDateTime('2020-01-12 00:00:00') AS selected_date_from,
toDateTime('2020-01-19 23:59:59') AS selected_date_to,
dateTrunc(selected_period, selected_date_from) AS selected_interval_from,
dateTrunc(selected_period, selected_date_to) AS selected_interval_to,
selected_interval_from - INTERVAL 1 DAY AS previous_date_from,
dateDiff(selected_period, previous_date_from, selected_interval_to) AS number_of_intervals,
filtered_events AS
(SELECT timestamp,
person_id AS group_id,
event
FROM events
JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) pdi ON events.distinct_id = pdi.distinct_id
WHERE team_id = 2
person_id
FROM
(SELECT distinct_id,
person_id,
is_deleted
FROM person_distinct_id2
WHERE team_id = current_team
ORDER BY distinct_id,
version DESC
LIMIT 1 BY distinct_id)
WHERE is_deleted = 0 ) person ON person.distinct_id = events.distinct_id
WHERE team_id = current_team
AND event = '$pageview'
AND $group_0 IN
(SELECT DISTINCT group_key
FROM groups
WHERE team_id = 2
AND group_type_index = 0
AND has(['value'], trim(BOTH '"'
FROM JSONExtractRaw(group_properties, 'key'))) )
GROUP BY person_id,
start_of_period
HAVING start_of_period <= toDateTime('2020-01-19 23:59:59')
AND start_of_period >= toDateTime('2020-01-11 00:00:00')),
person_activity_as_array AS
(SELECT DISTINCT person_id,
groupArray(toStartOfDay(events.timestamp)) start_of_period
FROM events
JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) pdi ON events.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND event = '$pageview'
AND $group_0 IN
(SELECT DISTINCT group_key
FROM groups
WHERE team_id = 2
AND group_type_index = 0
AND has(['value'], trim(BOTH '"'
FROM JSONExtractRaw(group_properties, 'key'))) )
AND toDateTime(events.timestamp) <= toDateTime('2020-01-19 23:59:59')
AND toStartOfDay(events.timestamp) >= toDateTime('2020-01-12 00:00:00')
GROUP BY person_id),
periods AS
(SELECT toStartOfDay(toDateTime('2020-01-19 23:59:59') - number * 86400) AS start_of_period
FROM numbers(8)) SELECT activity_pairs.person_id AS person_id,
activity_pairs.initial_period AS initial_period,
activity_pairs.next_period AS next_period,
if(initial_period = toDateTime('0000-00-00 00:00:00'), 'dormant', if(next_period = initial_period + INTERVAL 1 DAY, 'returning', if(next_period > earliest + INTERVAL 1 DAY, 'resurrecting', 'new'))) as status
FROM JSONExtractRaw(group_properties, 'key'))) ) ),
bounded_person_activity AS
(SELECT group_id,
dateTrunc(selected_period, events.timestamp) start_of_period
FROM filtered_events events
WHERE events.timestamp <= selected_date_to + interval_type
AND events.timestamp >= previous_date_from
LIMIT 1 BY group_id,
start_of_period) SELECT *
FROM
(SELECT person_id,
initial_period,
min(next_period) as next_period
FROM
(SELECT person_id,
base.start_of_period as initial_period,
subsequent.start_of_period as next_period
FROM person_activity_including_previous_period base
JOIN person_activity_including_previous_period subsequent ON base.person_id = subsequent.person_id
WHERE subsequent.start_of_period > base.start_of_period )
GROUP BY person_id,
initial_period
UNION ALL SELECT base.person_id,
min(base.start_of_period) as initial_period,
min(base.start_of_period) as next_period
FROM person_activity_including_previous_period base
GROUP BY person_id
UNION ALL SELECT person_id,
initial_period,
next_period
FROM
(SELECT person_activity.person_id AS person_id,
toDateTime('0000-00-00 00:00:00') as initial_period,
periods.start_of_period as next_period
FROM person_activity_as_array as person_activity
CROSS JOIN periods
WHERE has(person_activity.start_of_period, periods.start_of_period) = 0
ORDER BY person_id,
periods.start_of_period ASC)
WHERE ((empty(toString(neighbor(person_id, -1)))
OR neighbor(person_id, -1) != person_id)
AND next_period != toStartOfDay(toDateTime('2020-01-12 00:00:00') + INTERVAL 1 DAY - INTERVAL 1 HOUR))
OR ((neighbor(person_id, -1) = person_id)
AND neighbor(next_period, -1) < next_period - INTERVAL 1 DAY) ) activity_pairs
JOIN
(SELECT DISTINCT person_id,
toStartOfDay(min(events.timestamp)) earliest
FROM events
JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) pdi ON events.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND event = '$pageview'
AND $group_0 IN
(SELECT DISTINCT group_key
FROM groups
WHERE team_id = 2
AND group_type_index = 0
AND has(['value'], trim(BOTH '"'
FROM JSONExtractRaw(group_properties, 'key'))) )
GROUP BY person_id) earliest ON activity_pairs.person_id = earliest.person_id)
WHERE next_period <= toDateTime('2020-01-19 23:59:59')
AND next_period >= toDateTime('2020-01-12 00:00:00')
GROUP BY next_period,
(SELECT group_id,
target.start_of_period as start_of_period,
if(previous_activity.group_id = '00000000-0000-0000-0000-000000000000', 'new', if(dateDiff(selected_period, previous_activity.timestamp, target.start_of_period) > 1, 'resurrecting', 'returning')) as status
FROM bounded_person_activity target ASOF
LEFT JOIN filtered_events previous_activity ON previous_activity.group_id = target.group_id
AND target.start_of_period > previous_activity.timestamp
UNION ALL SELECT group_id,
activity_before_target.start_of_period + interval_type AS start_of_period,
'dormant' AS status
FROM bounded_person_activity activity_before_target ASOF
LEFT JOIN filtered_events next_activity ON activity_before_target.group_id = next_activity.group_id
AND next_activity.timestamp > activity_before_target.start_of_period + interval_type
WHERE next_activity.group_id = '00000000-0000-0000-0000-000000000000'
OR dateDiff(selected_period, activity_before_target.start_of_period, next_activity.timestamp) > 1 ) activity_pairs)
WHERE start_of_period <= toDateTime('2020-01-19 23:59:59')
AND start_of_period >= toDateTime('2020-01-12 00:00:00')
GROUP BY start_of_period,
status)
GROUP BY start_of_period,
status
Expand Down
4 changes: 4 additions & 0 deletions ee/clickhouse/queries/trends/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def _format_lifecycle_query(self, entity: Entity, filter: Filter, team_id: int)
return (
LIFECYCLE_SQL.format(
interval=interval_string,
interval_keyword=interval_string[2:],
trunc_func=trunc_func,
event_query=event_query,
filters=prop_filters,
Expand All @@ -75,6 +76,7 @@ def _format_lifecycle_query(self, entity: Entity, filter: Filter, team_id: int)
"prev_date_from": (date_from - interval_increment).strftime(
"%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" else " 00:00:00")
),
"interval": filter.interval,
"num_intervals": num_intervals,
"seconds_in_interval": seconds_in_interval,
**event_params,
Expand Down Expand Up @@ -139,6 +141,7 @@ def get_people(
result = sync_execute(
LIFECYCLE_PEOPLE_SQL.format(
interval=interval_string,
interval_keyword=interval_string[2:],
trunc_func=trunc_func,
event_query=event_query,
filters=prop_filters,
Expand All @@ -150,6 +153,7 @@ def get_people(
"prev_date_from": (date_from - interval_increment).strftime(
"%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" else " 00:00:00")
),
"interval": filter.interval,
"num_intervals": num_intervals,
"seconds_in_interval": seconds_in_interval,
**event_params,
Expand Down
Loading

0 comments on commit 9fea145

Please sign in to comment.