From 2557531f0f60ee1891a74babed54800cb1bcfd06 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 22 Jan 2019 11:55:53 +0000 Subject: [PATCH 1/4] Fix bug when removing duplicate rows from user_ips This was caused by accidentally overwritting a `last_seen` variable in a for loop, causing the wrong value to be written to the progress table. The result of which was that we didn't scan sections of the table when searching for duplicates, and so some duplicates did not get deleted. --- synapse/storage/client_ips.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 5d548f250a2b..9548cfd3f8f9 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -110,8 +110,13 @@ def f(conn): @defer.inlineCallbacks def _remove_user_ip_dupes(self, progress, batch_size): + # This works function works by scanning the user_ips table in batches + # based on `last_seen`. For each row in a batch it searches the rest of + # the table to see if there are any duplicates, if there are then they + # are removed and replaced with a suitable row. - last_seen_progress = progress.get("last_seen", 0) + # Fetch the start of the batch + begin_last_seen = progress.get("last_seen", 0) def get_last_seen(txn): txn.execute( @@ -122,17 +127,20 @@ def get_last_seen(txn): LIMIT 1 OFFSET ? """, - (last_seen_progress, batch_size) + (begin_last_seen, batch_size) ) - results = txn.fetchone() - return results - - # Get a last seen that's sufficiently far away enough from the last one - last_seen = yield self.runInteraction( + row = txn.fetchone() + if row: + return row[0] + else: + return None + + # Get a last seen that has roughly `batch_size` since `begin_last_seen` + end_last_seen = yield self.runInteraction( "user_ips_dups_get_last_seen", get_last_seen ) - if not last_seen: + if end_last_seen is None: # If we get a None then we're reaching the end and just need to # delete the last batch. last = True @@ -142,9 +150,8 @@ def get_last_seen(txn): last_seen = int(self.clock.time_msec()) * 2 else: last = False - last_seen = last_seen[0] - def remove(txn, last_seen_progress, last_seen): + def remove(txn, begin_last_seen, end_last_seen): # This works by looking at all entries in the given time span, and # then for each (user_id, access_token, ip) tuple in that range # checking for any duplicates in the rest of the table (via a join). @@ -166,7 +173,7 @@ def remove(txn, last_seen_progress, last_seen): INNER JOIN user_ips USING (user_id, access_token, ip) GROUP BY user_id, access_token, ip HAVING count(*) > 1""", - (last_seen_progress, last_seen) + (begin_last_seen, end_last_seen) ) res = txn.fetchall() @@ -194,11 +201,11 @@ def remove(txn, last_seen_progress, last_seen): ) self._background_update_progress_txn( - txn, "user_ips_remove_dupes", {"last_seen": last_seen} + txn, "user_ips_remove_dupes", {"last_seen": end_last_seen} ) yield self.runInteraction( - "user_ips_dups_remove", remove, last_seen_progress, last_seen + "user_ips_dups_remove", remove, begin_last_seen, end_last_seen ) if last: yield self._end_background_update("user_ips_remove_dupes") From c658425e6f0683031f34807f8bc3dcc138d66fbe Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 22 Jan 2019 13:34:10 +0000 Subject: [PATCH 2/4] Newsfile --- changelog.d/4434.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4434.misc diff --git a/changelog.d/4434.misc b/changelog.d/4434.misc new file mode 100644 index 000000000000..047061ed3c71 --- /dev/null +++ b/changelog.d/4434.misc @@ -0,0 +1 @@ +Apply a unique index to the user_ips table, preventing duplicates. From 1c9704f8ab72047a83c6a3b364f3693b332434f2 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 22 Jan 2019 16:20:33 +0000 Subject: [PATCH 3/4] Don't shadow params --- synapse/storage/client_ips.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index 9548cfd3f8f9..c485211175ac 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -151,7 +151,7 @@ def get_last_seen(txn): else: last = False - def remove(txn, begin_last_seen, end_last_seen): + def remove(txn): # This works by looking at all entries in the given time span, and # then for each (user_id, access_token, ip) tuple in that range # checking for any duplicates in the rest of the table (via a join). @@ -204,9 +204,8 @@ def remove(txn, begin_last_seen, end_last_seen): txn, "user_ips_remove_dupes", {"last_seen": end_last_seen} ) - yield self.runInteraction( - "user_ips_dups_remove", remove, begin_last_seen, end_last_seen - ) + yield self.runInteraction("user_ips_dups_remove", remove) + if last: yield self._end_background_update("user_ips_remove_dupes") From 7f503f83b92150edeb4cc5ae98f6d9bb2e9cdb49 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 22 Jan 2019 16:31:05 +0000 Subject: [PATCH 4/4] Refactor to rewrite the SQL instead --- synapse/storage/client_ips.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/synapse/storage/client_ips.py b/synapse/storage/client_ips.py index c485211175ac..78721a941a22 100644 --- a/synapse/storage/client_ips.py +++ b/synapse/storage/client_ips.py @@ -140,16 +140,8 @@ def get_last_seen(txn): "user_ips_dups_get_last_seen", get_last_seen ) - if end_last_seen is None: - # If we get a None then we're reaching the end and just need to - # delete the last batch. - last = True - - # We fake not having an upper bound by using a future date, by - # just multiplying the current time by two.... - last_seen = int(self.clock.time_msec()) * 2 - else: - last = False + # If it returns None, then we're processing the last batch + last = end_last_seen is None def remove(txn): # This works by looking at all entries in the given time span, and @@ -160,6 +152,16 @@ def remove(txn): # all other duplicates. # It is efficient due to the existence of (user_id, access_token, # ip) and (last_seen) indices. + + # Define the search space, which requires handling the last batch in + # a different way + if last: + clause = "? <= last_seen" + args = (begin_last_seen,) + else: + clause = "? <= last_seen AND last_seen < ?" + args = (begin_last_seen, end_last_seen) + txn.execute( """ SELECT user_id, access_token, ip, @@ -167,13 +169,14 @@ def remove(txn): FROM ( SELECT user_id, access_token, ip FROM user_ips - WHERE ? <= last_seen AND last_seen < ? + WHERE {} ORDER BY last_seen ) c INNER JOIN user_ips USING (user_id, access_token, ip) GROUP BY user_id, access_token, ip - HAVING count(*) > 1""", - (begin_last_seen, end_last_seen) + HAVING count(*) > 1 + """.format(clause), + args ) res = txn.fetchall()