From 79627b3a3c93b2166de679de0f4e863094e18460 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 28 Apr 2021 14:51:31 +0100 Subject: [PATCH 1/6] Limit how often GC happens by time. Synapse can be quite memory intensive, and unless care is taken to tune the GC thresholds it can end up thrashing, causing noticable performance problems for large servers. We fix this by limiting how often we GC a given generation, regardless of current counts/thresholds. This does not help with the reverse problem where the thresholds are set too high, but that should only happen in situations where they've been manually configured. Adds a `gc_min_seconds_between` config option to override the defaults. Fixes #9890. --- docs/sample_config.yaml | 8 ++++++++ synapse/app/generic_worker.py | 3 +++ synapse/app/homeserver.py | 3 +++ synapse/config/server.py | 9 +++++++++ synapse/metrics/__init__.py | 18 ++++++++++++++++-- 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index e0350279ad39..ebf364cf40a9 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -152,6 +152,14 @@ presence: # #gc_thresholds: [700, 10, 10] +# The minimum time in seconds between each GC for a generation, regardless of +# the GC thresholds. This ensures that we don't do GC too frequently. +# +# A value of `[1, 10, 30]` indicates that a second must pass between consecutive +# generation 0 GCs, etc. +# +# gc_min_seconds_between: [1, 10, 30] + # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. # diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 1a15ceee8112..a3fe9a3f381b 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -455,6 +455,9 @@ def start(config_options): synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts + if config.server.gc_seconds: + synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds + hs = GenericWorkerServer( config.server_name, config=config, diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index 8e78134bbedc..6a823da10d55 100644 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -342,6 +342,9 @@ def setup(config_options): events.USE_FROZEN_DICTS = config.use_frozen_dicts + if config.server.gc_seconds: + synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds + hs = SynapseHomeServer( config.server_name, config=config, diff --git a/synapse/config/server.py b/synapse/config/server.py index 21ca7b33e38d..ca1c9711f86e 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -572,6 +572,7 @@ def read_config(self, config, **kwargs): _warn_if_webclient_configured(self.listeners) self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None)) + self.gc_seconds = read_gc_thresholds(config.get("gc_min_seconds_between", None)) @attr.s class LimitRemoteRoomsConfig: @@ -917,6 +918,14 @@ def generate_config_section( # #gc_thresholds: [700, 10, 10] + # The minimum time in seconds between each GC for a generation, regardless of + # the GC thresholds. This ensures that we don't do GC too frequently. + # + # A value of `[1, 10, 30]` indicates that a second must pass between consecutive + # generation 0 GCs, etc. + # + # gc_min_seconds_between: [1, 10, 30] + # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. # diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 31b7b3c256a7..6de91d826c23 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -535,6 +535,13 @@ def collect(self): REGISTRY.register(ReactorLastSeenMetric()) +# The minimum time in seconds between GCs for each generation, regardless of the current GC +# thresholds and counts. +MIN_TIME_BETWEEN_GCS = [1, 10, 30] + +# The time in seconds of the last time we did a GC for each generation. +_last_gc = [0, 0, 0] + def runUntilCurrentTimer(reactor, func): @functools.wraps(func) @@ -575,11 +582,16 @@ def f(*args, **kwargs): return ret # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. threshold = gc.get_threshold() counts = gc.get_count() for i in (2, 1, 0): - if threshold[i] < counts[i]: + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. + if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: if i == 0: logger.debug("Collecting gc %d", i) else: @@ -589,6 +601,8 @@ def f(*args, **kwargs): unreachable = gc.collect(i) end = time.time() + _last_gc[i] = int(end) + gc_time.labels(i).observe(end - start) gc_unreachable.labels(i).set(unreachable) From 351f886bc8221b99419b9416a57d8a0a71ee5172 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 28 Apr 2021 14:55:46 +0100 Subject: [PATCH 2/6] Newsfile --- changelog.d/9902.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/9902.feature diff --git a/changelog.d/9902.feature b/changelog.d/9902.feature new file mode 100644 index 000000000000..4d9f324d4e29 --- /dev/null +++ b/changelog.d/9902.feature @@ -0,0 +1 @@ +Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set. From d3a6e38c96d95f5c6fa1dc41401ad09d293c218c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 May 2021 10:40:18 +0100 Subject: [PATCH 3/6] Apply suggestions from code review Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> --- synapse/config/server.py | 2 +- synapse/metrics/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/config/server.py b/synapse/config/server.py index ca1c9711f86e..e95925d1abe4 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -924,7 +924,7 @@ def generate_config_section( # A value of `[1, 10, 30]` indicates that a second must pass between consecutive # generation 0 GCs, etc. # - # gc_min_seconds_between: [1, 10, 30] + #gc_min_seconds_between: [1, 10, 30] # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 6de91d826c23..93662fa134c3 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -539,7 +539,7 @@ def collect(self): # thresholds and counts. MIN_TIME_BETWEEN_GCS = [1, 10, 30] -# The time in seconds of the last time we did a GC for each generation. +# The time (in seconds since the epoch) of the last time we did a GC for each generation. _last_gc = [0, 0, 0] From bd04fb63085bc46282e88bad3296b1225425fd4d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 May 2021 10:47:32 +0100 Subject: [PATCH 4/6] Code review --- synapse/config/server.py | 23 ++++++++++++++++++++--- synapse/metrics/__init__.py | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/synapse/config/server.py b/synapse/config/server.py index e95925d1abe4..b639bd3144e2 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -572,7 +572,7 @@ def read_config(self, config, **kwargs): _warn_if_webclient_configured(self.listeners) self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None)) - self.gc_seconds = read_gc_thresholds(config.get("gc_min_seconds_between", None)) + self.gc_seconds = self.read_gc_intervals(config.get("gc_min_interval", None)) @attr.s class LimitRemoteRoomsConfig: @@ -921,10 +921,10 @@ def generate_config_section( # The minimum time in seconds between each GC for a generation, regardless of # the GC thresholds. This ensures that we don't do GC too frequently. # - # A value of `[1, 10, 30]` indicates that a second must pass between consecutive + # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive # generation 0 GCs, etc. # - #gc_min_seconds_between: [1, 10, 30] + #gc_min_interval: [0.5s, 30s, 1m] # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. @@ -1314,6 +1314,23 @@ def add_arguments(parser): help="Turn on the twisted telnet manhole service on the given port.", ) + def read_gc_intervals(self, durations): + """Reads the three durations for the GC min interval option.""" + if durations is None: + return None + try: + if len(durations) != 3: + raise ValueError() + return ( + self.parse_duration(durations[0]), + self.parse_duration(durations[1]), + self.parse_duration(durations[2]), + ) + except Exception: + raise ConfigError( + "Value of `gc_min_interval` must be a list of three durations if set" + ) + def is_threepid_reserved(reserved_threepids, threepid): """Check the threepid against the reserved threepid config diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 93662fa134c3..149ae3002a79 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -537,7 +537,7 @@ def collect(self): # The minimum time in seconds between GCs for each generation, regardless of the current GC # thresholds and counts. -MIN_TIME_BETWEEN_GCS = [1, 10, 30] +MIN_TIME_BETWEEN_GCS = (1, 10, 30) # The time (in seconds since the epoch) of the last time we did a GC for each generation. _last_gc = [0, 0, 0] From 43c9acda4c873d2f3b0872509ea0431d3e5ed1ff Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 May 2021 11:49:13 +0100 Subject: [PATCH 5/6] Config --- docs/sample_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index ebf364cf40a9..94b28fecaf7c 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -155,10 +155,10 @@ presence: # The minimum time in seconds between each GC for a generation, regardless of # the GC thresholds. This ensures that we don't do GC too frequently. # -# A value of `[1, 10, 30]` indicates that a second must pass between consecutive +# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive # generation 0 GCs, etc. # -# gc_min_seconds_between: [1, 10, 30] +#gc_min_interval: [0.5s, 30s, 1m] # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. From b5169b68e90fe494d0137e4676af929b85594a50 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 May 2021 14:23:02 +0100 Subject: [PATCH 6/6] Document default. Add type annotations. Correctly convert to seconds --- docs/sample_config.yaml | 2 ++ synapse/config/server.py | 15 +++++++++------ synapse/metrics/__init__.py | 6 +++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 94b28fecaf7c..a1dacefbd7f7 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -158,6 +158,8 @@ presence: # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive # generation 0 GCs, etc. # +# Defaults to `[1s, 10s, 30s]`. +# #gc_min_interval: [0.5s, 30s, 1m] # Set the limit on the returned events in the timeline in the get diff --git a/synapse/config/server.py b/synapse/config/server.py index b639bd3144e2..c290a35a9285 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -19,7 +19,7 @@ import os.path import re from textwrap import indent -from typing import Any, Dict, Iterable, List, Optional, Set +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple import attr import yaml @@ -924,6 +924,8 @@ def generate_config_section( # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive # generation 0 GCs, etc. # + # Defaults to `[1s, 10s, 30s]`. + # #gc_min_interval: [0.5s, 30s, 1m] # Set the limit on the returned events in the timeline in the get @@ -1314,17 +1316,18 @@ def add_arguments(parser): help="Turn on the twisted telnet manhole service on the given port.", ) - def read_gc_intervals(self, durations): - """Reads the three durations for the GC min interval option.""" + def read_gc_intervals(self, durations) -> Optional[Tuple[float, float, float]]: + """Reads the three durations for the GC min interval option, returning seconds.""" if durations is None: return None + try: if len(durations) != 3: raise ValueError() return ( - self.parse_duration(durations[0]), - self.parse_duration(durations[1]), - self.parse_duration(durations[2]), + self.parse_duration(durations[0]) / 1000, + self.parse_duration(durations[1]) / 1000, + self.parse_duration(durations[2]) / 1000, ) except Exception: raise ConfigError( diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 149ae3002a79..e671da26d51d 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -537,10 +537,10 @@ def collect(self): # The minimum time in seconds between GCs for each generation, regardless of the current GC # thresholds and counts. -MIN_TIME_BETWEEN_GCS = (1, 10, 30) +MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) # The time (in seconds since the epoch) of the last time we did a GC for each generation. -_last_gc = [0, 0, 0] +_last_gc = [0.0, 0.0, 0.0] def runUntilCurrentTimer(reactor, func): @@ -601,7 +601,7 @@ def f(*args, **kwargs): unreachable = gc.collect(i) end = time.time() - _last_gc[i] = int(end) + _last_gc[i] = end gc_time.labels(i).observe(end - start) gc_unreachable.labels(i).set(unreachable)