This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Add configuration of the backoff algorithm for the federation client #15754

Merged 17 commits on Aug 3, 2023.
1 change: 1 addition & 0 deletions changelog.d/15754.misc
@@ -0,0 +1 @@
Allow for the configuration of the backoff algorithm for federation destinations.
11 changes: 11 additions & 0 deletions docs/usage/configuration/config_documentation.md
@@ -1212,6 +1212,14 @@ like sending a federation transaction.
* `max_short_retries`: maximum number of retries for the short retry algorithm. Defaults to 3 attempts.
* `max_long_retries`: maximum number of retries for the long retry algorithm. Defaults to 10 attempts.

The following options configure the backoff parameters used when a specific destination is unreachable.
Unlike the options above, these values apply across all requests to that destination,
and the backoff state is stored in the database.

* `destination_min_retry_interval`: the initial backoff, applied after the first request to a destination fails. Defaults to 10m.
* `destination_retry_multiplier`: how much the backoff is multiplied by after each subsequent failure. Defaults to 2.
* `destination_max_retry_interval`: a cap on the backoff. Defaults to one day.

Example configuration:
```yaml
federation:
@@ -1220,6 +1228,9 @@ federation:
max_long_retry_delay: 100s
max_short_retries: 5
max_long_retries: 20
destination_min_retry_interval: 30s
destination_retry_multiplier: 5
destination_max_retry_interval: 12h
```
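
For intuition, here is a minimal sketch (illustrative names, not Synapse code) of how these three settings shape the per-destination retry schedule; the jitter applied by the real implementation is omitted:

```python
def destination_backoff_schedule(
    min_interval_ms: int, multiplier: float, max_interval_ms: int, failures: int
) -> list[int]:
    """Illustrative only: successive retry intervals after repeated failures."""
    intervals = []
    interval = min_interval_ms
    for _ in range(failures):
        intervals.append(interval)
        # Each further failure multiplies the interval, capped at the maximum.
        interval = min(int(interval * multiplier), max_interval_ms)
    return intervals

# With the example configuration above (30s, x5, capped at 12h):
print(destination_backoff_schedule(30_000, 5, 12 * 3600 * 1000, 6))
# [30000, 150000, 750000, 3750000, 18750000, 43200000]
```

Each further failure multiplies the previous interval until the configured cap is reached.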
---
## Caching
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -369,7 +369,7 @@ furo = ">=2022.12.7,<2024.0.0"
# system changes.
# We are happy to raise these upper bounds upon request,
# provided we check that it's safe to do so (i.e. that CI passes).
requires = ["poetry-core>=1.1.0,<=1.6.0", "setuptools_rust>=1.3,<=1.6.0"]
requires = ["poetry-core>=1.1.0,<=1.6.0", "setuptools_rust>=1.3,<=1.6.0", "wheel"]
build-backend = "poetry.core.masonry.api"


14 changes: 14 additions & 0 deletions synapse/config/federation.py
@@ -65,5 +65,19 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
self.max_long_retries = federation_config.get("max_long_retries", 10)
self.max_short_retries = federation_config.get("max_short_retries", 3)

# Allow for the configuration of the backoff algorithm used
# when trying to reach an unavailable destination.
# Unlike the previous configuration options, these values apply
# across multiple requests and the backoff state is stored in the database.
self.destination_min_retry_interval_ms = Config.parse_duration(
federation_config.get("destination_min_retry_interval", "10m")
)
self.destination_retry_multiplier = federation_config.get(
"destination_retry_multiplier", 2
)
self.destination_max_retry_interval_ms = Config.parse_duration(
federation_config.get("destination_max_retry_interval", "1d")
)


_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
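
As a rough illustration of the values this produces (assuming `Config.parse_duration` converts duration strings such as `"10m"` and `"1d"` into milliseconds, as the `_ms` attribute suffixes suggest), an unconfigured server would end up with:

```python
# Illustrative defaults only; the real values come from Config.parse_duration.
destination_min_retry_interval_ms = 10 * 60 * 1000        # "10m" -> 600_000 ms
destination_retry_multiplier = 2
destination_max_retry_interval_ms = 24 * 60 * 60 * 1000   # "1d"  -> 86_400_000 ms
```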
29 changes: 16 additions & 13 deletions synapse/util/retryutils.py
@@ -27,15 +27,6 @@

logger = logging.getLogger(__name__)

# the initial backoff, after the first transaction fails
MIN_RETRY_INTERVAL = 10 * 60 * 1000

# how much we multiply the backoff by after each subsequent fail
RETRY_MULTIPLIER = 5

# a cap on the backoff. (Essentially none)
MAX_RETRY_INTERVAL = 2**62


class NotRetryingDestination(Exception):
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@@ -169,6 +160,16 @@ def __init__(
self.notifier = notifier
self.replication_client = replication_client

self.destination_min_retry_interval_ms = (
self.store.hs.config.federation.destination_min_retry_interval_ms
)
self.destination_retry_multiplier = (
self.store.hs.config.federation.destination_retry_multiplier
)
self.destination_max_retry_interval_ms = (
self.store.hs.config.federation.destination_max_retry_interval_ms
)

def __enter__(self) -> None:
pass

@@ -220,13 +221,15 @@ def __exit__(
# We couldn't connect.
if self.retry_interval:
self.retry_interval = int(
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
self.retry_interval
* self.destination_retry_multiplier
* random.uniform(0.8, 1.4)
)

if self.retry_interval >= MAX_RETRY_INTERVAL:
self.retry_interval = MAX_RETRY_INTERVAL
if self.retry_interval >= self.destination_max_retry_interval_ms:
self.retry_interval = self.destination_max_retry_interval_ms
else:
self.retry_interval = MIN_RETRY_INTERVAL
self.retry_interval = self.destination_min_retry_interval_ms

logger.info(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
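
Restated outside the diff, the updated backoff step in `__exit__` amounts to the following standalone sketch (function and parameter names are illustrative; the real code mutates `self.retry_interval` and persists the backoff state in the database):

```python
import random

def next_retry_interval_ms(
    current_interval_ms: int,
    min_interval_ms: int,
    multiplier: float,
    max_interval_ms: int,
) -> int:
    """Sketch of the per-destination backoff update after a failed request."""
    if current_interval_ms:
        # Grow the interval, applying jitter between 0.8x and 1.4x, then cap it.
        grown = int(current_interval_ms * multiplier * random.uniform(0.8, 1.4))
        return min(grown, max_interval_ms)
    # First failure for this destination: start from the configured minimum.
    return min_interval_ms
```

With the defaults (`10m` minimum, multiplier `2`, `1d` cap), a destination that keeps failing is retried after roughly 10m, 20m, 40m, and so on, until the one-day cap is reached (jitter aside).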
6 changes: 4 additions & 2 deletions tests/storage/test_transactions.py
@@ -17,7 +17,6 @@
from synapse.server import HomeServer
from synapse.storage.databases.main.transactions import DestinationRetryTimings
from synapse.util import Clock
from synapse.util.retryutils import MAX_RETRY_INTERVAL

from tests.unittest import HomeserverTestCase

@@ -57,8 +56,11 @@ def test_initial_set_transactions(self) -> None:
self.get_success(d)

def test_large_destination_retry(self) -> None:
max_retry_interval = (
self.hs.config.federation.destination_max_retry_interval_ms * 1000
)
d = self.store.set_destination_retry_timings(
"example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
"example.com", max_retry_interval, max_retry_interval, max_retry_interval
)
self.get_success(d)

20 changes: 9 additions & 11 deletions tests/util/test_retryutils.py
@@ -11,12 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.util.retryutils import (
MIN_RETRY_INTERVAL,
RETRY_MULTIPLIER,
NotRetryingDestination,
get_retry_limiter,
)
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

from tests.unittest import HomeserverTestCase

@@ -42,6 +37,9 @@ def test_limiter(self) -> None:

limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

min_retry_interval = self.hs.config.federation.destination_min_retry_interval_ms
retry_multiplier = self.hs.config.federation.destination_retry_multiplier

self.pump(1)
try:
with limiter:
@@ -57,7 +55,7 @@
assert new_timings is not None
self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, failure_ts)
self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
self.assertEqual(new_timings.retry_interval, min_retry_interval)

# now if we try again we should get a failure
self.get_failure(
@@ -68,7 +66,7 @@
# advance the clock and try again
#

self.pump(MIN_RETRY_INTERVAL)
self.pump(min_retry_interval)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

self.pump(1)
@@ -87,16 +85,16 @@ def test_limiter(self) -> None:
self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, retry_ts)
self.assertGreaterEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
new_timings.retry_interval, min_retry_interval * retry_multiplier * 0.5
)
self.assertLessEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
new_timings.retry_interval, min_retry_interval * retry_multiplier * 2.0
)

#
# one more go, with success
#
self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
self.reactor.advance(min_retry_interval * retry_multiplier * 2.0)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

self.pump(1)