diff --git a/config/init.go b/config/init.go index e91b248718c..621ff95f353 100644 --- a/config/init.go +++ b/config/init.go @@ -110,6 +110,10 @@ const DefaultConnMgrGracePeriod = time.Second * 20 // type. const DefaultConnMgrType = "basic" +// DefaultResourceMgrMinInboundConns is a MAGIC number that probably a good +// enough number of inbound conns to be a good network citizen. +const DefaultResourceMgrMinInboundConns = 800 + func addressesConfig() Addresses { return Addresses{ Swarm: []string{ diff --git a/core/node/libp2p/rcmgr_defaults.go b/core/node/libp2p/rcmgr_defaults.go index d3c2942584c..4d578a9b650 100644 --- a/core/node/libp2p/rcmgr_defaults.go +++ b/core/node/libp2p/rcmgr_defaults.go @@ -186,5 +186,24 @@ Run 'ipfs swarm limit all' to see the resulting limits. defaultLimitConfig := scalingLimitConfig.Scale(int64(maxMemory), int(numFD)) + // Simple checks to overide autoscaling ensuring limits make sense versus the connmgr values. + // There are ways to break this, but this should catch most problems already. + // We might improve this in the future. + // See: https://github.com/ipfs/kubo/issues/9545 + if cfg.ConnMgr.Type.WithDefault(config.DefaultConnMgrType) != "none" { + maxInboundConns := int64(defaultLimitConfig.System.ConnsInbound) + if connmgrHighWaterTimesTwo := cfg.ConnMgr.HighWater.WithDefault(config.DefaultConnMgrHighWater) * 2; maxInboundConns < connmgrHighWaterTimesTwo { + maxInboundConns = connmgrHighWaterTimesTwo + } + + if maxInboundConns < config.DefaultResourceMgrMinInboundConns { + maxInboundConns = config.DefaultResourceMgrMinInboundConns + } + + // Scale System.StreamsInbound as well, but use the existing ratio of StreamsInbound to ConnsInbound + defaultLimitConfig.System.StreamsInbound = int(maxInboundConns * int64(defaultLimitConfig.System.StreamsInbound) / int64(defaultLimitConfig.System.ConnsInbound)) + defaultLimitConfig.System.ConnsInbound = int(maxInboundConns) + } + return defaultLimitConfig, nil } diff --git a/docs/libp2p-resource-management.md b/docs/libp2p-resource-management.md index 83c44251d98..d6b782da1f9 100644 --- a/docs/libp2p-resource-management.md +++ b/docs/libp2p-resource-management.md @@ -40,19 +40,19 @@ libp2p's resource manager provides tremendous flexibility but also adds complexi 1. "The user who does nothing" - In this case Kubo attempts to give some sane defaults discussed below based on the amount of memory and file descriptors their system has. This should protect the node from many attacks. - + 1. "Slightly more advanced user" - They can tweak the default limits discussed below. Where the defaults aren't good enough, a good set of higher-level "knobs" are exposed to satisfy most use cases without requiring users to wade into all the intricacies of libp2p's resource manager. - The "knobs"/inputs are `Swarm.ResourceMgr.MaxMemory` and `Swarm.ResourceMgr.MaxFileDescriptors` as described below. + The "knobs"/inputs are `Swarm.ResourceMgr.MaxMemory` and `Swarm.ResourceMgr.MaxFileDescriptors` as described below. 1. "Power user" - They specify overrides to computed default limits via `ipfs swarm limit` and `Swarm.ResourceMgr.Limits`; ### Computed Default Limits With the `Swarm.ResourceMgr.MaxMemory` and `Swarm.ResourceMgr.MaxFileDescriptors` inputs defined, -[resource manager limits](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#limits) are created at the -[system](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#the-system-scope), -[transient](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#the-transient-scope), +[resource manager limits](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#limits) are created at the +[system](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#the-system-scope), +[transient](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#the-transient-scope), and [peer](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#peer-scopes) scopes. Other scopes are ignored (by being set to "[~infinity](#infinite-limits])". @@ -68,11 +68,15 @@ The reason these scopes are chosen is because: (e.g., bug in a peer which is causing it to "misbehave"). In the unintional case, we want to make sure a "misbehaving" node doesn't consume more resources than necessary. -Within these scopes, limits are just set on -[memory](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#memory), +Within these scopes, limits are just set on +[memory](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#memory), [file descriptors (FD)](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#file-descriptors), [*inbound* connections](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#connections), and [*inbound* streams](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#streams). Limits are set based on the `Swarm.ResourceMgr.MaxMemory` and `Swarm.ResourceMgr.MaxFileDescriptors` inputs above. + +There are also some special cases where minimum values are enforced. +For example, Kubo maintainers have found in practice that it's a footgun to have too low of a value for `Swarm.ResourceMgr.Limits.System.ConnsInbound` and a default minimum is used. (See [core/node/libp2p/rcmgr_defaults.go](https://github.com/ipfs/kubo/blob/master/core/node/libp2p/rcmgr_defaults.go) for specifics.) + We trust this node to behave properly and thus don't limit *outbound* connection/stream limits. We apply any limits that libp2p has for its protocols/services since we assume libp2p knows best here. @@ -139,13 +143,17 @@ There is a go-libp2p issue ([#1928](https://github.com/libp2p/go-libp2p/issues/1 ### How does the resource manager (ResourceMgr) relate to the connection manager (ConnMgr)? As discussed [here](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#connmanager-vs-resource-manager) these are separate systems in go-libp2p. -Kubo also configures the ConnMgr separately from ResourceMgr. There is no checking to make sure the limits between the systems are congruent. +Kubo performs sanity checks to ensure that some of the hard limits of the ResourceMgr are sufficiently greater than the soft limits of the ConnMgr. -Ideally `Swarm.ConnMgr.HighWater` is less than `Swarm.ResourceMgr.Limits.System.ConnsInbound`. -This is so the ConnMgr can kick in and cleanup connections based on connection priorities before the hard limits of the ResourceMgr are applied. +The soft limit of `Swarm.ConnMgr.HighWater` needs to be less than the hard limit `Swarm.ResourceMgr.Limits.System.ConnsInbound` for the configuration to make sense. +This ensures the ConnMgr cleans up connections based on connection priorities before the hard limits of the ResourceMgr are applied. If `Swarm.ConnMgr.HighWater` is greater than `Swarm.ResourceMgr.Limits.System.ConnsInbound`, existing low priority idle connections can prevent new high priority connections from being established. -The ResourceMgr doesn't know that the new connection is high priority and simply blocks it because of the limit its enforcing. +The ResourceMgr doesn't know that the new connection is high priority and simply blocks it because of the limit its enforcing. + +To ensure the ConnMgr and ResourceMgr are congruent, the ResourceMgr [computed default limts](#computed-default-limits) are adjusted such that: +1. `Swarm.ResourceMgr.Limits.System.ConnsInbound` >= `max(Swarm.ConnMgr.HighWater * 2, 800)` AND +2. `Swarm.ResourceMgr.Limits.System.StreamsInbound` is greater than any new/adjusted `Swarm.ResourceMgr.Limits.System.ConnsInbound` value so that there's enough streams per connection. ### How does one see the Active Limits? A dump of what limits are actually being used by the resource manager ([Computed Default Limits](#computed-default-limits) + [User Supplied Override Limits](#user-supplied-override-limits)) @@ -156,9 +164,9 @@ This can be observed with an empty [`Swarm.ResourceMgr.Limits`](https://github.c and then [seeing the active limits](#how-does-one-see-the-active-limits). ### How does one monitor libp2p resource usage? -For [monitoring libp2p resource usage](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#monitoring), +For [monitoring libp2p resource usage](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager#monitoring), various `*rcmgr_*` metrics can be accessed as the prometheus endpoint at `{Addresses.API}/debug/metrics/prometheus` (default: `http://127.0.0.1:5001/debug/metrics/prometheus`). -There are also [pre-built Grafana dashboards](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager/obs/grafana-dashboards) that can be added to a Grafana instance. +There are also [pre-built Grafana dashboards](https://github.com/libp2p/go-libp2p/tree/master/p2p/host/resource-manager/obs/grafana-dashboards) that can be added to a Grafana instance. A textual view of current resource usage and a list of services, protocols, and peers can be obtained via `ipfs swarm stats --help` diff --git a/test/sharness/t0139-swarm-rcmgr.sh b/test/sharness/t0139-swarm-rcmgr.sh index 69c5e4600a6..1b870abb787 100755 --- a/test/sharness/t0139-swarm-rcmgr.sh +++ b/test/sharness/t0139-swarm-rcmgr.sh @@ -40,9 +40,35 @@ test_expect_success 'disconnected: swarm stats requires running daemon' ' test_should_contain "missing ResourceMgr" actual ' +# test sanity scaling +test_expect_success 'set very high connmgr highwater' ' + ipfs config --json Swarm.ConnMgr.HighWater 1000 +' + +test_launch_ipfs_daemon + +test_expect_success 'conns and streams are above 2000' ' + ipfs swarm limit system --enc=json | tee json && + [ "$(jq -r .ConnsInbound < json)" -ge 2000 ] && + [ "$(jq -r .StreamsInbound < json)" -ge 2000 ] +' + +test_kill_ipfs_daemon + +test_expect_success 'set previous connmgr highwater' ' + ipfs config --json Swarm.ConnMgr.HighWater 96 +' + +test_launch_ipfs_daemon + +test_expect_success 'conns and streams are above 800' ' + ipfs swarm limit system --enc=json | tee json && + [ "$(jq -r .ConnsInbound < json)" -ge 800 ] && + [ "$(jq -r .StreamsInbound < json)" -ge 800 ] +' + # swarm limit|stats should succeed in online mode by default # because Resource Manager is opt-out -test_launch_ipfs_daemon # every scope has the same fields, so we only inspect System test_expect_success 'ResourceMgr enabled: swarm limit' '