From 9d288b5b430f7aa5cbd3f60acd2862f2b402c398 Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Mon, 22 Apr 2024 22:36:03 +0200 Subject: [PATCH 1/3] libnet/i/defaultipam: introduce a linear allocator The previous allocator was subnetting address pools eagerly when the daemon started, and would then just iterate over that list whenever RequestPool was called. This was leading to high memory usage whenever IPv6 pools were configured with a target subnet size too different from the pools prefix size. For instance: pool = fd00::/8, target size = /64 -- 2 ^ (64-8) subnets would be generated upfront. This would take approx. 9 * 10^18 bits -- way too much for any human computer in 2024. Another noteworthy issue, the previous implementation was allocating a subnet, and then in another layer was checking whether the allocation was conflicting with some 'reserved networks'. If so, the allocation would be retried, etc... To make it worse, 'reserved networks' would be recomputed on every iteration. This is totally ineffective as there could be 'reserved networks' that fully overlap a given address pool (or many!). To fix this issue, a new field `Exclude` is added to `RequestPool`. It's up to each driver to take it into account. Since we don't know whether this retry loop is useful for some remote IPAM driver, it's reimplemented bug-for-bug directly in the remote driver. The new allocator uses a linear-search algorithm. It takes advantage of all lists (predefined pools, allocated subnets and reserved networks) being sorted and logically combines 'allocated' and 'reserved' through a 'double cursor' to iterate on both lists at the same time while preserving the total order. At the same time, it iterates over 'predefined' pools and looks for the first empty space that would be a good fit. Currently, the size of the allocated subnet is still dictated by each 'predefined' pools. We should consider hardcoding that size instead, and let users specify what subnet size they want. This wasn't possible before as the subnets were generated upfront. This new allocator should be able to deal with this easily. The method used for static allocation has been updated to make sure the ascending order of 'allocated' is preserved. It's bug-for-bug compatible with the previous implementation. One consequence of this new algorithm is that we don't keep track of where the last allocation happened, we just allocate the first free subnet we find. Before: - Allocate: 10.0.1.0/24, 10.0.2.0/24 ; Deallocate: 10.0.1.0/24 ; Allocate 10.0.3.0/24. Now, the 3rd allocation would yield 10.0.1.0/24 once again. As it doesn't change the semantics of the allocator, there's no reason to worry about that. Finally, about 'reserved networks'. The heuristics we use are now properly documented. It was discovered that we don't check routes for IPv6 allocations -- this can't be changed because there's no such thing as on-link routes for IPv6. (Kudos to Rob Murray for coming up with the linear-search idea.) Signed-off-by: Albin Kerouanton --- daemon/config/config_test.go | 7 +- daemon/info.go | 2 +- libnetwork/cnmallocator/drivers_ipam.go | 8 +- .../drivers/bridge/bridge_linux_test.go | 26 +- libnetwork/internal/netiputil/netiputil.go | 29 ++ .../internal/netiputil/netiputil_test.go | 46 +++ libnetwork/ipamapi/contract.go | 5 + libnetwork/ipams/defaultipam/address_space.go | 278 ++++++++++--- .../ipams/defaultipam/address_space_test.go | 389 ++++++++++++++++++ libnetwork/ipams/defaultipam/allocator.go | 50 +-- .../ipams/defaultipam/allocator_test.go | 59 ++- libnetwork/ipams/defaultipam/parallel_test.go | 91 ++-- libnetwork/ipams/defaultipam/structures.go | 53 +++ .../ipams/defaultipam/structures_test.go | 32 ++ libnetwork/ipams/remote/remote.go | 75 +++- libnetwork/ipamutils/utils.go | 103 ++--- libnetwork/ipamutils/utils_test.go | 74 ---- libnetwork/ipbits/ipbits.go | 21 + libnetwork/ipbits/ipbits_test.go | 29 ++ libnetwork/ipbits/uint128.go | 6 + libnetwork/netutils/utils.go | 29 -- libnetwork/netutils/utils_freebsd.go | 13 +- libnetwork/netutils/utils_linux.go | 108 +++-- libnetwork/netutils/utils_linux_test.go | 220 ++-------- libnetwork/netutils/utils_windows.go | 13 +- libnetwork/network.go | 80 +--- libnetwork/resolvconf/resolvconf.go | 12 +- libnetwork/resolvconf/resolvconf_unix_test.go | 32 +- opts/address_pools.go | 7 +- 29 files changed, 1201 insertions(+), 696 deletions(-) create mode 100644 libnetwork/internal/netiputil/netiputil_test.go create mode 100644 libnetwork/ipams/defaultipam/address_space_test.go create mode 100644 libnetwork/ipams/defaultipam/structures_test.go delete mode 100644 libnetwork/ipamutils/utils_test.go diff --git a/daemon/config/config_test.go b/daemon/config/config_test.go index c2c974b909c23..5d365915ecd51 100644 --- a/daemon/config/config_test.go +++ b/daemon/config/config_test.go @@ -2,6 +2,7 @@ package config // import "github.com/docker/docker/daemon/config" import ( "encoding/json" + "net/netip" "os" "path/filepath" "reflect" @@ -157,7 +158,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) { emptyConfigFile := makeConfigFile(t, `{}`) configFile := makeConfigFile(t, `{"default-address-pools":[{"base": "10.123.0.0/16", "size": 24 }]}`) - expected := []*ipamutils.NetworkToSplit{{Base: "10.123.0.0/16", Size: 24}} + expected := []*ipamutils.NetworkToSplit{{Base: netip.MustParsePrefix("10.123.0.0/16"), Size: 24}} t.Run("empty config file", func(t *testing.T) { conf := Config{} @@ -167,7 +168,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) { config, err := MergeDaemonConfigurations(&conf, flags, emptyConfigFile) assert.NilError(t, err) - assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected) + assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected, cmpopts.EquateComparable(netip.Prefix{})) }) t.Run("config file", func(t *testing.T) { @@ -177,7 +178,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) { config, err := MergeDaemonConfigurations(&conf, flags, configFile) assert.NilError(t, err) - assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected) + assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected, cmpopts.EquateComparable(netip.Prefix{})) }) t.Run("with conflicting options", func(t *testing.T) { diff --git a/daemon/info.go b/daemon/info.go index 8e56bd9a68028..e1a0b44bae379 100644 --- a/daemon/info.go +++ b/daemon/info.go @@ -258,7 +258,7 @@ func (daemon *Daemon) fillDefaultAddressPools(ctx context.Context, v *system.Inf defer span.End() for _, pool := range cfg.DefaultAddressPools.Value() { v.DefaultAddressPools = append(v.DefaultAddressPools, system.NetworkAddressPool{ - Base: pool.Base, + Base: pool.Base.String(), Size: pool.Size, }) } diff --git a/libnetwork/cnmallocator/drivers_ipam.go b/libnetwork/cnmallocator/drivers_ipam.go index 8d0608c846264..37b8ba7d9767d 100644 --- a/libnetwork/cnmallocator/drivers_ipam.go +++ b/libnetwork/cnmallocator/drivers_ipam.go @@ -2,6 +2,8 @@ package cnmallocator import ( "context" + "fmt" + "net/netip" "strconv" "strings" @@ -22,8 +24,12 @@ func initIPAMDrivers(r ipamapi.Registerer, netConfig *networkallocator.Config) e // happens with default address pool option if netConfig != nil { for _, p := range netConfig.DefaultAddrPool { + base, err := netip.ParsePrefix(p) + if err != nil { + return fmt.Errorf("invalid prefix %q: %w", p, err) + } addressPool = append(addressPool, &ipamutils.NetworkToSplit{ - Base: p, + Base: base, Size: int(netConfig.SubnetSize), }) str.WriteString(p + ",") diff --git a/libnetwork/drivers/bridge/bridge_linux_test.go b/libnetwork/drivers/bridge/bridge_linux_test.go index 8b027d8537842..2029a1b8fd09a 100644 --- a/libnetwork/drivers/bridge/bridge_linux_test.go +++ b/libnetwork/drivers/bridge/bridge_linux_test.go @@ -12,6 +12,9 @@ import ( "github.com/docker/docker/internal/testutils/netnsutils" "github.com/docker/docker/libnetwork/driverapi" + "github.com/docker/docker/libnetwork/internal/netiputil" + "github.com/docker/docker/libnetwork/ipamapi" + "github.com/docker/docker/libnetwork/ipams/defaultipam" "github.com/docker/docker/libnetwork/ipamutils" "github.com/docker/docker/libnetwork/iptables" "github.com/docker/docker/libnetwork/netlabel" @@ -195,16 +198,19 @@ func compareBindings(a, b []types.PortBinding) bool { } func getIPv4Data(t *testing.T) []driverapi.IPAMData { - ipd := driverapi.IPAMData{AddressSpace: "full"} - nw, err := netutils.FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks()) - if err != nil { - t.Fatal(err) - } - ipd.Pool = nw - // Set network gateway to X.X.X.1 - ipd.Gateway = types.GetIPNetCopy(nw) - ipd.Gateway.IP[len(ipd.Gateway.IP)-1] = 1 - return []driverapi.IPAMData{ipd} + t.Helper() + + a, _ := defaultipam.NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), nil) + alloc, err := a.RequestPool(ipamapi.PoolRequest{ + AddressSpace: "LocalDefault", + Exclude: netutils.InferReservedNetworks(false), + }) + assert.NilError(t, err) + + gw, _, err := a.RequestAddress(alloc.PoolID, nil, nil) + assert.NilError(t, err) + + return []driverapi.IPAMData{{AddressSpace: "LocalDefault", Pool: netiputil.ToIPNet(alloc.Pool), Gateway: gw}} } func getIPv6Data(t *testing.T) []driverapi.IPAMData { diff --git a/libnetwork/internal/netiputil/netiputil.go b/libnetwork/internal/netiputil/netiputil.go index 7c907569f537a..ae0c0a5373763 100644 --- a/libnetwork/internal/netiputil/netiputil.go +++ b/libnetwork/internal/netiputil/netiputil.go @@ -59,3 +59,32 @@ func AddrPortFromNet(addr net.Addr) netip.AddrPort { } return netip.AddrPort{} } + +// LastAddr returns the last address of prefix 'p'. +func LastAddr(p netip.Prefix) netip.Addr { + return ipbits.Add(p.Addr().Prev(), 1, uint(p.Addr().BitLen()-p.Bits())) +} + +// PrefixCompare two prefixes and return a negative, 0, or a positive integer as +// required by [slices.SortFunc]. When two prefixes with the same address is +// provided, the shortest one will be sorted first. +func PrefixCompare(a, b netip.Prefix) int { + cmp := a.Addr().Compare(b.Addr()) + if cmp != 0 { + return cmp + } + return a.Bits() - b.Bits() +} + +// PrefixAfter returns the prefix of size 'sz' right after 'prev'. +func PrefixAfter(prev netip.Prefix, sz int) netip.Prefix { + s := sz + if prev.Bits() < sz { + s = prev.Bits() + } + addr := ipbits.Add(prev.Addr(), 1, uint(prev.Addr().BitLen()-s)) + if addr.IsUnspecified() { + return netip.Prefix{} + } + return netip.PrefixFrom(addr, sz).Masked() +} diff --git a/libnetwork/internal/netiputil/netiputil_test.go b/libnetwork/internal/netiputil/netiputil_test.go new file mode 100644 index 0000000000000..5f6871c191096 --- /dev/null +++ b/libnetwork/internal/netiputil/netiputil_test.go @@ -0,0 +1,46 @@ +package netiputil + +import ( + "net/netip" + "testing" + + "gotest.tools/v3/assert" +) + +func TestLastAddr(t *testing.T) { + testcases := []struct { + p netip.Prefix + want netip.Addr + }{ + {netip.MustParsePrefix("10.0.0.0/24"), netip.MustParseAddr("10.0.0.255")}, + {netip.MustParsePrefix("10.0.0.0/8"), netip.MustParseAddr("10.255.255.255")}, + {netip.MustParsePrefix("fd00::/64"), netip.MustParseAddr("fd00::ffff:ffff:ffff:ffff")}, + {netip.MustParsePrefix("fd00::/16"), netip.MustParseAddr("fd00:ffff:ffff:ffff:ffff:ffff:ffff:ffff")}, + {netip.MustParsePrefix("ffff::/16"), netip.MustParseAddr("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")}, + } + + for _, tc := range testcases { + last := LastAddr(tc.p) + assert.Check(t, last == tc.want, "LastAddr(%q) = %s; want: %s", tc.p, last, tc.want) + } +} + +func TestPrefixAfter(t *testing.T) { + testcases := []struct { + prev netip.Prefix + sz int + want netip.Prefix + }{ + {netip.MustParsePrefix("10.0.10.0/24"), 24, netip.MustParsePrefix("10.0.11.0/24")}, + {netip.MustParsePrefix("10.0.10.0/24"), 16, netip.MustParsePrefix("10.1.0.0/16")}, + {netip.MustParsePrefix("10.10.0.0/16"), 24, netip.MustParsePrefix("10.11.0.0/24")}, + {netip.MustParsePrefix("2001:db8:feed:cafe:b000:dead::/96"), 16, netip.MustParsePrefix("2002::/16")}, + {netip.MustParsePrefix("ffff::/16"), 16, netip.Prefix{}}, + {netip.MustParsePrefix("2001:db8:1::/48"), 64, netip.MustParsePrefix("2001:db8:2::/64")}, + } + + for _, tc := range testcases { + next := PrefixAfter(tc.prev, tc.sz) + assert.Check(t, next == tc.want, "PrefixAfter(%q, %d) = %s; want: %s", tc.prev, tc.sz, next, tc.want) + } +} diff --git a/libnetwork/ipamapi/contract.go b/libnetwork/ipamapi/contract.go index 5b88327cb7e2f..2cf8f74ad35ed 100644 --- a/libnetwork/ipamapi/contract.go +++ b/libnetwork/ipamapi/contract.go @@ -35,6 +35,7 @@ var ( ErrIPOutOfRange = types.InvalidParameterErrorf("requested address is out of range") ErrPoolOverlap = types.ForbiddenErrorf("Pool overlaps with other one on this address space") ErrBadPool = types.InvalidParameterErrorf("address space does not contain specified address pool") + ErrNoMoreSubnets = types.InvalidParameterErrorf("all predefined address pools have been fully subnetted") ) // Ipam represents the interface the IPAM service plugins must implement @@ -73,6 +74,10 @@ type PoolRequest struct { // Options is a map of opaque k/v passed to the driver. It's non-mandatory. // Drivers are free to ignore it. Options map[string]string + // Exclude is a list of prefixes the requester wish to not be dynamically + // allocated (ie. when Pool isn't specified). It's up to the IPAM driver to + // take it into account, or totally ignore it. It's required to be sorted. + Exclude []netip.Prefix // V6 indicates which address family should be used to dynamically allocate // a prefix (ie. when Pool isn't specified). V6 bool diff --git a/libnetwork/ipams/defaultipam/address_space.go b/libnetwork/ipams/defaultipam/address_space.go index 84d6f8be62bf1..d305d02ad48d6 100644 --- a/libnetwork/ipams/defaultipam/address_space.go +++ b/libnetwork/ipams/defaultipam/address_space.go @@ -2,36 +2,74 @@ package defaultipam import ( "context" - "fmt" + "errors" "net/netip" + "slices" "sync" "github.com/containerd/log" "github.com/docker/docker/libnetwork/internal/netiputil" "github.com/docker/docker/libnetwork/ipamapi" + "github.com/docker/docker/libnetwork/ipamutils" + "github.com/docker/docker/libnetwork/ipbits" "github.com/docker/docker/libnetwork/types" ) // addrSpace contains the pool configurations for the address space type addrSpace struct { - // Master subnet pools, indexed by the value's stringified PoolData.Pool field. + // Ordered list of allocated subnets. This field is used for dynamic subnet + // allocations. + allocated []netip.Prefix + // Allocated subnets, indexed by their prefix. Values track address + // allocations. subnets map[netip.Prefix]*PoolData - // Predefined pool for the address space - predefined []netip.Prefix - predefinedStartIndex int + // predefined pools for the address space + predefined []*ipamutils.NetworkToSplit mu sync.Mutex } -func newAddrSpace(predefined []netip.Prefix) (*addrSpace, error) { +func newAddrSpace(predefined []*ipamutils.NetworkToSplit) (*addrSpace, error) { + for i, p := range predefined { + if !p.Base.IsValid() { + return nil, errors.New("newAddrSpace: prefix zero found") + } + + predefined[i].Base = p.Base.Masked() + } + + slices.SortFunc(predefined, func(a, b *ipamutils.NetworkToSplit) int { + return netiputil.PrefixCompare(a.Base, b.Base) + }) + + // We need to discard longer overlapping prefixes (sorted after the shorter + // one), otherwise the dynamic allocator might consider a predefined + // network is fully overlapped, go to the next one, which is a subnet of + // the previous, and allocate from it. + j := 0 + for i := 1; i < len(predefined); i++ { + if predefined[j].Overlaps(predefined[i].Base) { + continue + } + j++ + predefined[j] = predefined[i] + } + + if len(predefined) > j { + j++ + } + clear(predefined[j:]) + return &addrSpace{ subnets: map[netip.Prefix]*PoolData{}, - predefined: predefined, + predefined: predefined[:j], }, nil } -// allocateSubnet adds the subnet k to the address space. +// allocateSubnet makes a static allocation for subnets 'nw' and 'sub'. +// +// This method is safe for concurrent use. func (aSpace *addrSpace) allocateSubnet(nw, sub netip.Prefix) error { aSpace.mu.Lock() defer aSpace.mu.Unlock() @@ -52,99 +90,205 @@ func (aSpace *addrSpace) allocateSubnet(nw, sub netip.Prefix) error { return aSpace.allocateSubnetL(nw, sub) } +// allocateSubnetL takes a 'nw' parent prefix and a 'sub' prefix. These are +// '--subnet' and '--ip-range' on the CLI. +// +// If 'sub' prefix is specified, we don't check if 'parent' overlaps with +// existing allocations. However, if no 'sub' prefix is specified, we do check +// for overlaps. This behavior is weird and leads to the inconsistencies +// documented in https://github.com/moby/moby/issues/46756. func (aSpace *addrSpace) allocateSubnetL(nw, sub netip.Prefix) error { // If master pool, check for overlap if sub == (netip.Prefix{}) { if aSpace.overlaps(nw) { return ipamapi.ErrPoolOverlap } - // This is a new master pool, add it along with corresponding bitmask - aSpace.subnets[nw] = newPoolData(nw) - return nil - } - - // This is a new non-master pool (subPool) - if nw.Addr().BitLen() != sub.Addr().BitLen() { - return fmt.Errorf("pool and subpool are of incompatible address families") + return aSpace.allocatePool(nw) } // Look for parent pool - pp, ok := aSpace.subnets[nw] + _, ok := aSpace.subnets[nw] if !ok { - // Parent pool does not exist, add it along with corresponding bitmask - pp = newPoolData(nw) - pp.autoRelease = true - aSpace.subnets[nw] = pp + if err := aSpace.allocatePool(nw); err != nil { + return err + } + aSpace.subnets[nw].autoRelease = true } - pp.children[sub] = struct{}{} + aSpace.subnets[nw].children[sub] = struct{}{} return nil } // overlaps reports whether nw contains any IP addresses in common with any of // the existing subnets in this address space. func (aSpace *addrSpace) overlaps(nw netip.Prefix) bool { - for pool := range aSpace.subnets { - if pool.Overlaps(nw) { + for _, allocated := range aSpace.allocated { + if allocated.Overlaps(nw) { return true } } return false } -// getPredefineds returns the predefined subnets for the address space. -// -// It should not be called concurrently with any other method on the addrSpace. -func (aSpace *addrSpace) getPredefineds() []netip.Prefix { - i := aSpace.predefinedStartIndex - // defensive in case the list changed since last update - if i >= len(aSpace.predefined) { - i = 0 - } - return append(aSpace.predefined[i:], aSpace.predefined[:i]...) +func (aSpace *addrSpace) allocatePool(nw netip.Prefix) error { + n, _ := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare) + aSpace.allocated = slices.Insert(aSpace.allocated, n, nw) + aSpace.subnets[nw] = newPoolData(nw) + return nil } -// updatePredefinedStartIndex rotates the predefined subnet list by amt. +// allocatePredefinedPool dynamically allocates a subnet that doesn't overlap +// with existing allocations and 'reserved' prefixes. // -// It should not be called concurrently with any other method on the addrSpace. -func (aSpace *addrSpace) updatePredefinedStartIndex(amt int) { - i := aSpace.predefinedStartIndex + amt - if i < 0 || i >= len(aSpace.predefined) { - i = 0 - } - aSpace.predefinedStartIndex = i -} - -func (aSpace *addrSpace) allocatePredefinedPool(ipV6 bool) (netip.Prefix, error) { +// This method is safe for concurrent use. +func (aSpace *addrSpace) allocatePredefinedPool(reserved []netip.Prefix) (netip.Prefix, error) { aSpace.mu.Lock() defer aSpace.mu.Unlock() - for i, nw := range aSpace.getPredefineds() { - if ipV6 != nw.Addr().Is6() { - continue + var pdfID int + var partialOverlap bool + var prevAlloc netip.Prefix + + it := newMergeIter(aSpace.allocated, reserved, netiputil.PrefixCompare) + + makeAlloc := func(subnet netip.Prefix) netip.Prefix { + // it.ia tracks the position of the mergeIter within aSpace.allocated. + aSpace.allocated = slices.Insert(aSpace.allocated, it.ia, subnet) + aSpace.subnets[subnet] = newPoolData(subnet) + return subnet + } + + for { + allocated := it.Get() + if allocated == (netip.Prefix{}) { + // We reached the end of both 'aSpace.allocated' and 'reserved'. + break } - // Checks whether pool has already been allocated - if _, ok := aSpace.subnets[nw]; ok { + + if pdfID >= len(aSpace.predefined) { + return netip.Prefix{}, ipamapi.ErrNoMoreSubnets + } + pdf := aSpace.predefined[pdfID] + + if allocated.Overlaps(pdf.Base) { + it.Inc() + + if allocated.Bits() <= pdf.Base.Bits() { + // The current 'allocated' prefix is bigger than the 'pdf' + // network, thus the block is fully overlapped. + partialOverlap = false + prevAlloc = netip.Prefix{} + pdfID++ + continue + } + + // If no previous 'allocated' was found to partially overlap 'pdf', + // we need to test whether there's enough space available at the + // beginning of 'pdf'. + if !partialOverlap && ipbits.SubnetsBetween(pdf.FirstPrefix().Addr(), allocated.Addr(), pdf.Size) >= 1 { + // Okay, so there's at least a whole subnet available between + // the start of 'pdf' and 'allocated'. + next := pdf.FirstPrefix() + return makeAlloc(next), nil + } + + // If the network 'pdf' was already found to be partially + // overlapped, we need to test whether there's enough space between + // the end of 'prevAlloc' and current 'allocated'. + afterPrev := netiputil.PrefixAfter(prevAlloc, pdf.Size) + if partialOverlap && ipbits.SubnetsBetween(afterPrev.Addr(), allocated.Addr(), pdf.Size) >= 1 { + // Okay, so there's at least a whole subnet available after + // 'prevAlloc' and before 'allocated'. + return makeAlloc(afterPrev), nil + } + + if netiputil.LastAddr(allocated) == netiputil.LastAddr(pdf.Base) { + // The last address of the current 'allocated' prefix is the + // same as the last address of the 'pdf' network, it's fully + // overlapped. + partialOverlap = false + prevAlloc = netip.Prefix{} + pdfID++ + continue + } + + // This 'pdf' network is partially overlapped. + partialOverlap = true + prevAlloc = allocated continue } - // Shouldn't be necessary, but check prevents IP collisions should - // predefined pools overlap for any reason. - if !aSpace.overlaps(nw) { - aSpace.updatePredefinedStartIndex(i + 1) - err := aSpace.allocateSubnetL(nw, netip.Prefix{}) - if err != nil { - return netip.Prefix{}, err + + // Okay, so previous 'allocated' overlapped and current doesn't. Now + // the question is: is there enough space left between previous + // 'allocated' and the end of the 'pdf' network? + if partialOverlap { + partialOverlap = false + + if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) { + return makeAlloc(next), nil } - return nw, nil + + // No luck, PrefixAfter yielded an invalid prefix. There's not + // enough space left to subnet it once more. + pdfID++ + + // 'it' is not incremented here, we need to re-test the current + // 'allocated' against the next 'pdf' network. + continue + } + + // If the network 'pdf' doesn't overlap and is sorted before the + // current 'allocated', we found the right spot. + if pdf.Base.Addr().Less(allocated.Addr()) { + next := netip.PrefixFrom(pdf.Base.Addr(), pdf.Size) + return makeAlloc(next), nil + } + + it.Inc() + prevAlloc = allocated + } + + if pdfID >= len(aSpace.predefined) { + return netip.Prefix{}, ipamapi.ErrNoMoreSubnets + } + + // We reached the end of 'allocated', but not the end of predefined + // networks. Let's try two more times (once on the current 'pdf', and once + // on the next network if any). + if partialOverlap { + pdf := aSpace.predefined[pdfID] + + if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) { + return makeAlloc(next), nil } + + // No luck -- PrefixAfter yielded an invalid prefix. There's not enough + // space left. + pdfID++ } - v := 4 - if ipV6 { - v = 6 + // One last chance. Here we don't increment pdfID since the last iteration + // on 'it' found either: + // + // - A full overlap, and incremented 'pdfID'. + // - A partial overlap, and the previous 'if' incremented 'pdfID'. + // - The current 'pdfID' comes after the last 'allocated' -- it's not + // overlapped at all. + // + // Hence, we're sure 'pdfID' has never been subnetted yet. + if pdfID < len(aSpace.predefined) { + pdf := aSpace.predefined[pdfID] + + next := pdf.FirstPrefix() + return makeAlloc(next), nil } - return netip.Prefix{}, types.NotFoundErrorf("could not find an available, non-overlapping IPv%d address pool among the defaults to assign to the network", v) + + return netip.Prefix{}, ipamapi.ErrNoMoreSubnets } +// releaseSubnet deallocates prefixes nw and sub. It returns an error if no +// matching allocations could be found. +// +// This method is safe for concurrent use. func (aSpace *addrSpace) releaseSubnet(nw, sub netip.Prefix) error { aSpace.mu.Lock() defer aSpace.mu.Unlock() @@ -164,12 +308,20 @@ func (aSpace *addrSpace) releaseSubnet(nw, sub netip.Prefix) error { } if len(p.children) == 0 && p.autoRelease { - delete(aSpace.subnets, nw) + aSpace.deallocate(nw) } return nil } +// deallocate removes 'nw' from the list of allocations. +func (aSpace *addrSpace) deallocate(nw netip.Prefix) { + if i, ok := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare); ok { + aSpace.allocated = slices.Delete(aSpace.allocated, i, i+1) + delete(aSpace.subnets, nw) + } +} + func (aSpace *addrSpace) requestAddress(nw, sub netip.Prefix, prefAddress netip.Addr, opts map[string]string) (netip.Addr, error) { aSpace.mu.Lock() defer aSpace.mu.Unlock() diff --git a/libnetwork/ipams/defaultipam/address_space_test.go b/libnetwork/ipams/defaultipam/address_space_test.go new file mode 100644 index 0000000000000..5fb8e4754d429 --- /dev/null +++ b/libnetwork/ipams/defaultipam/address_space_test.go @@ -0,0 +1,389 @@ +package defaultipam + +import ( + "net/netip" + "testing" + + "github.com/docker/docker/libnetwork/ipamapi" + "github.com/docker/docker/libnetwork/ipamutils" + "github.com/google/go-cmp/cmp/cmpopts" + "gotest.tools/v3/assert" + is "gotest.tools/v3/assert/cmp" +) + +func TestNewAddrSpaceDedup(t *testing.T) { + as, err := newAddrSpace([]*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 16}, + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24}, + {Base: netip.MustParsePrefix("10.10.0.0/8"), Size: 24}, + {Base: netip.MustParsePrefix("10.0.100.0/8"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/24"), Size: 24}, + }) + assert.NilError(t, err) + + assert.DeepEqual(t, as.predefined, []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/24"), Size: 24}, + }, cmpopts.EquateComparable(ipamutils.NetworkToSplit{})) +} + +func TestDynamicPoolAllocation(t *testing.T) { + testcases := []struct { + name string + predefined []*ipamutils.NetworkToSplit + allocated []netip.Prefix + reserved []netip.Prefix + expPrefix netip.Prefix + expErr error + }{ + { + name: "First allocated overlaps at the end of first pool", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.255.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "First reserved bigger than first allocated", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap with enough space remaining + netip.MustParsePrefix("10.0.0.0/8"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("10.0.0.0/7"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "First pool fully overlapped by bigger allocated, next overlapped in the middle", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.0.0.0/8"), + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.128.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "First pool fully overlapped by bigger allocated, next overlapped at the beginning and in the middle", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.0.0.0/8"), + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.0.0/24"), + netip.MustParsePrefix("192.168.128.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.1.0/24"), + }, + { + name: "First pool fully overlapped by smaller prefixes, next overlapped in the middle", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.20.0.0/22"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.20.0.0/24"), + netip.MustParsePrefix("10.20.1.0/24"), + netip.MustParsePrefix("10.20.2.0/24"), + netip.MustParsePrefix("192.168.128.0/24"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("10.20.3.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "First pool fully overlapped by smaller prefix, next predefined before reserved", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.20.0.0/17"), + netip.MustParsePrefix("10.20.128.0/17"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("200.1.2.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "First pool fully overlapped by smaller prefix, reserved is the same as the last allocated subnet", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.10.0.0/22"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.10.0.0/24"), + netip.MustParsePrefix("10.10.1.0/24"), + netip.MustParsePrefix("10.10.2.0/24"), + netip.MustParsePrefix("10.10.3.0/24"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("10.10.3.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "Partial overlap by allocated of different sizes", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.0.0/24"), + netip.MustParsePrefix("192.168.1.0/24"), + netip.MustParsePrefix("192.168.2.0/23"), + netip.MustParsePrefix("192.168.4.3/30"), + }, + expPrefix: netip.MustParsePrefix("192.168.5.0/24"), + }, + { + name: "Partial overlap at the start, not enough space left", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/31"), Size: 31}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap, not enough space left + netip.MustParsePrefix("10.0.0.0/32"), + netip.MustParsePrefix("100.0.0.0/32"), + netip.MustParsePrefix("200.0.0.0/32"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "Partial overlap by allocations and reserved of different sizes", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.0.0/24"), + netip.MustParsePrefix("192.168.1.0/24"), + netip.MustParsePrefix("192.168.2.3/30"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("192.168.2.4/30"), + netip.MustParsePrefix("192.168.3.0/30"), + netip.MustParsePrefix("192.168.4.0/23"), + }, + expPrefix: netip.MustParsePrefix("192.168.6.0/24"), + }, + { + name: "Partial overlap, same prefix in allocated and reserved", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + // Partial overlap with enough space remaining + netip.MustParsePrefix("192.168.0.0/24"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("192.168.0.0/24"), + }, + expPrefix: netip.MustParsePrefix("192.168.1.0/24"), + }, + { + name: "Partial overlap, two predefined", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("10.0.0.0/24"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("192.168.0.0/24"), + }, + expPrefix: netip.MustParsePrefix("10.0.1.0/24"), + }, + { + name: "Predefined with overlapping prefixes, longer prefixes discarded", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24}, + // This predefined will be discarded. + {Base: netip.MustParsePrefix("10.0.0.0/16"), Size: 24}, + // This predefined will be discarded. + {Base: netip.MustParsePrefix("10.10.0.0/16"), Size: 24}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + reserved: []netip.Prefix{netip.MustParsePrefix("10.0.0.0/8")}, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "Partial overlap at the beginning, single predefined", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + }, + expPrefix: netip.MustParsePrefix("172.17.0.0/16"), + }, + { + name: "Partial overlap, no space left at the end, next pool not subnetted yet", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.0.0/17"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "Partial overlap, no space left at the end, no more predefined", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.0.0/17"), + }, + expErr: ipamapi.ErrNoMoreSubnets, + }, + { + name: "Extra allocated, no pool left", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.0.0/16"), + netip.MustParsePrefix("192.168.0.0/24"), + }, + expErr: ipamapi.ErrNoMoreSubnets, + }, + { + name: "Extra reserved, no pool left", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.0.0/16"), + }, + reserved: []netip.Prefix{ + netip.MustParsePrefix("192.168.0.0/24"), + }, + expErr: ipamapi.ErrNoMoreSubnets, + }, + { + name: "Predefined fully allocated", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.0.0/16"), + netip.MustParsePrefix("192.168.0.0/24"), + netip.MustParsePrefix("192.168.1.0/24"), + }, + expErr: ipamapi.ErrNoMoreSubnets, + }, + { + name: "Partial overlap, not enough space left", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.128.0/17"), + netip.MustParsePrefix("192.168.0.1/32"), + netip.MustParsePrefix("192.168.1.0/24"), + }, + expErr: ipamapi.ErrNoMoreSubnets, + }, + { + name: "Duplicate 'allocated' at the end of a predefined", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.17.128.0/17"), + netip.MustParsePrefix("172.17.128.0/17"), + netip.MustParsePrefix("172.17.128.0/17"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + { + name: "Duplicate 'allocated'", + predefined: []*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16}, + {Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24}, + }, + allocated: []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/16"), + netip.MustParsePrefix("172.16.120.0/24"), + netip.MustParsePrefix("172.17.128.0/17"), + netip.MustParsePrefix("172.17.128.0/17"), + netip.MustParsePrefix("172.17.128.0/24"), + netip.MustParsePrefix("172.17.128.0/17"), + }, + expPrefix: netip.MustParsePrefix("192.168.0.0/24"), + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + as, err := newAddrSpace(tc.predefined) + assert.NilError(t, err) + as.allocated = tc.allocated + + p, err := as.allocatePredefinedPool(tc.reserved) + + assert.Check(t, is.ErrorIs(err, tc.expErr)) + assert.Check(t, is.Equal(p, tc.expPrefix)) + }) + } +} + +func TestStaticAllocation(t *testing.T) { + as, err := newAddrSpace([]*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24}, + }) + assert.NilError(t, err) + + for _, alloc := range []netip.Prefix{ + netip.MustParsePrefix("192.168.0.0/16"), + netip.MustParsePrefix("10.0.0.0/24"), + netip.MustParsePrefix("10.0.1.0/24"), + netip.MustParsePrefix("10.1.0.0/16"), + netip.MustParsePrefix("10.0.0.0/31"), + netip.MustParsePrefix("10.0.0.0/8"), + netip.MustParsePrefix("192.168.3.0/24"), + } { + err := as.allocatePool(alloc) + assert.NilError(t, err) + } + + assert.Check(t, is.DeepEqual(as.allocated, []netip.Prefix{ + netip.MustParsePrefix("10.0.0.0/8"), + netip.MustParsePrefix("10.0.0.0/24"), + netip.MustParsePrefix("10.0.0.0/31"), + netip.MustParsePrefix("10.0.1.0/24"), + netip.MustParsePrefix("10.1.0.0/16"), + netip.MustParsePrefix("192.168.0.0/16"), + netip.MustParsePrefix("192.168.3.0/24"), + }, cmpopts.EquateComparable(netip.Prefix{}))) +} diff --git a/libnetwork/ipams/defaultipam/allocator.go b/libnetwork/ipams/defaultipam/allocator.go index 65e6ebfb9bf01..1d2fcf30ee3e4 100644 --- a/libnetwork/ipams/defaultipam/allocator.go +++ b/libnetwork/ipams/defaultipam/allocator.go @@ -27,25 +27,15 @@ const ( // two optional address pools respectively containing the list of user-defined // address pools for 'local' and 'global' address spaces. func Register(ic ipamapi.Registerer, lAddrPools, gAddrPools []*ipamutils.NetworkToSplit) error { - localAddressPools := ipamutils.GetLocalScopeDefaultNetworks() - if len(lAddrPools) > 0 { - var err error - localAddressPools, err = ipamutils.SplitNetworks(lAddrPools) - if err != nil { - return err - } + if len(lAddrPools) == 0 { + lAddrPools = ipamutils.GetLocalScopeDefaultNetworks() } - globalAddressPools := ipamutils.GetGlobalScopeDefaultNetworks() - if len(gAddrPools) > 0 { - var err error - globalAddressPools, err = ipamutils.SplitNetworks(gAddrPools) - if err != nil { - return err - } + if len(gAddrPools) == 0 { + gAddrPools = ipamutils.GetGlobalScopeDefaultNetworks() } - a, err := NewAllocator(localAddressPools, globalAddressPools) + a, err := NewAllocator(lAddrPools, gAddrPools) if err != nil { return err } @@ -55,18 +45,18 @@ func Register(ic ipamapi.Registerer, lAddrPools, gAddrPools []*ipamutils.Network return ic.RegisterIpamDriverWithCapabilities(DriverName, a, cps) } -// Allocator provides per address space ipv4/ipv6 book keeping +// Allocator provides per address space ipv4/ipv6 bookkeeping type Allocator struct { // The address spaces local4, local6, global4, global6 *addrSpace } // NewAllocator returns an instance of libnetwork ipam -func NewAllocator(lcAs, glAs []*net.IPNet) (*Allocator, error) { +func NewAllocator(lcAs, glAs []*ipamutils.NetworkToSplit) (*Allocator, error) { var ( a Allocator err error - lcAs4, lcAs6, glAs4, glAs6 []netip.Prefix + lcAs4, lcAs6, glAs4, glAs6 []*ipamutils.NetworkToSplit ) lcAs4, lcAs6, err = splitByIPFamily(lcAs) @@ -98,19 +88,18 @@ func NewAllocator(lcAs, glAs []*net.IPNet) (*Allocator, error) { return &a, nil } -func splitByIPFamily(s []*net.IPNet) ([]netip.Prefix, []netip.Prefix, error) { - var v4, v6 []netip.Prefix +func splitByIPFamily(s []*ipamutils.NetworkToSplit) ([]*ipamutils.NetworkToSplit, []*ipamutils.NetworkToSplit, error) { + var v4, v6 []*ipamutils.NetworkToSplit for i, n := range s { - p, ok := netiputil.ToPrefix(n) - if !ok { - return []netip.Prefix{}, []netip.Prefix{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n) + if !n.Base.IsValid() || n.Size == 0 { + return []*ipamutils.NetworkToSplit{}, []*ipamutils.NetworkToSplit{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n) } - if p.Addr().Is4() { - v4 = append(v4, p) + if n.Base.Addr().Is4() { + v4 = append(v4, n) } else { - v6 = append(v6, p) + v6 = append(v6, n) } } @@ -147,7 +136,7 @@ func (a *Allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, k := PoolID{AddressSpace: req.AddressSpace} if req.Pool == "" { - if k.Subnet, err = aSpace.allocatePredefinedPool(req.V6); err != nil { + if k.Subnet, err = aSpace.allocatePredefinedPool(req.Exclude); err != nil { return ipamapi.AllocatedPool{}, err } return ipamapi.AllocatedPool{PoolID: k.String(), Pool: k.Subnet}, nil @@ -163,6 +152,11 @@ func (a *Allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, } } + // This is a new non-master pool (subPool) + if k.Subnet.IsValid() && k.ChildSubnet.IsValid() && k.Subnet.Addr().BitLen() != k.ChildSubnet.Addr().BitLen() { + return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("pool and subpool are of incompatible address families") + } + k.Subnet, k.ChildSubnet = k.Subnet.Masked(), k.ChildSubnet.Masked() // Prior to https://github.com/moby/moby/pull/44968, libnetwork would happily accept a ChildSubnet with a bigger // mask than its parent subnet. In such case, it was producing IP addresses based on the parent subnet, and the @@ -227,7 +221,7 @@ func newPoolData(pool netip.Prefix) *PoolData { h := bitmap.New(numAddresses) // Pre-reserve the network address on IPv4 networks large - // enough to have one (i.e., anything bigger than a /31. + // enough to have one (i.e., anything bigger than a /31). if !(pool.Addr().Is4() && numAddresses <= 2) { h.Set(0) } diff --git a/libnetwork/ipams/defaultipam/allocator_test.go b/libnetwork/ipams/defaultipam/allocator_test.go index 29e4cc7903258..a97cd3151e294 100644 --- a/libnetwork/ipams/defaultipam/allocator_test.go +++ b/libnetwork/ipams/defaultipam/allocator_test.go @@ -336,40 +336,39 @@ func TestGetSameAddress(t *testing.T) { } } -func TestPoolAllocationReuse(t *testing.T) { +// TestRequestFromSamePool verify the allocator implements the validation +// inconsistencies described in https://github.com/moby/moby/issues/46756. +func TestRequestFromSamePool(t *testing.T) { a, err := NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), ipamutils.GetGlobalScopeDefaultNetworks()) assert.NilError(t, err) - // First get all pools until they are exhausted to - allocs := []ipamapi.AllocatedPool{} - alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace}) - for err == nil { - allocs = append(allocs, alloc) - alloc, err = a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace}) - } - for _, alloc := range allocs { - if err := a.ReleasePool(alloc.PoolID); err != nil { - t.Fatal(err) - } - } + _, err = a.RequestPool(ipamapi.PoolRequest{ + AddressSpace: localAddressSpace, + Pool: "10.0.0.0/8", + SubPool: "10.10.0.0/16", + }) + assert.NilError(t, err) - // Now try to allocate then free nPool pools sequentially. - // Verify that we don't see any repeat networks even though - // we have freed them. - seen := map[string]bool{} - for range allocs { - alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace}) - if err != nil { - t.Fatal(err) - } - if _, ok := seen[alloc.Pool.String()]; ok { - t.Fatalf("Network %s was reused before exhausing the pool list", alloc.Pool.String()) - } - seen[alloc.Pool.String()] = true - if err := a.ReleasePool(alloc.PoolID); err != nil { - t.Fatal(err) - } - } + _, err = a.RequestPool(ipamapi.PoolRequest{ + AddressSpace: localAddressSpace, + Pool: "10.0.0.0/8", + SubPool: "10.10.0.0/16", + }) + assert.ErrorContains(t, err, "invalid pool request") + + _, err = a.RequestPool(ipamapi.PoolRequest{ + AddressSpace: localAddressSpace, + Pool: "10.0.0.0/8", + SubPool: "10.10.0.0/17", + }) + assert.NilError(t, err) + + _, err = a.RequestPool(ipamapi.PoolRequest{ + AddressSpace: localAddressSpace, + Pool: "10.0.0.0/8", + SubPool: "10.11.0.0/16", + }) + assert.NilError(t, err) } func TestGetAddressSubPoolEqualPool(t *testing.T) { diff --git a/libnetwork/ipams/defaultipam/parallel_test.go b/libnetwork/ipams/defaultipam/parallel_test.go index 1312e30924d04..a4187515e2ad7 100644 --- a/libnetwork/ipams/defaultipam/parallel_test.go +++ b/libnetwork/ipams/defaultipam/parallel_test.go @@ -3,13 +3,11 @@ package defaultipam import ( "context" "fmt" - "math/rand" "net" - "sort" + "net/netip" + "slices" "sync" - "sync/atomic" "testing" - "time" "github.com/docker/docker/libnetwork/ipamapi" "github.com/docker/docker/libnetwork/ipamutils" @@ -66,77 +64,46 @@ func TestDebug(t *testing.T) { tctx.a.RequestAddress(tctx.pid, nil, map[string]string{ipamapi.AllocSerialPrefix: "true"}) } -type op struct { - id int32 - add bool - name string -} +func TestRequestPoolParallel(t *testing.T) { + a, err := NewAllocator([]*ipamutils.NetworkToSplit{ + {Base: netip.MustParsePrefix("10.0.0.0/10"), Size: 24}, + }, nil) + assert.NilError(t, err) -func (o *op) String() string { - return fmt.Sprintf("%+v", *o) -} + var expected []string + var eg errgroup.Group + imax := 1 << (a.local4.predefined[0].Size - a.local4.predefined[0].Base.Bits()) + allocCh := make(chan string, imax) -func TestRequestPoolParallel(t *testing.T) { - a, err := NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), ipamutils.GetGlobalScopeDefaultNetworks()) - if err != nil { - t.Fatal(err) - } - var operationIndex int32 - ch := make(chan *op, 240) + for i := 0; i < imax; i++ { + expected = append(expected, fmt.Sprintf("10.%d.%d.0/24", uint(i/256), i%256)) - group := new(errgroup.Group) - defer func() { - if err := group.Wait(); err != nil { - t.Fatal(err) - } - }() + eg.Go(func() error { + t.Helper() - for i := 0; i < 120; i++ { - group.Go(func() error { - alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: "GlobalDefault"}) - if err != nil { - t.Log(err) // log so we can see the error in real time rather than at the end when we actually call "Wait". - return fmt.Errorf("request error %v", err) - } - idx := atomic.AddInt32(&operationIndex, 1) - ch <- &op{idx, true, alloc.PoolID} - time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond) - idx = atomic.AddInt32(&operationIndex, 1) - err = a.ReleasePool(alloc.PoolID) + alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace}) if err != nil { - t.Log(err) // log so we can see the error in real time rather than at the end when we actually call "Wait". - return fmt.Errorf("release error %v", err) + return err } - ch <- &op{idx, false, alloc.PoolID} + + allocCh <- alloc.Pool.String() + return nil }) } - // map of events - m := make(map[string][]*op) - for i := 0; i < 240; i++ { - x := <-ch - ops, ok := m[x.name] - if !ok { - ops = make([]*op, 0, 10) - } - ops = append(ops, x) - m[x.name] = ops + assert.NilError(t, eg.Wait()) + close(allocCh) + + var allocated []string + for alloc := range allocCh { + allocated = append(allocated, alloc) } - // Post processing to avoid event reordering on the channel - for pool, ops := range m { - sort.Slice(ops[:], func(i, j int) bool { - return ops[i].id < ops[j].id - }) - expected := true - for _, op := range ops { - if op.add != expected { - t.Fatalf("Operations for %v not valid %v, operations %v", pool, op, ops) - } - expected = !expected - } + for _, exp := range expected { + assert.Check(t, slices.Contains(allocated, exp) == true) } + assert.Equal(t, len(allocated), len(expected)) } func TestFullAllocateRelease(t *testing.T) { diff --git a/libnetwork/ipams/defaultipam/structures.go b/libnetwork/ipams/defaultipam/structures.go index 37deed6890d5b..8a18f55021452 100644 --- a/libnetwork/ipams/defaultipam/structures.go +++ b/libnetwork/ipams/defaultipam/structures.go @@ -72,3 +72,56 @@ func (s *PoolID) String() string { func (p *PoolData) String() string { return fmt.Sprintf("PoolData[Children: %d]", len(p.children)) } + +// mergeIter is used to iterate on both 'a' and 'b' at the same time while +// maintaining the total order that would arise if both were merged and then +// sorted. Both 'a' and 'b' have to be sorted beforehand. +type mergeIter struct { + a, b []netip.Prefix + ia, ib int + cmp func(a, b netip.Prefix) int + lastA bool +} + +func newMergeIter(a, b []netip.Prefix, cmp func(a, b netip.Prefix) int) *mergeIter { + iter := &mergeIter{ + a: a, + b: b, + cmp: cmp, + } + iter.lastA = iter.nextA() + + return iter +} + +func (it *mergeIter) Get() netip.Prefix { + if it.ia+it.ib >= len(it.a)+len(it.b) { + return netip.Prefix{} + } + + if it.lastA { + return it.a[it.ia] + } + + return it.b[it.ib] +} + +func (it *mergeIter) Inc() { + if it.lastA { + it.ia++ + } else { + it.ib++ + } + + it.lastA = it.nextA() +} + +func (it *mergeIter) nextA() bool { + if it.ia < len(it.a) && it.ib < len(it.b) && it.cmp(it.a[it.ia], it.b[it.ib]) <= 0 { + return true + } else if it.ia < len(it.a) && it.ib >= len(it.b) { + return true + } + + return false +} diff --git a/libnetwork/ipams/defaultipam/structures_test.go b/libnetwork/ipams/defaultipam/structures_test.go new file mode 100644 index 0000000000000..b5c66c0ce500f --- /dev/null +++ b/libnetwork/ipams/defaultipam/structures_test.go @@ -0,0 +1,32 @@ +package defaultipam + +import ( + "net/netip" + "testing" + + "github.com/docker/docker/libnetwork/internal/netiputil" + "gotest.tools/v3/assert" +) + +func TestMergeIter(t *testing.T) { + allocated := []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/24"), + netip.MustParsePrefix("172.17.0.0/24"), + netip.MustParsePrefix("172.18.0.0/24"), + } + reserved := []netip.Prefix{ + netip.MustParsePrefix("172.16.0.0/24"), + } + it := newMergeIter(allocated, reserved, netiputil.PrefixCompare) + + for _, exp := range []netip.Prefix{ + allocated[0], + reserved[0], + allocated[1], + allocated[2], + {}, + } { + assert.Equal(t, it.Get(), exp) + it.Inc() + } +} diff --git a/libnetwork/ipams/remote/remote.go b/libnetwork/ipams/remote/remote.go index 4ddadc9420a65..d3378ccc5e013 100644 --- a/libnetwork/ipams/remote/remote.go +++ b/libnetwork/ipams/remote/remote.go @@ -116,8 +116,26 @@ func (a *allocator) GetDefaultAddressSpaces() (string, string, error) { return res.LocalDefaultAddressSpace, res.GlobalDefaultAddressSpace, nil } -// RequestPool requests an address pool in the specified address space +// RequestPool requests an address pool in the specified address space. +// +// This is a bug-for-bug re-implementation of the logic originally found in +// requestPoolHelper prior to v27. See https://github.com/moby/moby/blob/faf84d7f0a1f2e6badff6f720a3e1e559c356fff/libnetwork/network.go#L1518-L1570 func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, error) { + var tmpPoolLeases []string + defer func() { + // Release all pools we held on to. + for _, pID := range tmpPoolLeases { + if err := a.ReleasePool(pID); err != nil { + log.G(context.TODO()).Warnf("Failed to release overlapping pool") + } + } + }() + + _, globalSpace, err := a.GetDefaultAddressSpaces() + if err != nil { + return ipamapi.AllocatedPool{}, err + } + remoteReq := &api.RequestPoolRequest{ AddressSpace: req.AddressSpace, Pool: req.Pool, @@ -125,8 +143,51 @@ func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, Options: req.Options, V6: req.V6, } + + for { + alloc, err := a.requestPool(remoteReq) + if err != nil { + return alloc, err + } + + // If the network pool was explicitly chosen, the network belongs to + // global address space, or it is invalid ("0.0.0.0/0"), then we don't + // perform check for overlaps. + // + // FIXME(thaJeztah): why are we ignoring invalid pools here? + // + // The "invalid" conditions was added in [libnetwork#1095][1], which + // moved code to reduce os-specific dependencies in the ipam package, + // but also introduced a types.IsIPNetValid() function, which considers + // "0.0.0.0/0" invalid, and added it to the conditions below. + // + // Unfortunately review does not mention this change, so there's no + // context why. Possibly this was done to prevent errors further down + // the line (when checking for overlaps), but returning an error here + // instead would likely have avoided that as well, so we can only guess. + // + // [1]: https://github.com/moby/libnetwork/commit/5ca79d6b87873264516323a7b76f0af7d0298492#diff-bdcd879439d041827d334846f9aba01de6e3683ed8fdd01e63917dae6df23846 + if req.Pool != "" || req.AddressSpace == globalSpace || alloc.Pool.String() == "0.0.0.0/0" { + return alloc, nil + } + + // Check for overlap and if none found, we have found the right pool. + if !checkOverlaps(alloc, req.Exclude) { + return alloc, nil + } + + // Pool obtained in this iteration is overlapping. Hold onto the pool + // and don't release it yet, because we don't want IPAM to give us back + // the same pool over again. But make sure we still do a deferred release + // when we have either obtained a non-overlapping pool or ran out of + // pre-defined pools. + tmpPoolLeases = append(tmpPoolLeases, alloc.PoolID) + } +} + +func (a *allocator) requestPool(req *api.RequestPoolRequest) (ipamapi.AllocatedPool, error) { res := &api.RequestPoolResponse{} - if err := a.call("RequestPool", remoteReq, res); err != nil { + if err := a.call("RequestPool", req, res); err != nil { return ipamapi.AllocatedPool{}, err } @@ -138,6 +199,16 @@ func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, }, err } +// checkOverlaps returns true if the 'pool' overlaps with some prefix in 'reserved'. +func checkOverlaps(pool ipamapi.AllocatedPool, reserved []netip.Prefix) bool { + for _, r := range reserved { + if r.Overlaps(pool.Pool) { + return true + } + } + return false +} + // ReleasePool removes an address pool from the specified address space func (a *allocator) ReleasePool(poolID string) error { req := &api.ReleasePoolRequest{PoolID: poolID} diff --git a/libnetwork/ipamutils/utils.go b/libnetwork/ipamutils/utils.go index ed3a5b278ea61..725f678f6eb7c 100644 --- a/libnetwork/ipamutils/utils.go +++ b/libnetwork/ipamutils/utils.go @@ -2,27 +2,23 @@ package ipamutils import ( - "fmt" - "net" + "net/netip" + "slices" ) var ( - // predefinedLocalScopeDefaultNetworks contains a list of 31 IPv4 private networks with host size 16 and 12 - // (172.17-31.x.x/16, 192.168.x.x/20) which do not overlap with the networks in `PredefinedGlobalScopeDefaultNetworks` - predefinedLocalScopeDefaultNetworks []*net.IPNet - // predefinedGlobalScopeDefaultNetworks contains a list of 64K IPv4 private networks with host size 8 - // (10.x.x.x/24) which do not overlap with the networks in `PredefinedLocalScopeDefaultNetworks` - predefinedGlobalScopeDefaultNetworks []*net.IPNet - localScopeDefaultNetworks = []*NetworkToSplit{ - {"172.17.0.0/16", 16}, - {"172.18.0.0/16", 16}, - {"172.19.0.0/16", 16}, - {"172.20.0.0/14", 16}, - {"172.24.0.0/14", 16}, - {"172.28.0.0/14", 16}, - {"192.168.0.0/16", 20}, + localScopeDefaultNetworks = []*NetworkToSplit{ + {netip.MustParsePrefix("172.17.0.0/16"), 16}, + {netip.MustParsePrefix("172.18.0.0/16"), 16}, + {netip.MustParsePrefix("172.19.0.0/16"), 16}, + {netip.MustParsePrefix("172.20.0.0/14"), 16}, + {netip.MustParsePrefix("172.24.0.0/14"), 16}, + {netip.MustParsePrefix("172.28.0.0/14"), 16}, + {netip.MustParsePrefix("192.168.0.0/16"), 20}, + } + globalScopeDefaultNetworks = []*NetworkToSplit{ + {netip.MustParsePrefix("10.0.0.0/8"), 24}, } - globalScopeDefaultNetworks = []*NetworkToSplit{{"10.0.0.0/8", 24}} ) // NetworkToSplit represent a network that has to be split in chunks with mask length Size. @@ -31,73 +27,26 @@ var ( // Example: a Base "10.10.0.0/16 with Size 24 will define the set of 256 // 10.10.[0-255].0/24 address pools type NetworkToSplit struct { - Base string `json:"base"` - Size int `json:"size"` + Base netip.Prefix `json:"base"` + Size int `json:"size"` } -func init() { - var err error - if predefinedGlobalScopeDefaultNetworks, err = SplitNetworks(globalScopeDefaultNetworks); err != nil { - panic("failed to initialize the global scope default address pool: " + err.Error()) - } +// FirstPrefix returns the first prefix available in NetworkToSplit. +func (n NetworkToSplit) FirstPrefix() netip.Prefix { + return netip.PrefixFrom(n.Base.Addr(), n.Size) +} - if predefinedLocalScopeDefaultNetworks, err = SplitNetworks(localScopeDefaultNetworks); err != nil { - panic("failed to initialize the local scope default address pool: " + err.Error()) - } +// Overlaps is a util function checking whether 'p' overlaps with 'n'. +func (n NetworkToSplit) Overlaps(p netip.Prefix) bool { + return n.Base.Overlaps(p) } // GetGlobalScopeDefaultNetworks returns a copy of the global-sopce network list. -func GetGlobalScopeDefaultNetworks() []*net.IPNet { - return append([]*net.IPNet(nil), predefinedGlobalScopeDefaultNetworks...) +func GetGlobalScopeDefaultNetworks() []*NetworkToSplit { + return slices.Clone(globalScopeDefaultNetworks) } // GetLocalScopeDefaultNetworks returns a copy of the default local-scope network list. -func GetLocalScopeDefaultNetworks() []*net.IPNet { - return append([]*net.IPNet(nil), predefinedLocalScopeDefaultNetworks...) -} - -// SplitNetworks takes a slice of networks, split them accordingly and returns them -func SplitNetworks(list []*NetworkToSplit) ([]*net.IPNet, error) { - localPools := make([]*net.IPNet, 0, len(list)) - - for _, p := range list { - _, b, err := net.ParseCIDR(p.Base) - if err != nil { - return nil, fmt.Errorf("invalid base pool %q: %v", p.Base, err) - } - ones, _ := b.Mask.Size() - if p.Size <= 0 || p.Size < ones { - return nil, fmt.Errorf("invalid pools size: %d", p.Size) - } - localPools = append(localPools, splitNetwork(p.Size, b)...) - } - return localPools, nil -} - -func splitNetwork(size int, base *net.IPNet) []*net.IPNet { - one, bits := base.Mask.Size() - mask := net.CIDRMask(size, bits) - n := 1 << uint(size-one) - s := uint(bits - size) - list := make([]*net.IPNet, 0, n) - - for i := 0; i < n; i++ { - ip := copyIP(base.IP) - addIntToIP(ip, uint(i<= 0; i-- { - array[i] |= (byte)(ordinal & 0xff) - ordinal >>= 8 - } +func GetLocalScopeDefaultNetworks() []*NetworkToSplit { + return slices.Clone(localScopeDefaultNetworks) } diff --git a/libnetwork/ipamutils/utils_test.go b/libnetwork/ipamutils/utils_test.go deleted file mode 100644 index 94221b424d20f..0000000000000 --- a/libnetwork/ipamutils/utils_test.go +++ /dev/null @@ -1,74 +0,0 @@ -package ipamutils - -import ( - "net" - "testing" - - "gotest.tools/v3/assert" - is "gotest.tools/v3/assert/cmp" -) - -func initBroadPredefinedNetworks() []*net.IPNet { - pl := make([]*net.IPNet, 0, 31) - mask := []byte{255, 255, 0, 0} - for i := 17; i < 32; i++ { - pl = append(pl, &net.IPNet{IP: []byte{172, byte(i), 0, 0}, Mask: mask}) - } - mask20 := []byte{255, 255, 240, 0} - for i := 0; i < 16; i++ { - pl = append(pl, &net.IPNet{IP: []byte{192, 168, byte(i << 4), 0}, Mask: mask20}) - } - return pl -} - -func initGranularPredefinedNetworks() []*net.IPNet { - pl := make([]*net.IPNet, 0, 256*256) - mask := []byte{255, 255, 255, 0} - for i := 0; i < 256; i++ { - for j := 0; j < 256; j++ { - pl = append(pl, &net.IPNet{IP: []byte{10, byte(i), byte(j), 0}, Mask: mask}) - } - } - return pl -} - -func TestDefaultNetwork(t *testing.T) { - for _, nw := range GetGlobalScopeDefaultNetworks() { - if ones, bits := nw.Mask.Size(); bits != 32 || ones != 24 { - t.Fatalf("Unexpected size for network in granular list: %v", nw) - } - } - - for _, nw := range GetLocalScopeDefaultNetworks() { - if ones, bits := nw.Mask.Size(); bits != 32 || (ones != 20 && ones != 16) { - t.Fatalf("Unexpected size for network in broad list: %v", nw) - } - } - - originalBroadNets := initBroadPredefinedNetworks() - m := make(map[string]bool) - for _, v := range originalBroadNets { - m[v.String()] = true - } - for _, nw := range GetLocalScopeDefaultNetworks() { - _, ok := m[nw.String()] - assert.Check(t, ok) - delete(m, nw.String()) - } - - assert.Check(t, is.Len(m, 0)) - - originalGranularNets := initGranularPredefinedNetworks() - - m = make(map[string]bool) - for _, v := range originalGranularNets { - m[v.String()] = true - } - for _, nw := range GetGlobalScopeDefaultNetworks() { - _, ok := m[nw.String()] - assert.Check(t, ok) - delete(m, nw.String()) - } - - assert.Check(t, is.Len(m, 0)) -} diff --git a/libnetwork/ipbits/ipbits.go b/libnetwork/ipbits/ipbits.go index ab2c04ed31d4f..6ef3b52915a1a 100644 --- a/libnetwork/ipbits/ipbits.go +++ b/libnetwork/ipbits/ipbits.go @@ -24,6 +24,27 @@ func Add(ip netip.Addr, x uint64, shift uint) netip.Addr { } } +// SubnetsBetween computes the number of subnets of size 'sz' available between 'a1' +// and 'a2'. The result is capped at [math.MaxUint64]. It returns 0 when one of +// 'a1' or 'a2' is invalid, if both aren't of the same family, or when 'a2' is +// less than 'a1'. +func SubnetsBetween(a1 netip.Addr, a2 netip.Addr, sz int) uint64 { + if !a1.IsValid() || !a2.IsValid() || a1.Is4() != a2.Is4() || a2.Less(a1) { + return 0 + } + + p1, _ := a1.Prefix(sz) + p2, _ := a2.Prefix(sz) + + return subAddr(p2.Addr(), p1.Addr()).rsh(uint(a1.BitLen() - sz)).uint64() +} + +// subAddr returns 'ip1 - ip2'. Both netip.Addr have to be of the same address +// family. 'ip1' as to be greater than or equal to 'ip2'. +func subAddr(ip1 netip.Addr, ip2 netip.Addr) uint128 { + return uint128From16(ip1.As16()).sub(uint128From16(ip2.As16())) +} + // Field returns the value of the bitfield [u, v] in ip as an integer, // where bit 0 is the most-significant bit of ip. // diff --git a/libnetwork/ipbits/ipbits_test.go b/libnetwork/ipbits/ipbits_test.go index d23ea1ad8c8d5..6db4f41cf6aee 100644 --- a/libnetwork/ipbits/ipbits_test.go +++ b/libnetwork/ipbits/ipbits_test.go @@ -3,6 +3,8 @@ package ipbits import ( "net/netip" "testing" + + "gotest.tools/v3/assert" ) func TestAdd(t *testing.T) { @@ -68,3 +70,30 @@ func TestField(t *testing.T) { } } } +func TestSubnetsBetween(t *testing.T) { + tests := []struct { + a1, a2 netip.Addr + sz int + want uint64 + }{ + {netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.0.0"), 8, 0}, + {netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.10.0"), 8, 0}, + {netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.1.0.0"), 24, 256}, + {netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.10.0.0"), 16, 10}, + {netip.MustParseAddr("10.20.0.0"), netip.MustParseAddr("10.20.128.0"), 24, 128}, + {netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.10.0"), 24, 10}, + + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00::"), 8, 0x0}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 16, 0x0}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc01::"), 24, 0x100}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc01::"), 16, 0x1}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 24, 0x10}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 24, 0x10}, + {netip.MustParseAddr("fc00::"), netip.MustParseAddr("fd00::"), 64, 0x100_0000_0000_0000}, + } + + for _, tt := range tests { + d := SubnetsBetween(tt.a1, tt.a2, tt.sz) + assert.Check(t, d == tt.want, "SubnetsBetween(%q, %q, %d) = 0x%x; want: 0x%x", tt.a1, tt.a2, tt.sz, d, tt.want) + } +} diff --git a/libnetwork/ipbits/uint128.go b/libnetwork/ipbits/uint128.go index 56700d03a0cc8..9bee6f5d76375 100644 --- a/libnetwork/ipbits/uint128.go +++ b/libnetwork/ipbits/uint128.go @@ -24,6 +24,12 @@ func (x uint128) add(y uint128) uint128 { return uint128{hi: hi, lo: lo} } +func (x uint128) sub(y uint128) uint128 { + lo, carry := bits.Sub64(x.lo, y.lo, 0) + hi, _ := bits.Sub64(x.hi, y.hi, carry) + return uint128{hi: hi, lo: lo} +} + func (x uint128) lsh(n uint) uint128 { if n > 64 { return uint128{hi: x.lo << (n - 64)} diff --git a/libnetwork/netutils/utils.go b/libnetwork/netutils/utils.go index 490fa1dfef29c..5db4b137c888d 100644 --- a/libnetwork/netutils/utils.go +++ b/libnetwork/netutils/utils.go @@ -6,7 +6,6 @@ import ( "context" "crypto/rand" "encoding/hex" - "errors" "fmt" "io" "net" @@ -16,34 +15,6 @@ import ( "github.com/containerd/log" ) -var ( - // ErrNetworkOverlapsWithNameservers preformatted error - ErrNetworkOverlapsWithNameservers = errors.New("requested network overlaps with nameserver") - // ErrNetworkOverlaps preformatted error - ErrNetworkOverlaps = errors.New("requested network overlaps with existing network") -) - -// CheckNameserverOverlaps checks whether the passed network overlaps with any of the nameservers -func CheckNameserverOverlaps(nameservers []string, toCheck *net.IPNet) error { - if len(nameservers) > 0 { - for _, ns := range nameservers { - _, nsNetwork, err := net.ParseCIDR(ns) - if err != nil { - return err - } - if NetworkOverlaps(toCheck, nsNetwork) { - return ErrNetworkOverlapsWithNameservers - } - } - } - return nil -} - -// NetworkOverlaps detects overlap between one IPNet and another -func NetworkOverlaps(netX *net.IPNet, netY *net.IPNet) bool { - return netX.Contains(netY.IP) || netY.Contains(netX.IP) -} - func genMAC(ip net.IP) net.HardwareAddr { hw := make(net.HardwareAddr, 6) // The first byte of the MAC address has to comply with these rules: diff --git a/libnetwork/netutils/utils_freebsd.go b/libnetwork/netutils/utils_freebsd.go index 31b2bf70eca9d..4c4820a66218e 100644 --- a/libnetwork/netutils/utils_freebsd.go +++ b/libnetwork/netutils/utils_freebsd.go @@ -1,13 +1,6 @@ package netutils -import ( - "net" - - "github.com/docker/docker/libnetwork/types" -) - -// FindAvailableNetwork returns a network from the passed list which does not -// overlap with existing interfaces in the system -func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) { - return nil, types.NotImplementedErrorf("not supported on freebsd") +// InferReservedNetworks returns an empty list on FreeBSD. +func InferReservedNetworks(v6 bool) []netip.Prefix { + return []netip.Prefix{} } diff --git a/libnetwork/netutils/utils_linux.go b/libnetwork/netutils/utils_linux.go index b5488aa6bbd42..491f32107c50a 100644 --- a/libnetwork/netutils/utils_linux.go +++ b/libnetwork/netutils/utils_linux.go @@ -5,9 +5,11 @@ package netutils import ( - "net" + "net/netip" "os" + "slices" + "github.com/docker/docker/libnetwork/internal/netiputil" "github.com/docker/docker/libnetwork/ns" "github.com/docker/docker/libnetwork/resolvconf" "github.com/docker/docker/libnetwork/types" @@ -15,24 +17,84 @@ import ( "github.com/vishvananda/netlink" ) -var networkGetRoutesFct func(netlink.Link, int) ([]netlink.Route, error) +// InferReservedNetworks returns a list of network prefixes that seem to be +// used by the system and that would likely break it if they were assigned to +// some Docker networks. It uses two heuristics to build that list: +// +// 1. Nameservers configured in /etc/resolv.conf ; +// 2. On-link routes ; +// +// That 2nd heuristic was originally not limited to on-links -- all non-default +// routes were checked (see [1]). This proved to be not ideal at best and +// highly problematic at worst: +// +// - VPN software and appliances doing split tunneling might push a small set +// of routes for large, aggregated prefixes to avoid maintenance and +// potential issues whenever a new subnet comes into use on internal +// network. However, not all subnets from these aggregates might be in use. +// - For full tunneling, especially when implemented with OpenVPN, the +// situation is even worse as the host might end up with the two following +// routes: 0.0.0.0/1 and 128.0.0.0/1. They are functionally +// indistinguishable from a default route, yet the Engine was treating them +// differently. With those routes, there was no way to use dynamic subnet +// allocation at all. (see 'def1' on [2]) +// - A subnet covered by the default route can be used, or not. Same for +// non-default and non-on-link routes. The type of route says little about +// the availability of subnets it covers, except for on-link routes as they +// specifically define what subnet the current host is part of. +// +// The 2nd heuristic was modified to be limited to on-link routes in PR #42598 +// (first released in v23.0, see [3]). +// +// If these heuristics don't detect an overlap, users should change their daemon +// config to remove that overlapping prefix from `default-address-pools`. If a +// prefix is found to overlap but users care enough about it being associated +// to a Docker network they can still rely on static allocation. +// +// For IPv6, the 2nd heuristic isn't applied as there's no such thing as +// on-link routes for IPv6. +// +// [1]: https://github.com/moby/libnetwork/commit/56832d6d89bf0f9d5280849026ee25ae4ae5f22e +// [2]: https://community.openvpn.net/openvpn/wiki/Openvpn23ManPage +// [3]: https://github.com/moby/moby/pull/42598 +func InferReservedNetworks(v6 bool) []netip.Prefix { + var reserved []netip.Prefix -// CheckRouteOverlaps checks whether the passed network overlaps with any existing routes -func CheckRouteOverlaps(toCheck *net.IPNet) error { - networkGetRoutesFct := networkGetRoutesFct - if networkGetRoutesFct == nil { - networkGetRoutesFct = ns.NlHandle().RouteList + // We don't really care if os.ReadFile fails here. It either doesn't exist, + // or we can't read it for some reason. + if rc, err := os.ReadFile(resolvconf.Path()); err == nil { + reserved = slices.DeleteFunc(resolvconf.GetNameserversAsPrefix(rc), func(p netip.Prefix) bool { + return p.Addr().Is6() != v6 + }) + } + + if !v6 { + reserved = append(reserved, queryOnLinkRoutes()...) } - networks, err := networkGetRoutesFct(nil, netlink.FAMILY_V4) + + slices.SortFunc(reserved, netiputil.PrefixCompare) + return reserved +} + +// queryOnLinkRoutes returns a list of on-link routes available on the host. +// Only IPv4 prefixes are returned as there's no such thing as on-link +// routes for IPv6. +func queryOnLinkRoutes() []netip.Prefix { + routes, err := ns.NlHandle().RouteList(nil, netlink.FAMILY_V4) if err != nil { - return err + return nil } - for _, network := range networks { - if network.Dst != nil && network.Scope == netlink.SCOPE_LINK && NetworkOverlaps(toCheck, network.Dst) { - return ErrNetworkOverlaps + + var prefixes []netip.Prefix + for _, route := range routes { + if route.Dst != nil && route.Scope == netlink.SCOPE_LINK { + if p, ok := netiputil.ToPrefix(route.Dst); ok { + prefixes = append(prefixes, p) + } } } - return nil + + return prefixes } // GenerateIfaceName returns an interface name using the passed in @@ -58,23 +120,3 @@ func GenerateIfaceName(nlh *netlink.Handle, prefix string, len int) (string, err } return "", types.InternalErrorf("could not generate interface name") } - -// FindAvailableNetwork returns a network from the passed list which does not -// overlap with existing interfaces in the system -func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) { - // We don't check for an error here, because we don't really care if we - // can't read /etc/resolv.conf. So instead we skip the append if resolvConf - // is nil. It either doesn't exist, or we can't read it for some reason. - var nameservers []string - if rc, err := os.ReadFile(resolvconf.Path()); err == nil { - nameservers = resolvconf.GetNameserversAsCIDR(rc) - } - for _, nw := range list { - if err := CheckNameserverOverlaps(nameservers, nw); err == nil { - if err := CheckRouteOverlaps(nw); err == nil { - return nw, nil - } - } - } - return nil, errors.New("no available network") -} diff --git a/libnetwork/netutils/utils_linux_test.go b/libnetwork/netutils/utils_linux_test.go index 762c82f80a910..b9228e9d0581c 100644 --- a/libnetwork/netutils/utils_linux_test.go +++ b/libnetwork/netutils/utils_linux_test.go @@ -3,128 +3,18 @@ package netutils import ( "bytes" "fmt" - "net" + "net/netip" + "slices" "strings" "testing" "github.com/docker/docker/internal/testutils/netnsutils" - "github.com/docker/docker/libnetwork/ipamutils" - "github.com/docker/docker/libnetwork/types" + "github.com/docker/docker/libnetwork/internal/netiputil" "github.com/vishvananda/netlink" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" ) -func TestNonOverlappingNameservers(t *testing.T) { - network := &net.IPNet{ - IP: []byte{192, 168, 0, 1}, - Mask: []byte{255, 255, 255, 0}, - } - nameservers := []string{ - "127.0.0.1/32", - } - - if err := CheckNameserverOverlaps(nameservers, network); err != nil { - t.Fatal(err) - } -} - -func TestOverlappingNameservers(t *testing.T) { - network := &net.IPNet{ - IP: []byte{192, 168, 0, 1}, - Mask: []byte{255, 255, 255, 0}, - } - nameservers := []string{ - "192.168.0.1/32", - } - - if err := CheckNameserverOverlaps(nameservers, network); err == nil { - t.Fatalf("Expected error %s got %s", ErrNetworkOverlapsWithNameservers, err) - } -} - -func TestCheckRouteOverlaps(t *testing.T) { - networkGetRoutesFct = func(netlink.Link, int) ([]netlink.Route, error) { - routesData := []string{"10.0.2.0/32", "10.0.3.0/24", "10.0.42.0/24", "172.16.42.0/24", "192.168.142.0/24"} - routes := []netlink.Route{} - for _, addr := range routesData { - _, netX, _ := net.ParseCIDR(addr) - routes = append(routes, netlink.Route{Dst: netX, Scope: netlink.SCOPE_LINK}) - } - // Add a route with a scope which should not overlap - _, netX, _ := net.ParseCIDR("10.0.5.0/24") - routes = append(routes, netlink.Route{Dst: netX, Scope: netlink.SCOPE_UNIVERSE}) - return routes, nil - } - defer func() { networkGetRoutesFct = nil }() - - _, netX, _ := net.ParseCIDR("172.16.0.1/24") - if err := CheckRouteOverlaps(netX); err != nil { - t.Fatal(err) - } - - _, netX, _ = net.ParseCIDR("10.0.2.0/24") - if err := CheckRouteOverlaps(netX); err == nil { - t.Fatal("10.0.2.0/24 and 10.0.2.0 should overlap but it doesn't") - } - - _, netX, _ = net.ParseCIDR("10.0.5.0/24") - if err := CheckRouteOverlaps(netX); err != nil { - t.Fatal("10.0.5.0/24 and 10.0.5.0 with scope UNIVERSE should not overlap but it does") - } -} - -func TestCheckNameserverOverlaps(t *testing.T) { - nameservers := []string{"10.0.2.3/32", "192.168.102.1/32"} - - _, netX, _ := net.ParseCIDR("10.0.2.3/32") - - if err := CheckNameserverOverlaps(nameservers, netX); err == nil { - t.Fatalf("%s should overlap 10.0.2.3/32 but doesn't", netX) - } - - _, netX, _ = net.ParseCIDR("192.168.102.2/32") - - if err := CheckNameserverOverlaps(nameservers, netX); err != nil { - t.Fatalf("%s should not overlap %v but it does", netX, nameservers) - } -} - -func AssertOverlap(CIDRx string, CIDRy string, t *testing.T) { - _, netX, _ := net.ParseCIDR(CIDRx) - _, netY, _ := net.ParseCIDR(CIDRy) - if !NetworkOverlaps(netX, netY) { - t.Errorf("%v and %v should overlap", netX, netY) - } -} - -func AssertNoOverlap(CIDRx string, CIDRy string, t *testing.T) { - _, netX, _ := net.ParseCIDR(CIDRx) - _, netY, _ := net.ParseCIDR(CIDRy) - if NetworkOverlaps(netX, netY) { - t.Errorf("%v and %v should not overlap", netX, netY) - } -} - -func TestNetworkOverlaps(t *testing.T) { - // netY starts at same IP and ends within netX - AssertOverlap("172.16.0.1/24", "172.16.0.1/25", t) - // netY starts within netX and ends at same IP - AssertOverlap("172.16.0.1/24", "172.16.0.128/25", t) - // netY starts and ends within netX - AssertOverlap("172.16.0.1/24", "172.16.0.64/25", t) - // netY starts at same IP and ends outside of netX - AssertOverlap("172.16.0.1/24", "172.16.0.1/23", t) - // netY starts before and ends at same IP of netX - AssertOverlap("172.16.1.1/24", "172.16.0.1/23", t) - // netY starts before and ends outside of netX - AssertOverlap("172.16.1.1/24", "172.16.0.1/22", t) - // netY starts and ends before netX - AssertNoOverlap("172.16.1.1/25", "172.16.0.1/24", t) - // netX starts and ends before netY - AssertNoOverlap("172.16.1.1/25", "172.16.2.1/24", t) -} - // Test veth name generation "veth"+rand (e.g.veth0f60e2c) func TestGenerateRandomName(t *testing.T) { const vethPrefix = "veth" @@ -184,83 +74,53 @@ func TestUtilGenerateRandomMAC(t *testing.T) { } } -func TestNetworkRequest(t *testing.T) { +func TestInferReservedNetworksV4(t *testing.T) { defer netnsutils.SetupTestOSContext(t)() - nw, err := FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks()) - if err != nil { - t.Fatal(err) - } - - var found bool - for _, exp := range ipamutils.GetLocalScopeDefaultNetworks() { - if types.CompareIPNet(exp, nw) { - found = true - break - } - } - - if !found { - t.Fatalf("Found unexpected broad network %s", nw) - } - - nw, err = FindAvailableNetwork(ipamutils.GetGlobalScopeDefaultNetworks()) - if err != nil { - t.Fatal(err) - } - - found = false - for _, exp := range ipamutils.GetGlobalScopeDefaultNetworks() { - if types.CompareIPNet(exp, nw) { - found = true - break - } - } - - if !found { - t.Fatalf("Found unexpected granular network %s", nw) - } - - // Add iface and ssert returned address on request - createInterface(t, "test", "172.17.42.1/16") - - _, exp, err := net.ParseCIDR("172.18.0.0/16") - if err != nil { - t.Fatal(err) - } - nw, err = FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks()) - if err != nil { - t.Fatal(err) - } - if !types.CompareIPNet(exp, nw) { - t.Fatalf("expected %s. got %s", exp, nw) - } + ifaceID := createInterface(t, "foobar") + addRoute(t, ifaceID, netlink.SCOPE_LINK, netip.MustParsePrefix("100.0.0.0/24")) + addRoute(t, ifaceID, netlink.SCOPE_LINK, netip.MustParsePrefix("10.0.0.0/8")) + addRoute(t, ifaceID, netlink.SCOPE_UNIVERSE, netip.MustParsePrefix("20.0.0.0/8")) + + reserved := InferReservedNetworks(false) + t.Logf("reserved: %+v", reserved) + + // We don't check the size of 'reserved' here because it also includes + // nameservers set in /etc/resolv.conf. This file might change from one test + // env to another, and it'd be unnecessarily complex to set up a mount + // namespace just to check that. Current implementation uses a function + // which is properly tested, so everything should be good. + assert.Check(t, slices.Contains(reserved, netip.MustParsePrefix("100.0.0.0/24"))) + assert.Check(t, slices.Contains(reserved, netip.MustParsePrefix("10.0.0.0/8"))) + assert.Check(t, !slices.Contains(reserved, netip.MustParsePrefix("20.0.0.0/8"))) } -func createInterface(t *testing.T, name string, nws ...string) { - // Add interface - link := &netlink.Bridge{ +func createInterface(t *testing.T, name string) int { + t.Helper() + + link := &netlink.Dummy{ LinkAttrs: netlink.LinkAttrs{ - Name: "test", + Name: name, }, } - bips := []*net.IPNet{} - for _, nw := range nws { - bip, err := types.ParseCIDR(nw) - if err != nil { - t.Fatal(err) - } - bips = append(bips, bip) - } if err := netlink.LinkAdd(link); err != nil { - t.Fatalf("Failed to create interface via netlink: %v", err) - } - for _, bip := range bips { - if err := netlink.AddrAdd(link, &netlink.Addr{IPNet: bip}); err != nil { - t.Fatal(err) - } + t.Fatalf("failed to create interface %s: %v", name, err) } if err := netlink.LinkSetUp(link); err != nil { t.Fatal(err) } + + return link.Attrs().Index +} + +func addRoute(t *testing.T, linkID int, scope netlink.Scope, prefix netip.Prefix) { + t.Helper() + + if err := netlink.RouteAdd(&netlink.Route{ + Scope: scope, + LinkIndex: linkID, + Dst: netiputil.ToIPNet(prefix), + }); err != nil { + t.Fatalf("failed to add on-link route %s: %v", prefix, err) + } } diff --git a/libnetwork/netutils/utils_windows.go b/libnetwork/netutils/utils_windows.go index b88e3ebe65d00..fed188b8aca8a 100644 --- a/libnetwork/netutils/utils_windows.go +++ b/libnetwork/netutils/utils_windows.go @@ -1,13 +1,8 @@ package netutils -import ( - "net" -) +import "net/netip" -// FindAvailableNetwork returns a network from the passed list which does not -// overlap with existing interfaces in the system -// -// TODO : Use appropriate windows APIs to identify non-overlapping subnets -func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) { - return nil, nil +// InferReservedNetworks returns an empty list on Windows. +func InferReservedNetworks(v6 bool) []netip.Prefix { + return []netip.Prefix{} } diff --git a/libnetwork/network.go b/libnetwork/network.go index 7340ad101b1c8..d9c3070b6e350 100644 --- a/libnetwork/network.go +++ b/libnetwork/network.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "net" + "net/netip" "runtime" "strings" "sync" @@ -1517,66 +1518,6 @@ func (n *Network) ipamAllocate() error { return err } -func (n *Network) requestPoolHelper(ipam ipamapi.Ipam, addressSpace, requestedPool, requestedSubPool string, options map[string]string, v6 bool) (string, *net.IPNet, map[string]string, error) { - var tmpPoolLeases []string - defer func() { - // Prevent repeated lock/unlock in the loop. - nwName := n.Name() - // Release all pools we held on to. - for _, pID := range tmpPoolLeases { - if err := ipam.ReleasePool(pID); err != nil { - log.G(context.TODO()).Warnf("Failed to release overlapping pool while returning from pool request helper for network %s", nwName) - } - } - }() - - for { - alloc, err := ipam.RequestPool(ipamapi.PoolRequest{ - AddressSpace: addressSpace, - Pool: requestedPool, - SubPool: requestedSubPool, - Options: options, - V6: v6, - }) - if err != nil { - return "", nil, nil, err - } - - // If the network pool was explicitly chosen, the network belongs to - // global scope, or it is invalid ("0.0.0.0/0"), then we don't perform - // check for overlaps. - // - // FIXME(thaJeztah): why are we ignoring invalid pools here? - // - // The "invalid" conditions was added in [libnetwork#1095][1], which - // moved code to reduce os-specific dependencies in the ipam package, - // but also introduced a types.IsIPNetValid() function, which considers - // "0.0.0.0/0" invalid, and added it to the conditions below. - // - // Unfortunately review does not mention this change, so there's no - // context why. Possibly this was done to prevent errors further down - // the line (when checking for overlaps), but returning an error here - // instead would likely have avoided that as well, so we can only guess. - // - // [1]: https://github.com/moby/libnetwork/commit/5ca79d6b87873264516323a7b76f0af7d0298492#diff-bdcd879439d041827d334846f9aba01de6e3683ed8fdd01e63917dae6df23846 - if requestedPool != "" || n.Scope() == scope.Global || alloc.Pool.String() == "0.0.0.0/0" { - return alloc.PoolID, netiputil.ToIPNet(alloc.Pool), alloc.Meta, nil - } - - // Check for overlap and if none found, we have found the right pool. - if _, err := netutils.FindAvailableNetwork([]*net.IPNet{netiputil.ToIPNet(alloc.Pool)}); err == nil { - return alloc.PoolID, netiputil.ToIPNet(alloc.Pool), alloc.Meta, nil - } - - // Pool obtained in this iteration is overlapping. Hold onto the pool - // and don't release it yet, because we don't want IPAM to give us back - // the same pool over again. But make sure we still do a deferred release - // when we have either obtained a non-overlapping pool or ran out of - // pre-defined pools. - tmpPoolLeases = append(tmpPoolLeases, alloc.PoolID) - } -} - func (n *Network) ipamAllocateVersion(ipVer int, ipam ipamapi.Ipam) error { var ( cfgList *[]*IpamConf @@ -1611,11 +1552,28 @@ func (n *Network) ipamAllocateVersion(ipVer int, ipam ipamapi.Ipam) error { (*infoList)[i] = d d.AddressSpace = n.addrSpace - d.PoolID, d.Pool, d.Meta, err = n.requestPoolHelper(ipam, n.addrSpace, cfg.PreferredPool, cfg.SubPool, n.ipamOptions, ipVer == 6) + + var reserved []netip.Prefix + if n.Scope() != scope.Global { + reserved = netutils.InferReservedNetworks(ipVer == 6) + } + + alloc, err := ipam.RequestPool(ipamapi.PoolRequest{ + AddressSpace: n.addrSpace, + Pool: cfg.PreferredPool, + SubPool: cfg.SubPool, + Options: n.ipamOptions, + Exclude: reserved, + V6: ipVer == 6, + }) if err != nil { return err } + d.PoolID = alloc.PoolID + d.Pool = netiputil.ToIPNet(alloc.Pool) + d.Meta = alloc.Meta + defer func() { if err != nil { if err := ipam.ReleasePool(d.PoolID); err != nil { diff --git a/libnetwork/resolvconf/resolvconf.go b/libnetwork/resolvconf/resolvconf.go index c3473872b1725..4805b0eadf793 100644 --- a/libnetwork/resolvconf/resolvconf.go +++ b/libnetwork/resolvconf/resolvconf.go @@ -3,7 +3,7 @@ package resolvconf import ( "bytes" - "fmt" + "net/netip" "os" "strings" @@ -83,19 +83,17 @@ func GetNameservers(resolvConf []byte, kind int) []string { return nameservers } -// GetNameserversAsCIDR returns nameservers (if any) listed in +// GetNameserversAsPrefix returns nameservers (if any) listed in // /etc/resolv.conf as CIDR blocks (e.g., "1.2.3.4/32") -// This function's output is intended for net.ParseCIDR -func GetNameserversAsCIDR(resolvConf []byte) []string { +func GetNameserversAsPrefix(resolvConf []byte) []netip.Prefix { rc, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "") if err != nil { return nil } nsAddrs := rc.NameServers() - nameservers := make([]string, 0, len(nsAddrs)) + nameservers := make([]netip.Prefix, 0, len(nsAddrs)) for _, addr := range nsAddrs { - str := fmt.Sprintf("%s/%d", addr.WithZone("").String(), addr.BitLen()) - nameservers = append(nameservers, str) + nameservers = append(nameservers, netip.PrefixFrom(addr, addr.BitLen())) } return nameservers } diff --git a/libnetwork/resolvconf/resolvconf_unix_test.go b/libnetwork/resolvconf/resolvconf_unix_test.go index 6d3492d10b123..5cf6ea839711b 100644 --- a/libnetwork/resolvconf/resolvconf_unix_test.go +++ b/libnetwork/resolvconf/resolvconf_unix_test.go @@ -4,10 +4,12 @@ package resolvconf import ( "bytes" + "net/netip" "os" "strings" "testing" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/opencontainers/go-digest" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" @@ -78,58 +80,58 @@ nameserver 1.2.3.4 # not 4.3.2.1`, } } -func TestGetNameserversAsCIDR(t *testing.T) { +func TestGetNameserversAsPrefix(t *testing.T) { for _, tc := range []struct { input string - result []string + result []netip.Prefix }{ { - input: ``, + input: ``, + result: []netip.Prefix{}, }, { - input: `search example.com`, + input: `search example.com`, + result: []netip.Prefix{}, }, { input: ` nameserver 1.2.3.4 `, - result: []string{"1.2.3.4/32"}, + result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")}, }, { input: ` nameserver 1.2.3.4 nameserver 40.3.200.10 search example.com`, - result: []string{"1.2.3.4/32", "40.3.200.10/32"}, + result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32"), netip.MustParsePrefix("40.3.200.10/32")}, }, { input: `nameserver 1.2.3.4 search example.com nameserver 4.30.20.100`, - result: []string{"1.2.3.4/32", "4.30.20.100/32"}, + result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32"), netip.MustParsePrefix("4.30.20.100/32")}, }, { input: `search example.com nameserver 1.2.3.4 #nameserver 4.3.2.1`, - result: []string{"1.2.3.4/32"}, + result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")}, }, { input: `search example.com nameserver 1.2.3.4 # not 4.3.2.1`, - result: []string{"1.2.3.4/32"}, + result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")}, }, { input: `nameserver fd6f:c490:ec68::1`, - result: []string{"fd6f:c490:ec68::1/128"}, + result: []netip.Prefix{netip.MustParsePrefix("fd6f:c490:ec68::1/128")}, }, { input: `nameserver fe80::1234%eth0`, - result: []string{"fe80::1234/128"}, + result: []netip.Prefix{netip.MustParsePrefix("fe80::1234/128")}, }, } { - test := GetNameserversAsCIDR([]byte(tc.input)) - if !strSlicesEqual(test, tc.result) { - t.Errorf("Wrong nameserver string {%s} should be %v. Input: %s", test, tc.result, tc.input) - } + test := GetNameserversAsPrefix([]byte(tc.input)) + assert.DeepEqual(t, test, tc.result, cmpopts.EquateComparable(netip.Prefix{})) } } diff --git a/opts/address_pools.go b/opts/address_pools.go index a34f98b0c0dc7..64ce721fdbfc2 100644 --- a/opts/address_pools.go +++ b/opts/address_pools.go @@ -4,6 +4,7 @@ import ( "encoding/csv" "encoding/json" "fmt" + "net/netip" "strconv" "strings" @@ -39,7 +40,11 @@ func (p *PoolsOpt) Set(value string) error { switch key { case "base": - poolsDef.Base = val + base, err := netip.ParsePrefix(val) + if err != nil { + return fmt.Errorf("invalid base prefix %q: %w", val, err) + } + poolsDef.Base = base case "size": size, err := strconv.Atoi(val) if err != nil { From 0c022307e970d97bcca626a06eafe27a3a088c70 Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Tue, 7 May 2024 08:51:20 +0200 Subject: [PATCH 2/3] libnet/i/defaultipam: Unmap IPv4-mapped IPv6 addrs This ensures such address pools are part of the IPv4 address space. Signed-off-by: Albin Kerouanton --- libnetwork/ipams/defaultipam/address_space.go | 9 --------- libnetwork/ipams/defaultipam/allocator.go | 2 ++ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/libnetwork/ipams/defaultipam/address_space.go b/libnetwork/ipams/defaultipam/address_space.go index d305d02ad48d6..92f886dcc9cdc 100644 --- a/libnetwork/ipams/defaultipam/address_space.go +++ b/libnetwork/ipams/defaultipam/address_space.go @@ -2,7 +2,6 @@ package defaultipam import ( "context" - "errors" "net/netip" "slices" "sync" @@ -31,14 +30,6 @@ type addrSpace struct { } func newAddrSpace(predefined []*ipamutils.NetworkToSplit) (*addrSpace, error) { - for i, p := range predefined { - if !p.Base.IsValid() { - return nil, errors.New("newAddrSpace: prefix zero found") - } - - predefined[i].Base = p.Base.Masked() - } - slices.SortFunc(predefined, func(a, b *ipamutils.NetworkToSplit) int { return netiputil.PrefixCompare(a.Base, b.Base) }) diff --git a/libnetwork/ipams/defaultipam/allocator.go b/libnetwork/ipams/defaultipam/allocator.go index 1d2fcf30ee3e4..645f1dfa8affc 100644 --- a/libnetwork/ipams/defaultipam/allocator.go +++ b/libnetwork/ipams/defaultipam/allocator.go @@ -96,6 +96,8 @@ func splitByIPFamily(s []*ipamutils.NetworkToSplit) ([]*ipamutils.NetworkToSplit return []*ipamutils.NetworkToSplit{}, []*ipamutils.NetworkToSplit{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n) } + n.Base, _ = n.Base.Addr().Unmap().Prefix(n.Base.Bits()) + if n.Base.Addr().Is4() { v4 = append(v4, n) } else { From 500eff0ae93b17a7b9258455fb870a95330bc6db Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Tue, 7 May 2024 08:55:01 +0200 Subject: [PATCH 3/3] libnet/i/defaultipam: improve address pools validation Nothing was validating whether address pools' `base` prefix were larger than the target subnet `size` they're associated to. As such invalid address pools would yield no subnet, the error could go unnoticed. Signed-off-by: Albin Kerouanton --- libnetwork/ipams/defaultipam/allocator.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libnetwork/ipams/defaultipam/allocator.go b/libnetwork/ipams/defaultipam/allocator.go index 645f1dfa8affc..a211843ca90e4 100644 --- a/libnetwork/ipams/defaultipam/allocator.go +++ b/libnetwork/ipams/defaultipam/allocator.go @@ -95,6 +95,9 @@ func splitByIPFamily(s []*ipamutils.NetworkToSplit) ([]*ipamutils.NetworkToSplit if !n.Base.IsValid() || n.Size == 0 { return []*ipamutils.NetworkToSplit{}, []*ipamutils.NetworkToSplit{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n) } + if n.Base.Bits() > n.Size { + return []*ipamutils.NetworkToSplit{}, []*ipamutils.NetworkToSplit{}, fmt.Errorf("network at index %d (%v) has a smaller prefix (/%d) than the target size of that pool (/%d)", i, n, n.Base.Bits(), n.Size) + } n.Base, _ = n.Base.Addr().Unmap().Prefix(n.Base.Bits())