Simplify map session management #1931

Merged: 25 commits, May 24, 2024

Commits
e2d95e2
try to close on existing connection, also reject
kradalby Apr 22, 2024
5b132b6
expand notifier and mapresp metrics
kradalby May 4, 2024
59c522d
make timeout and rejects label node id
kradalby May 5, 2024
1f9ffaa
remove mapresponse map
kradalby May 8, 2024
eb33299
standardise trace logging in notifier
kradalby May 8, 2024
9851123
update comments
kradalby May 8, 2024
3ed9f72
format id in metrics
kradalby May 9, 2024
97f8b36
add missing ts version
kradalby May 13, 2024
0a69b26
send updates async so we reach select in poll
kradalby May 13, 2024
3d92837
expand integ test cert validity
kradalby May 13, 2024
789bfe7
only print nodes with not all up peers in test
kradalby May 14, 2024
c96f11d
set a timeout for sendall nodes
kradalby May 17, 2024
e8e9290
split mapresp serve by streaming
kradalby May 17, 2024
fa97a86
up notifier timeout
kradalby May 17, 2024
edcd23b
name integration test metrics by hostname
kradalby May 17, 2024
9060260
fix not setting sendtimeout
kradalby May 17, 2024
c76128c
add env options for debug deadlock and config
kradalby May 17, 2024
0cfe0dd
add debug to intg test, make high cardin metrics debug flag
kradalby May 17, 2024
516da1c
change how conditional metric is registered
kradalby May 17, 2024
5e5b443
comment on sendall timeout
kradalby May 17, 2024
c03497d
dump config in integ test
kradalby May 17, 2024
e9c6687
add debug metric for last sent to node
kradalby May 17, 2024
00aa63c
add 1.66 to testing
kradalby May 20, 2024
0047008
fix issue where new clients are not always setting NetInfo
kradalby May 21, 2024
a49755e
replace docker head file with upstream and bisect note
kradalby May 21, 2024
50 changes: 36 additions & 14 deletions Dockerfile.tailscale-HEAD
@@ -1,21 +1,43 @@
# This Dockerfile and the images produced are for testing headscale,
# and are in no way endorsed by Headscale's maintainers as an
# official nor supported release or distribution.
# Copyright (c) Tailscale Inc & AUTHORS
# SPDX-License-Identifier: BSD-3-Clause

FROM golang:latest
# This Dockerfile is more or less lifted from tailscale/tailscale
# to ensure a similar build process when testing the HEAD of tailscale.

RUN apt-get update \
&& apt-get install -y dnsutils git iptables ssh ca-certificates \
&& rm -rf /var/lib/apt/lists/*
FROM golang:1.22-alpine AS build-env

Collaborator commented:

Please never use alpine images. Musl C is so broken.

Author replied:

> Please never use alpine images. Musl C is so broken.

While in general I agree with you, this particular case will reproduce the HEAD image, so it's more or less the same as all the other ones, which we pull from Docker Hub and which are built by that file.

So in this case this is fixing the odd one out.

RUN useradd --shell=/bin/bash --create-home ssh-it-user
WORKDIR /go/src

RUN apk add --no-cache git

# Replace `RUN git...` with `COPY` and a local checked out version of Tailscale in `./tailscale`
# to test specific commits of the Tailscale client. This is useful when trying to find out why
# something specific broke between two versions of Tailscale with for example `git bisect`.
# COPY ./tailscale .
RUN git clone https://github.com/tailscale/tailscale.git

WORKDIR /go/tailscale
WORKDIR /go/src/tailscale


# see build_docker.sh
ARG VERSION_LONG=""
ENV VERSION_LONG=$VERSION_LONG
ARG VERSION_SHORT=""
ENV VERSION_SHORT=$VERSION_SHORT
ARG VERSION_GIT_HASH=""
ENV VERSION_GIT_HASH=$VERSION_GIT_HASH
ARG TARGETARCH

RUN GOARCH=$TARGETARCH go install -ldflags="\
-X tailscale.com/version.longStamp=$VERSION_LONG \
-X tailscale.com/version.shortStamp=$VERSION_SHORT \
-X tailscale.com/version.gitCommitStamp=$VERSION_GIT_HASH" \
-v ./cmd/tailscale ./cmd/tailscaled ./cmd/containerboot

FROM alpine:3.18
RUN apk add --no-cache ca-certificates iptables iproute2 ip6tables curl

RUN git checkout main \
&& sh build_dist.sh tailscale.com/cmd/tailscale \
&& sh build_dist.sh tailscale.com/cmd/tailscaled \
&& cp tailscale /usr/local/bin/ \
&& cp tailscaled /usr/local/bin/
COPY --from=build-env /go/bin/* /usr/local/bin/
# For compat with the previous run.sh, although ideally you should be
# using build_docker.sh which sets an entrypoint for the image.
RUN mkdir /tailscale && ln -s /usr/local/bin/containerboot /tailscale/run.sh
4 changes: 2 additions & 2 deletions flake.nix
@@ -30,8 +30,8 @@
checkFlags = ["-short"];

# When updating go.mod or go.sum, a new sha will need to be calculated,
# update this if you have a mismatch after doing a change to those files.
vendorHash = "sha256-wXfKeiJaGe6ahOsONrQhvbuMN8flQ13b0ZjxdbFs1e8=";
# update this if you have a mismatch after doing a change to thos files.
vendorHash = "sha256-EorT2AVwA3usly/LcNor6r5UIhLCdj3L4O4ilgTIC2o=";

subPackages = ["cmd/headscale"];

2 changes: 2 additions & 0 deletions go.mod
@@ -29,6 +29,7 @@ require (
github.com/puzpuzpuz/xsync/v3 v3.1.0
github.com/rs/zerolog v1.32.0
github.com/samber/lo v1.39.0
github.com/sasha-s/go-deadlock v0.3.1
github.com/spf13/cobra v1.8.0
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.9.0
@@ -155,6 +156,7 @@ require (
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/opencontainers/runc v1.1.12 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
github.com/pierrec/lz4/v4 v4.1.21 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
4 changes: 4 additions & 0 deletions go.sum
@@ -367,6 +367,8 @@ github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaR
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 h1:q2e307iGHPdTGp0hoxKjt1H5pDo6utceo3dQVK3I5XQ=
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o=
github.com/philip-bui/grpc-zerolog v1.0.1 h1:EMacvLRUd2O1K0eWod27ZP5CY1iTNkhBDLSN+Q4JEvA=
github.com/philip-bui/grpc-zerolog v1.0.1/go.mod h1:qXbiq/2X4ZUMMshsqlWyTHOcw7ns+GZmlqZZN05ZHcQ=
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
@@ -423,6 +425,8 @@ github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6g
github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA=
github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA=
github.com/sasha-s/go-deadlock v0.3.1 h1:sqv7fDNShgjcaxkO0JNcOAlr8B9+cV5Ey/OB71efZx0=
github.com/sasha-s/go-deadlock v0.3.1/go.mod h1:F73l+cr82YSh10GxyRI6qZiCgK64VaZjwesgfQ1/iLM=
github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
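The new github.com/sasha-s/go-deadlock dependency matches the "add env options for debug deadlock and config" commit: the package provides drop-in replacements for sync.Mutex/RWMutex that report suspected deadlocks. A minimal sketch of how such a debug toggle can be wired up; the HEADSCALE_DEBUG_DEADLOCK flag name and the timeout value are illustrative assumptions, not taken from this diff:

package main

import (
	"time"

	"github.com/sasha-s/go-deadlock"
	"tailscale.com/envknob"
)

// deadlock.Mutex is a drop-in replacement for sync.Mutex that reports
// lock-order inversions and locks held longer than Opts.DeadlockTimeout.
var mu deadlock.Mutex

func main() {
	// The detector adds overhead, so keep it off unless explicitly enabled.
	// (Flag name is an assumption for this sketch.)
	if !envknob.Bool("HEADSCALE_DEBUG_DEADLOCK") {
		deadlock.Opts.Disable = true
	}
	// How long a lock may be held before it is reported as suspicious.
	deadlock.Opts.DeadlockTimeout = 15 * time.Second

	mu.Lock()
	// ... guarded work would go here ...
	mu.Unlock()
}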
45 changes: 12 additions & 33 deletions hscontrol/app.go
@@ -19,6 +19,7 @@ import (
"time"

"github.com/coreos/go-oidc/v3/oidc"
"github.com/davecgh/go-spew/spew"
"github.com/gorilla/mux"
grpcMiddleware "github.com/grpc-ecosystem/go-grpc-middleware"
grpcRuntime "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
@@ -104,16 +105,15 @@ type Headscale struct {
registrationCache *cache.Cache

pollNetMapStreamWG sync.WaitGroup

mapSessions map[types.NodeID]*mapSession
mapSessionMu sync.Mutex
}

var (
profilingEnabled = envknob.Bool("HEADSCALE_PROFILING_ENABLED")
profilingEnabled = envknob.Bool("HEADSCALE_DEBUG_PROFILING_ENABLED")
profilingPath = envknob.String("HEADSCALE_DEBUG_PROFILING_PATH")
tailsqlEnabled = envknob.Bool("HEADSCALE_DEBUG_TAILSQL_ENABLED")
tailsqlStateDir = envknob.String("HEADSCALE_DEBUG_TAILSQL_STATE_DIR")
tailsqlTSKey = envknob.String("TS_AUTHKEY")
dumpConfig = envknob.Bool("HEADSCALE_DEBUG_DUMP_CONFIG")
)

func NewHeadscale(cfg *types.Config) (*Headscale, error) {
@@ -138,7 +138,6 @@ func NewHeadscale(cfg *types.Config) (*Headscale, error) {
registrationCache: registrationCache,
pollNetMapStreamWG: sync.WaitGroup{},
nodeNotifier: notifier.NewNotifier(cfg),
mapSessions: make(map[types.NodeID]*mapSession),
}

app.db, err = db.NewHeadscaleDatabase(
@@ -502,21 +501,25 @@ func (h *Headscale) createRouter(grpcMux *grpcRuntime.ServeMux) *mux.Router {

// Serve launches the HTTP and gRPC server service Headscale and the API.
func (h *Headscale) Serve() error {
if _, enableProfile := os.LookupEnv("HEADSCALE_PROFILING_ENABLED"); enableProfile {
if profilePath, ok := os.LookupEnv("HEADSCALE_PROFILING_PATH"); ok {
err := os.MkdirAll(profilePath, os.ModePerm)
if profilingEnabled {
if profilingPath != "" {
err := os.MkdirAll(profilingPath, os.ModePerm)
if err != nil {
log.Fatal().Err(err).Msg("failed to create profiling directory")
}

defer profile.Start(profile.ProfilePath(profilePath)).Stop()
defer profile.Start(profile.ProfilePath(profilingPath)).Stop()
} else {
defer profile.Start().Stop()
}
}

var err error

if dumpConfig {
spew.Dump(h.cfg)
}

// Fetch an initial DERP Map before we start serving
h.DERPMap = derp.GetDERPMap(h.cfg.DERP)
h.mapper = mapper.NewMapper(h.db, h.cfg, h.DERPMap, h.nodeNotifier)
@@ -729,19 +732,6 @@ func (h *Headscale) Serve() error {
w.WriteHeader(http.StatusOK)
w.Write([]byte(h.nodeNotifier.String()))
})
debugMux.HandleFunc("/debug/mapresp", func(w http.ResponseWriter, r *http.Request) {
h.mapSessionMu.Lock()
defer h.mapSessionMu.Unlock()

var b strings.Builder
b.WriteString("mapresponders:\n")
for k, v := range h.mapSessions {
fmt.Fprintf(&b, "\t%d: %p\n", k, v)
}

w.WriteHeader(http.StatusOK)
w.Write([]byte(b.String()))
})
debugMux.Handle("/metrics", promhttp.Handler())

debugHTTPServer := &http.Server{
@@ -822,17 +812,6 @@ func (h *Headscale) Serve() error {
expireNodeCancel()
expireEphemeralCancel()

trace("closing map sessions")
wg := sync.WaitGroup{}
for _, mapSess := range h.mapSessions {
wg.Add(1)
go func() {
mapSess.close()
wg.Done()
}()
}
wg.Wait()

trace("waiting for netmap stream to close")
h.pollNetMapStreamWG.Wait()

31 changes: 23 additions & 8 deletions hscontrol/metrics.go
@@ -7,8 +7,23 @@ import (
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"tailscale.com/envknob"
)

var debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")

var mapResponseLastSentSeconds *prometheus.GaugeVec

func init() {
if debugHighCardinalityMetrics {
mapResponseLastSentSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "mapresponse_last_sent_seconds",
Help: "last sent metric to node.id",
}, []string{"type", "id"})
}
}

const prometheusNamespace = "headscale"

var (
@@ -37,16 +52,16 @@ var (
Name: "mapresponse_readonly_requests_total",
Help: "total count of readonly requests received",
}, []string{"status"})
mapResponseSessions = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "mapresponse_current_sessions_total",
Help: "total count open map response sessions",
})
mapResponseRejected = promauto.NewCounterVec(prometheus.CounterOpts{
mapResponseEnded = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "mapresponse_rejected_new_sessions_total",
Help: "total count of new mapsessions rejected",
Name: "mapresponse_ended_total",
Help: "total count of new mapsessions ended",
}, []string{"reason"})
mapResponseClosed = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "mapresponse_closed_total",
Help: "total count of calls to mapresponse close",
}, []string{"return"})
httpDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: prometheusNamespace,
Name: "http_duration_seconds",
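The mapresponse_last_sent_seconds gauge above is only registered when HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS is set, because its per-node "id" label creates one time series per node. A small, self-contained sketch of that pattern; the recordLastSent helper and the nil-guard convention are illustrative assumptions, not code from this diff:

package main

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"tailscale.com/envknob"
)

var debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")

// Registered only when the debug flag is set, so the default /metrics
// output does not carry one series per node ID.
var mapResponseLastSentSeconds *prometheus.GaugeVec

func init() {
	if debugHighCardinalityMetrics {
		mapResponseLastSentSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "headscale",
			Name:      "mapresponse_last_sent_seconds",
			Help:      "last sent metric to node.id",
		}, []string{"type", "id"})
	}
}

// recordLastSent is an illustrative helper: the gauge may be nil when the
// debug flag is unset, so every call site has to check before writing to it.
func recordLastSent(updateType string, nodeID uint64) {
	if mapResponseLastSentSeconds != nil {
		mapResponseLastSentSeconds.
			WithLabelValues(updateType, strconv.FormatUint(nodeID, 10)).
			Set(float64(time.Now().Unix()))
	}
}

func main() {
	recordLastSent("full", 1)
}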
60 changes: 5 additions & 55 deletions hscontrol/noise.go
@@ -231,62 +231,12 @@ func (ns *noiseServer) NoisePollNetMapHandler(

return
}
sess := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node)

sess := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node)
sess.tracef("a node sending a MapRequest with Noise protocol")

// If a streaming mapSession exists for this node, close it
// and start a new one.
if sess.isStreaming() {
sess.tracef("aquiring lock to check stream")

ns.headscale.mapSessionMu.Lock()
if _, ok := ns.headscale.mapSessions[node.ID]; ok {
// NOTE/TODO(kradalby): From how I understand the protocol, when
// a client connects with stream=true, and already has a streaming
// connection open, the correct way is to close the current channel
// and replace it. However, I cannot manage to get that working with
// some sort of lock/block happening on the cancelCh in the streaming
// session.
// Not closing the channel and replacing it puts us in a weird state
// which keeps a ghost stream open, receiving keep alives, but no updates.
//
// Typically a new connection is opened when one exists as a client which
// is already authenticated reconnects (e.g. down, then up). The client will
// start auth and streaming at the same time, and then cancel the streaming
// when the auth has finished successfully, opening a new connection.
//
// As a work-around to not replacing, abusing the clients "resilience"
// by reject the new connection which will cause the client to immediately
// reconnect and "fix" the issue, as the other connection typically has been
// closed, meaning there is nothing to replace.
//
// sess.infof("node has an open stream(%p), replacing with %p", oldSession, sess)
// oldSession.close()

defer ns.headscale.mapSessionMu.Unlock()

sess.infof("node has an open stream(%p), rejecting new stream", sess)
mapResponseRejected.WithLabelValues("exists").Inc()
return
}

ns.headscale.mapSessions[node.ID] = sess
mapResponseSessions.Inc()
ns.headscale.mapSessionMu.Unlock()
sess.tracef("releasing lock to check stream")
}

sess.serve()

if sess.isStreaming() {
sess.tracef("aquiring lock to remove stream")
ns.headscale.mapSessionMu.Lock()
defer ns.headscale.mapSessionMu.Unlock()

delete(ns.headscale.mapSessions, node.ID)
mapResponseSessions.Dec()

sess.tracef("releasing lock to remove stream")
if !sess.isStreaming() {
sess.serve()
} else {
sess.serveLongPoll()
}
}
46 changes: 41 additions & 5 deletions hscontrol/notifier/metrics.go
@@ -3,22 +3,43 @@ package notifier
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"tailscale.com/envknob"
)

const prometheusNamespace = "headscale"

var debugHighCardinalityMetrics = envknob.Bool("HEADSCALE_DEBUG_HIGH_CARDINALITY_METRICS")

var notifierUpdateSent *prometheus.CounterVec

func init() {
if debugHighCardinalityMetrics {
notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "notifier_update_sent_total",
Help: "total count of update sent on nodes channel",
}, []string{"status", "type", "trigger", "id"})
} else {
notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "notifier_update_sent_total",
Help: "total count of update sent on nodes channel",
}, []string{"status", "type", "trigger"})
}
}

var (
notifierWaitersForLock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "notifier_waiters_for_lock",
Help: "gauge of waiters for the notifier lock",
}, []string{"type", "action"})
notifierWaitForLock = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: prometheusNamespace,
Name: "notifier_wait_for_lock_seconds",
Help: "histogram of time spent waiting for the notifier lock",
Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.5, 1, 3, 5, 10},
}, []string{"action"})
notifierUpdateSent = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "notifier_update_sent_total",
Help: "total count of update sent on nodes channel",
}, []string{"status", "type", "trigger"})
notifierUpdateReceived = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: prometheusNamespace,
Name: "notifier_update_received_total",
@@ -29,4 +29,19 @@ var (
Name: "notifier_open_channels_total",
Help: "total count open channels in notifier",
})
notifierBatcherWaitersForLock = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "notifier_batcher_waiters_for_lock",
Help: "gauge of waiters for the notifier batcher lock",
}, []string{"type", "action"})
notifierBatcherChanges = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "notifier_batcher_changes_pending",
Help: "gauge of full changes pending in the notifier batcher",
}, []string{})
notifierBatcherPatches = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: prometheusNamespace,
Name: "notifier_batcher_patches_pending",
Help: "gauge of patches pending in the notifier batcher",
}, []string{})
)