diff --git a/Dockerfile b/Dockerfile index 032e960..0970c7d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.22.1-bullseye as builder +FROM golang:1.22.4-bullseye AS builder WORKDIR /app COPY . ./ RUN apt update diff --git a/Dockerfile.default b/Dockerfile.default index 6e22ed9..3e53d97 100644 --- a/Dockerfile.default +++ b/Dockerfile.default @@ -1,4 +1,4 @@ -FROM golang:1.22.1-bullseye as builder +FROM golang:1.22.4-bullseye AS builder WORKDIR /app COPY . ./ RUN apt update @@ -8,7 +8,7 @@ ENV GOCACHE=/root/.cache/go-build RUN go mod tidy -v RUN --mount=type=cache,target="/root/.cache/go-build" GOOS=linux go build -ldflags="-X 'github.com/ddosify/alaz/datastore.tag=$VERSION'" -o alaz -FROM debian:12.5-slim +FROM debian:12.6-slim RUN apt-get update && apt-get install -y procps ca-certificates && rm -rf /var/lib/apt/lists/* COPY --chown=0:0 --from=builder /app/alaz ./bin/ diff --git a/aggregator/cluster.go b/aggregator/cluster.go new file mode 100644 index 0000000..beeb38c --- /dev/null +++ b/aggregator/cluster.go @@ -0,0 +1,110 @@ +package aggregator + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + + "github.com/ddosify/alaz/log" + "k8s.io/apimachinery/pkg/types" +) + +type ClusterInfo struct { + k8smu sync.RWMutex + PodIPToPodUid map[string]types.UID `json:"podIPToPodUid"` + ServiceIPToServiceUid map[string]types.UID `json:"serviceIPToServiceUid"` + + // Pid -> SocketMap + // pid -> fd -> {saddr, sport, daddr, dport} + SocketMaps []*SocketMap // index symbolizes pid + socketMapsmu sync.Mutex + + // Below mutexes guard socketMaps, set to mu inside SocketMap struct + // Used to find the correct mutex for the process, some pids can share the same mutex + muIndex atomic.Uint64 + muArray []*sync.RWMutex + + signalChan chan uint32 // pids are signaled on this channel to notify clusterInfo struct to initialize a SocketMap +} + +func newClusterInfo(liveProcCount int) *ClusterInfo { + ci := &ClusterInfo{ + PodIPToPodUid: map[string]types.UID{}, + ServiceIPToServiceUid: map[string]types.UID{}, + } + ci.signalChan = make(chan uint32) + sockMaps := make([]*SocketMap, maxPid+1) // index=pid + ci.SocketMaps = sockMaps + ci.muIndex = atomic.Uint64{} + + // initialize mutex array + + // normally, mutex per pid is straightforward solution + // on regular systems, maxPid is around 32768 + // so, we allocate 32768 mutexes, which is 32768 * 24 bytes = 786KB + // but on 64-bit systems, maxPid can be 4194304 + // and we don't want to allocate 4194304 mutexes, it adds up to 4194304 * 24 bytes = 100MB + // So, some process will have to share the mutex + + // assume liveprocesses can increase up to 100 times of current count + // if processes exceeds the count of mutex, they will share the mutex + countMuArray := liveProcCount * 100 + if countMuArray > maxPid { + countMuArray = maxPid + } + // for 2k processes, 200k mutex => 200k * 24 bytes = 4.80MB + // in case of maxPid is 32678, 32678 * 24 bytes = 784KB, pick the smaller one + ci.muArray = make([]*sync.RWMutex, countMuArray) + go ci.handleSocketMapCreation() + return ci +} + +func (ci *ClusterInfo) SignalSocketMapCreation(pid uint32) { + ci.signalChan <- pid +} + +// events will be processed sequentially here in one goroutine. +// in order to prevent race. 
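+// Note on the shared-mutex scheme used below: muIndex is incremented once per created SocketMap
+// and the mutex slot is picked as muIndex % len(muArray), so once more SocketMaps have been
+// created than there are slots in muArray, several pids end up sharing the same *sync.RWMutex
+// (see the sizing comments in newClusterInfo above).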
+func (ci *ClusterInfo) handleSocketMapCreation() { + for pid := range ci.signalChan { + if ci.SocketMaps[pid] != nil { + continue + } + + ctxPid := context.WithValue(context.Background(), log.LOG_CONTEXT, fmt.Sprint(pid)) + + sockMap := &SocketMap{ + mu: nil, // set below + pid: pid, + M: map[uint64]*SocketLine{}, + waitingFds: make(chan uint64, 1000), + processedFds: map[uint64]struct{}{}, + processedFdsmu: sync.RWMutex{}, + closeCh: make(chan struct{}, 1), + ctx: ctxPid, + } + + ci.muIndex.Add(1) + i := (ci.muIndex.Load()) % uint64(len(ci.muArray)) + ci.muArray[i] = &sync.RWMutex{} + sockMap.mu = ci.muArray[i] + ci.SocketMaps[pid] = sockMap + go sockMap.ProcessSocketLineCreationRequests() + } +} + +func (ci *ClusterInfo) clearProc(pid uint32) { + sm := ci.SocketMaps[pid] + if sm == nil { + return + } + + // stop waiting for socketline creation requests + sm.mu.Lock() + sm.closeCh <- struct{}{} + sm.mu.Unlock() + + // reset + ci.SocketMaps[pid] = nil +} diff --git a/aggregator/data.go b/aggregator/data.go index 2406a21..e85aa09 100644 --- a/aggregator/data.go +++ b/aggregator/data.go @@ -11,24 +11,25 @@ import ( "bytes" "context" "encoding/binary" + "encoding/json" "fmt" "io" "net" + "net/http" "os" - "os/exec" - "path" "regexp" "runtime" "strconv" "strings" "sync" - "sync/atomic" "syscall" "golang.org/x/time/rate" "time" + "github.com/ddosify/alaz/aggregator/kafka" + "github.com/ddosify/alaz/cri" "github.com/ddosify/alaz/datastore" "github.com/ddosify/alaz/ebpf" "github.com/ddosify/alaz/ebpf/l7_req" @@ -44,14 +45,26 @@ import ( "k8s.io/apimachinery/pkg/types" ) +const ( + POD = "pod" + SVC = "service" + OUTBOUND = "outbound" +) + +const ( + KAFKA = "kafka" // LOG_CONTEXT_KEY should match + REDIS = "redis" +) + type Aggregator struct { ctx context.Context + ct *cri.CRITool // listen to events from different sources - k8sChan <-chan interface{} - ebpfChan <-chan interface{} - ebpfProcChan <-chan interface{} - ebpfTcpChan <-chan interface{} + k8sChan chan interface{} + ebpfChan chan interface{} + ebpfProcChan chan interface{} + ebpfTcpChan chan interface{} tlsAttachSignalChan chan uint32 // store the service map @@ -65,36 +78,19 @@ type Aggregator struct { h2Ch chan *l7_req.L7Event h2Frames map[string]*FrameArrival // pid-fd-streamId -> frame + h2ParserMu sync.RWMutex + h2Parsers map[string]*http2Parser // pid-fd -> http2Parser + // postgres prepared stmt pgStmtsMu sync.RWMutex pgStmts map[string]string // pid-fd-stmtname -> query - h2ParserMu sync.RWMutex - h2Parsers map[string]*http2Parser // pid-fd -> http2Parser - liveProcessesMu sync.RWMutex liveProcesses map[uint32]struct{} // pid -> struct{} // Used to rate limit and drop trace events based on pid rateLimiters map[uint32]*rate.Limiter // pid -> rateLimiter rateLimitMu sync.RWMutex - - // Used to find the correct mutex for the pid, some pids can share the same mutex - muIndex atomic.Uint64 - muArray []*sync.RWMutex -} - -// We need to keep track of the following -// in order to build find relationships between -// connections and pods/services - -type SockInfo struct { - Pid uint32 `json:"pid"` - Fd uint64 `json:"fd"` - Saddr string `json:"saddr"` - Sport uint16 `json:"sport"` - Daddr string `json:"daddr"` - Dport uint16 `json:"dport"` } type http2Parser struct { @@ -109,33 +105,6 @@ type http2Parser struct { serverHpackDecoder *hpack.Decoder } -// type SocketMap -type SocketMap struct { - mu *sync.RWMutex - M map[uint64]*SocketLine `json:"fdToSockLine"` // fd -> SockLine -} - -type ClusterInfo 
struct { - k8smu sync.RWMutex - PodIPToPodUid map[string]types.UID `json:"podIPToPodUid"` - ServiceIPToServiceUid map[string]types.UID `json:"serviceIPToServiceUid"` - - // Pid -> SocketMap - // pid -> fd -> {saddr, sport, daddr, dport} - SocketMaps []*SocketMap // index symbolizes pid -} - -// If we have information from the container runtimes -// we would have pid's of the containers within the pod -// and we can use that to find the podUid directly - -// If we don't have the pid's of the containers -// we can use the following to find the podUid -// {saddr+sport} -> search in podIPToPodUid -> podUid -// {daddr+dport} -> search in serviceIPToServiceUid -> serviceUid -// or -// {daddr+dport} -> search in podIPToPodUid -> podUid - var ( // default exponential backoff (*2) // when attemptLimit is increased, we are blocking the events that we wait it to be processed more @@ -149,6 +118,7 @@ var ( var reverseDnsCache *cache.Cache var re *regexp.Regexp +var maxPid int func init() { reverseDnsCache = cache.New(defaultExpiration, purgeTime) @@ -157,27 +127,31 @@ func init() { // Case-insensitive matching re = regexp.MustCompile(strings.Join(keywords, "|")) + + var err error + maxPid, err = getPidMax() + if err != nil { + log.Logger.Fatal().Err(err).Msg("error getting max pid") + } } -func NewAggregator(parentCtx context.Context, k8sChan <-chan interface{}, +func NewAggregator(parentCtx context.Context, ct *cri.CRITool, k8sChan chan interface{}, events chan interface{}, procEvents chan interface{}, tcpEvents chan interface{}, tlsAttachSignalChan chan uint32, ds datastore.DataStore) *Aggregator { + ctx, _ := context.WithCancel(parentCtx) - clusterInfo := &ClusterInfo{ - PodIPToPodUid: map[string]types.UID{}, - ServiceIPToServiceUid: map[string]types.UID{}, - } a := &Aggregator{ - ctx: ctx, - k8sChan: k8sChan, - ebpfChan: events, - ebpfProcChan: procEvents, - ebpfTcpChan: tcpEvents, - clusterInfo: clusterInfo, + ctx: ctx, + ct: ct, + k8sChan: k8sChan, + ebpfChan: events, + ebpfProcChan: procEvents, + ebpfTcpChan: tcpEvents, + // clusterInfo: clusterInfo, ds: ds, tlsAttachSignalChan: tlsAttachSignalChan, h2Ch: make(chan *l7_req.L7Event, 1000000), @@ -186,84 +160,34 @@ func NewAggregator(parentCtx context.Context, k8sChan <-chan interface{}, liveProcesses: make(map[uint32]struct{}), rateLimiters: make(map[uint32]*rate.Limiter), pgStmts: make(map[string]string), - muIndex: atomic.Uint64{}, - muArray: nil, } - maxPid, err := getPidMax() + var err error + a.liveProcesses, err = ct.GetPidsRunningOnContainers() if err != nil { - log.Logger.Fatal().Err(err).Msg("error getting max pid") + log.Logger.Fatal().Err(err).Msg("could not get running containers") } - sockMaps := make([]*SocketMap, maxPid+1) // index=pid - // initialize sockMaps - for i := range sockMaps { - sockMaps[i] = &SocketMap{ - M: nil, // initialized on demand later - mu: nil, - } - } - clusterInfo.SocketMaps = sockMaps - - a.getLiveProcesses() a.liveProcessesMu.RLock() - countLiveProcesses := len(a.liveProcesses) + liveProcCount := len(a.liveProcesses) a.liveProcessesMu.RUnlock() - // normally, mutex per pid is straightforward solution - // on regular systems, maxPid is around 32768 - // so, we allocate 32768 mutexes, which is 32768 * 24 bytes = 786KB - // but on 64-bit systems, maxPid can be 4194304 - // and we don't want to allocate 4194304 mutexes, it adds up to 4194304 * 24 bytes = 100MB - // So, some process will have to share the mutex - - // assume liveprocesses can increase up to 100 times of current count - // if processes exceeds 
the count of mutex, they will share the mutex - countMuArray := countLiveProcesses * 100 - if countMuArray > maxPid { - countMuArray = maxPid - } - // for 2k processes, 200k mutex => 200k * 24 bytes = 4.80MB - // in case of maxPid is 32678, 32678 * 24 bytes = 784KB, pick the smaller one - a.muArray = make([]*sync.RWMutex, countMuArray) - - // set distinct mutex for every live process - for pid := range a.liveProcesses { - a.muIndex.Add(1) - a.muArray[a.muIndex.Load()] = &sync.RWMutex{} - sockMaps[pid].mu = a.muArray[a.muIndex.Load()] - a.getAlreadyExistingSockets(pid) - } + a.clusterInfo = newClusterInfo(liveProcCount) go a.clearSocketLines(ctx) - go a.updateSocketMap(ctx) - return a -} -func (a *Aggregator) getLiveProcesses() { - // get all alive processes, populate liveProcesses - cmd := exec.Command("ps", "-e", "-o", "pid=") - output, err := cmd.Output() - if err != nil { - log.Logger.Fatal().Err(err).Msg("error getting all alive processes") - } + go func() { + t := time.NewTicker(2 * time.Minute) - lines := strings.Split(string(output), "\n") - for _, line := range lines { - line = strings.TrimSpace(line) - if line != "" { - fields := strings.Fields(line) - if len(fields) > 0 { - pid := fields[0] - pidInt, err := strconv.Atoi(pid) - if err != nil { - log.Logger.Error().Err(err).Msgf("error converting pid to int %s", pid) - continue - } - a.liveProcesses[uint32(pidInt)] = struct{}{} - } + for range t.C { + log.Logger.Debug(). + Int("ebpfChan-lag", len(a.ebpfChan)). + Int("ebpfTcpChan-lag", len(a.ebpfTcpChan)). + Msg("lag of channels") } - } + }() + + return a } func (a *Aggregator) Run() { @@ -287,33 +211,8 @@ func (a *Aggregator) Run() { err := syscall.Kill(int(pid), 0) if err != nil { - // pid does not exist delete(a.liveProcesses, pid) - a.removeFromClusterInfo(pid) - - a.h2ParserMu.Lock() - for key, parser := range a.h2Parsers { - // h2Parsers map[string]*http2Parser // pid-fd -> http2Parser - if strings.HasPrefix(key, fmt.Sprint(pid)) { - parser.clientHpackDecoder.Close() - parser.serverHpackDecoder.Close() - - delete(a.h2Parsers, key) - } - } - a.h2ParserMu.Unlock() - - a.rateLimitMu.Lock() - delete(a.rateLimiters, pid) - a.rateLimitMu.Unlock() - - a.pgStmtsMu.Lock() - for key, _ := range a.pgStmts { - if strings.HasPrefix(key, fmt.Sprint(pid)) { - delete(a.pgStmts, key) - } - } - a.pgStmtsMu.Unlock() + a.processExit(pid) } } @@ -322,19 +221,18 @@ func (a *Aggregator) Run() { }() go a.processk8s() - // TODO: determine the number of workers with benchmarking cpuCount := runtime.NumCPU() - numWorker := 5 * cpuCount - if numWorker < 50 { - numWorker = 50 // min number - } + numWorker := cpuCount for i := 0; i < numWorker; i++ { - go a.processEbpf(a.ctx) go a.processEbpfTcp(a.ctx) go a.processEbpfProc(a.ctx) } + for i := 0; i < 4*cpuCount; i++ { + go a.processEbpf(a.ctx) + } + for i := 0; i < 2*cpuCount; i++ { go a.processHttp2Frames() } @@ -404,7 +302,8 @@ func (a *Aggregator) processEbpfTcp(ctx context.Context) { switch bpfEvent.Type() { case tcp_state.TCP_CONNECT_EVENT: d := data.(*tcp_state.TcpConnectEvent) // copy data's value - a.processTcpConnect(d) + ctxPid := context.WithValue(a.ctx, log.LOG_CONTEXT, fmt.Sprint(d.Pid)) + a.processTcpConnect(ctxPid, d) } } } @@ -424,7 +323,8 @@ func (a *Aggregator) processEbpf(ctx context.Context) { switch bpfEvent.Type() { case l7_req.L7_EVENT: d := data.(*l7_req.L7Event) // copy data's value - a.processL7(ctx, d) + ctxPid := context.WithValue(a.ctx, log.LOG_CONTEXT, fmt.Sprint(d.Pid)) + a.processL7(ctxPid, d) case l7_req.TRACE_EVENT: d := 
data.(*l7_req.TraceEvent) rateLimiter := a.getRateLimiterForPid(d.Pid) @@ -454,27 +354,14 @@ func (a *Aggregator) getRateLimiterForPid(pid uint32) *rate.Limiter { func (a *Aggregator) processExec(d *proc.ProcEvent) { a.liveProcessesMu.Lock() - defer a.liveProcessesMu.Unlock() - a.liveProcesses[d.Pid] = struct{}{} + a.liveProcessesMu.Unlock() - // if duplicate exec event comes, underlying mutex will be changed - // if first assigned mutex is locked and another exec event comes, mutex will be changed - // and unlock of unlocked mutex now is a possibility - // to avoid this case, if a socket map already has a mutex, don't change it - if a.clusterInfo.SocketMaps[d.Pid].mu == nil { - // create lock on demand - a.muIndex.Add(1) - a.muArray[(a.muIndex.Load())%uint64(len(a.muArray))] = &sync.RWMutex{} - a.clusterInfo.SocketMaps[d.Pid].mu = a.muArray[(a.muIndex.Load())%uint64(len(a.muArray))] - } + a.clusterInfo.SignalSocketMapCreation(d.Pid) } func (a *Aggregator) processExit(pid uint32) { - a.liveProcessesMu.Lock() - delete(a.liveProcesses, pid) - a.removeFromClusterInfo(pid) - a.liveProcessesMu.Unlock() + a.clusterInfo.clearProc(pid) a.h2ParserMu.Lock() pid_s := fmt.Sprint(pid) @@ -506,7 +393,7 @@ func (a *Aggregator) signalTlsAttachment(pid uint32) { a.tlsAttachSignalChan <- pid } -func (a *Aggregator) processTcpConnect(d *tcp_state.TcpConnectEvent) { +func (a *Aggregator) processTcpConnect(ctx context.Context, d *tcp_state.TcpConnectEvent) { go a.signalTlsAttachment(d.Pid) if d.Type_ == tcp_state.EVENT_TCP_ESTABLISHED { @@ -519,21 +406,26 @@ func (a *Aggregator) processTcpConnect(d *tcp_state.TcpConnectEvent) { var ok bool sockMap = a.clusterInfo.SocketMaps[d.Pid] - var skLine *SocketLine + if sockMap == nil { + // signal socket map creation and requeue event + log.Logger.Warn().Ctx(ctx). + Uint32("pid", d.Pid).Str("func", "processTcpConnect").Str("event", "ESTABLISHED").Msg("socket map not initialized") - if sockMap.mu == nil { + go a.clusterInfo.SignalSocketMapCreation(d.Pid) + a.ebpfTcpChan <- d return } - sockMap.mu.Lock() // lock for reading - if sockMap.M == nil { - sockMap.M = make(map[uint64]*SocketLine) - } + var skLine *SocketLine + sockMap.mu.RLock() skLine, ok = sockMap.M[d.Fd] + sockMap.mu.RUnlock() if !ok { - skLine = NewSocketLine(d.Pid, d.Fd) - sockMap.M[d.Fd] = skLine + go sockMap.SignalSocketLine(ctx, d.Fd) // signal for creation + // requeue connect event + a.ebpfTcpChan <- d + return } skLine.AddValue( @@ -547,9 +439,6 @@ func (a *Aggregator) processTcpConnect(d *tcp_state.TcpConnectEvent) { Dport: d.DPort, }, ) - - sockMap.mu.Unlock() // unlock for writing - } else if d.Type_ == tcp_state.EVENT_TCP_CLOSED { var sockMap *SocketMap var ok bool @@ -560,23 +449,21 @@ func (a *Aggregator) processTcpConnect(d *tcp_state.TcpConnectEvent) { } sockMap = a.clusterInfo.SocketMaps[d.Pid] + if sockMap == nil { + // signal socket map creation and requeue event + log.Logger.Warn().Ctx(ctx). 
+ Uint32("pid", d.Pid).Str("func", "processTcpConnect").Str("event", "ESTABLISHED").Msg("socket map not initialized") - var skLine *SocketLine - - if sockMap.mu == nil { + go a.clusterInfo.SignalSocketMapCreation(d.Pid) + a.ebpfTcpChan <- d return } - sockMap.mu.Lock() // lock for reading - if sockMap.M == nil { - sockMap.M = make(map[uint64]*SocketLine) - } + var skLine *SocketLine skLine, ok = sockMap.M[d.Fd] if !ok { - sockMap.mu.Unlock() // unlock for reading return } - sockMap.mu.Unlock() // unlock for reading // If connection is established before, add the close event skLine.AddValue( @@ -684,8 +571,8 @@ func (a *Aggregator) processHttp2Frames() { return } - skInfo := a.findRelatedSocket(a.ctx, d) - if skInfo == nil { + skInfo, err := a.findRelatedSocket(a.ctx, d) + if skInfo == nil || err != nil { return } @@ -709,7 +596,7 @@ func (a *Aggregator) processHttp2Frames() { } // toUID is set to :authority header in client frame - err := a.setFromTo(skInfo, d, req, req.ToUID) + err = a.setFromTo(skInfo, d, req, req.ToUID) if err != nil { return } @@ -929,43 +816,43 @@ func (a *Aggregator) getSvcWithIP(addr string) (types.UID, bool) { return svcUid, ok } -func (a *Aggregator) setFromTo(skInfo *SockInfo, d *l7_req.L7Event, reqDto *datastore.Request, hostHeader string) error { +func (a *Aggregator) setFromTo(skInfo *SockInfo, d *l7_req.L7Event, event datastore.DirectionalEvent, hostHeader string) error { // find pod info podUid, ok := a.getPodWithIP(skInfo.Saddr) if !ok { return fmt.Errorf("error finding pod with sockets saddr") } - reqDto.FromUID = string(podUid) - reqDto.FromType = "pod" - reqDto.FromPort = skInfo.Sport - reqDto.ToPort = skInfo.Dport + event.SetFromUID(string(podUid)) + event.SetFromType(POD) + event.SetFromPort(skInfo.Sport) + event.SetToPort(skInfo.Dport) // find service info svcUid, ok := a.getSvcWithIP(skInfo.Daddr) if ok { - reqDto.ToUID = string(svcUid) - reqDto.ToType = "service" + event.SetToUID(string(svcUid)) + event.SetToType(SVC) } else { podUid, ok := a.getPodWithIP(skInfo.Daddr) if ok { - reqDto.ToUID = string(podUid) - reqDto.ToType = "pod" + event.SetToUID(string(podUid)) + event.SetToType(POD) } else { // 3rd party url if hostHeader != "" { - reqDto.ToUID = hostHeader - reqDto.ToType = "outbound" + event.SetToUID(hostHeader) + event.SetToType(OUTBOUND) } else { remoteDnsHost, err := getHostnameFromIP(skInfo.Daddr) if err == nil { // dns lookup successful - reqDto.ToUID = remoteDnsHost - reqDto.ToType = "outbound" + event.SetToUID(remoteDnsHost) + event.SetToType(OUTBOUND) } else { - reqDto.ToUID = skInfo.Daddr - reqDto.ToType = "outbound" + event.SetToUID(skInfo.Daddr) + event.SetToType(OUTBOUND) } } } @@ -978,54 +865,199 @@ func (a *Aggregator) getConnKey(pid uint32, fd uint64) string { return fmt.Sprintf("%d-%d", pid, fd) } -func (a *Aggregator) processL7(ctx context.Context, d *l7_req.L7Event) { - // other protocols events come as whole, but http2 events come as frames - // we need to aggregate frames to get the whole request - defer func() { - if r := recover(); r != nil { - // TODO: we need to fix this properly - log.Logger.Debug().Msgf("probably a http2 frame sent on a closed chan: %v", r) +type KafkaMessage struct { + TopicName string + Partition int32 + Key string + Value string + Type string // PUBLISH or CONSUME +} + +func (a *Aggregator) decodeKafkaPayload(d *l7_req.L7Event) ([]*KafkaMessage, error) { + // apiVersion is written in request header + // response header only has correlation_id + // so while returning a response message from kafka, 
we need to send the api version to userspace + // in order to parse the response message. + // d.KafkaApiVersion is set in kafka request event + + // r := bytes.NewReader(d.Payload[:d.PayloadSize]) + + // var apiVersion int16 // only in request + // var clientID string // only in request + // var correlationID int32 // both in request and response + // var message protocol.Message + // var err error + + result := make([]*KafkaMessage, 0) + + if d.Method == l7_req.KAFKA_PRODUCE_REQUEST { + saramaReq, _, err := kafka.DecodeRequest(bytes.NewReader(d.Payload[:d.PayloadSize])) + if err != nil { + // non-kafka messages sometimes classifed as kafka messages on kernel side + return nil, fmt.Errorf("kafka decode request failure: %w", err) + } else { + rs := saramaReq.Body.(*kafka.ProduceRequest).Records + for topicName, r := range rs { + for partition, record := range r { + records := record.RecordBatch.Records + for _, msg := range records { + result = append(result, &KafkaMessage{ + TopicName: topicName, + Partition: partition, + Key: string(msg.Key), + Value: string(msg.Value), + Type: "PUBLISH", + }) + } + } + } + } + } else if d.Method == l7_req.KAFKA_FETCH_RESPONSE { + payload := d.Payload[:d.PayloadSize] + // decode response header first + decodedHeader := &kafka.ResponseHeader{} + off, err := kafka.VersionedDecode(payload, decodedHeader, kafka.ResponseHeaderVersion(1, d.KafkaApiVersion)) + if err != nil { + return nil, fmt.Errorf("kafka decode response header failure: %w", err) } - }() - if d.Protocol == l7_req.L7_PROTOCOL_HTTP2 { - var ok bool + // skip header + payload = payload[off:] + fetchApiVersion := d.KafkaApiVersion - a.liveProcessesMu.RLock() - _, ok = a.liveProcesses[d.Pid] - a.liveProcessesMu.RUnlock() - if !ok { - return // if a late event comes, do not create parsers and new worker to avoid memory leak + res := &kafka.FetchResponse{} + _, err = kafka.VersionedDecode(payload, res, fetchApiVersion) + if err != nil { + return nil, fmt.Errorf("kafka decode fetch response failure: %w", err) + } else { + for topic, mapfrb := range res.Blocks { + for partition, frb := range mapfrb { + log.Logger.Warn().Int32("partition", partition).Msg("sarama kafka fetch data- partition") + recordSet := frb.RecordsSet + for _, record := range recordSet { + // record.MsgSet --> legacy records + // record.RecordBatch --> default records + for _, r := range record.RecordBatch.Records { + result = append(result, &KafkaMessage{ + TopicName: topic, + Partition: partition, + Key: string(r.Key), + Value: string(r.Value), + Type: "CONSUME", + }) + } + } + } + } + } + + } + + return result, nil +} + +func (a *Aggregator) processHttp2Event(d *l7_req.L7Event) { + // http2 events come as frames + // we need to aggregate frames to get the whole request + var ok bool + + a.liveProcessesMu.RLock() + _, ok = a.liveProcesses[d.Pid] + a.liveProcessesMu.RUnlock() + if !ok { + return // if a late event comes, do not create parsers and new worker to avoid memory leak + } + + a.h2Ch <- d + return +} + +func (a *Aggregator) processKafkaEvent(ctx context.Context, d *l7_req.L7Event) { + kafkaMessages, err := a.decodeKafkaPayload(d) + if err != nil || len(kafkaMessages) == 0 { + return + } + + skInfo, err := a.findRelatedSocket(ctx, d) + if skInfo == nil || err != nil { + // requeue event if this is its first time + if !d.PutBack { + d.PutBack = true + a.ebpfChan <- d + return } - a.h2Ch <- d + log.Logger.Debug(). + Ctx(ctx). + Err(err). + Uint32("pid", d.Pid). + Uint64("fd", d.Fd). + Uint64("writeTime", d.WriteTimeNs). 
+ Str("protocol", d.Protocol). + Any("payload", string(d.Payload[:d.PayloadSize])). + Msg("discarding kafka event, socket not found") + return } + for _, msg := range kafkaMessages { + event := &datastore.KafkaEvent{ + StartTime: int64(convertKernelTimeToUserspaceTime(d.WriteTimeNs) / 1e6), + Latency: d.Duration, + FromIP: skInfo.Saddr, + FromType: "", + FromUID: "", + FromPort: 0, + ToIP: skInfo.Daddr, + ToType: "", + ToUID: "", + ToPort: 0, + Tls: d.Tls, + Topic: msg.TopicName, + Partition: uint32(msg.Partition), + Key: msg.Key, + Value: msg.Value, + Type: msg.Type, + Tid: d.Tid, + Seq: d.Seq, + } - var path string - if d.Protocol == l7_req.L7_PROTOCOL_POSTGRES { - // parse sql command from payload - // path = sql command - // method = sql message type - var err error - path, err = a.parseSqlCommand(d) + err := a.setFromTo(skInfo, d, event, "") if err != nil { - log.Logger.Error().AnErr("err", err) return } + + if event.Type == "CONSUME" { + // TODO: reverse the from and to + // do we show arrows originating from outbound services ? + } + + log.Logger.Warn().Ctx(ctx).Any("kafkaEvent", event).Msg("persist kafka event") + err = a.ds.PersistKafkaEvent(event) + if err != nil { + log.Logger.Error().Err(err).Msg("error persisting kafka event") + } } + return - skInfo := a.findRelatedSocket(ctx, d) - if skInfo == nil { - log.Logger.Debug().Uint32("pid", d.Pid). - Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs). - Str("protocol", d.Protocol).Any("payload", string(d.Payload[:d.PayloadSize])).Msg("socket not found") +} - // go check pid-fd for the socket - a.fetchSocketOnNotFound(ctx, d) +func (a *Aggregator) processAmqpEvent(ctx context.Context, d *l7_req.L7Event) { + skInfo, err := a.findRelatedSocket(ctx, d) + if skInfo == nil || err != nil { + // requeue event if this is its first time + if !d.PutBack { + d.PutBack = true + a.ebpfChan <- d + return + } + log.Logger.Debug().Uint32("pid", d.Pid).Err(err). + Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs). + Str("protocol", d.Protocol).Any("payload", string(d.Payload[:d.PayloadSize])). 
+ Msg("discarding amqp event, socket not found") + return } - reqDto := datastore.Request{ + reqDto := &datastore.Request{ StartTime: int64(convertKernelTimeToUserspaceTime(d.WriteTimeNs) / 1e6), Latency: d.Duration, FromIP: skInfo.Saddr, @@ -1036,316 +1068,305 @@ func (a *Aggregator) processL7(ctx context.Context, d *l7_req.L7Event) { StatusCode: d.Status, FailReason: "", Method: d.Method, + Path: "", Tid: d.Tid, Seq: d.Seq, } - // Since we process events concurrently - // TCP events and L7 events can be processed out of order - - var reqHostHeader string - // parse http payload, extract path, query params, headers - if d.Protocol == l7_req.L7_PROTOCOL_HTTP { - _, path, _, reqHostHeader = parseHttpPayload(string(d.Payload[0:d.PayloadSize])) - } - - if d.Protocol == l7_req.L7_PROTOCOL_REDIS { - path = string(d.Payload[0:d.PayloadSize]) - } - - err := a.setFromTo(skInfo, d, &reqDto, reqHostHeader) + err = a.setFromTo(skInfo, d, reqDto, "") if err != nil { return } - reqDto.Path = path - reqDto.Completed = !d.Failed - // In AMQP-DELIVER or REDIS-PUSHED_EVENT event, we are capturing from read syscall, // exchange sockets // In Alaz context, From is always the one that makes the write // and To is the one that makes the read - if (d.Protocol == l7_req.L7_PROTOCOL_AMQP && d.Method == l7_req.DELIVER) || - (d.Protocol == l7_req.L7_PROTOCOL_REDIS && d.Method == l7_req.REDIS_PUSHED_EVENT) { - reqDto.FromIP, reqDto.ToIP = reqDto.ToIP, reqDto.FromIP - reqDto.FromPort, reqDto.ToPort = reqDto.ToPort, reqDto.FromPort - reqDto.FromUID, reqDto.ToUID = reqDto.ToUID, reqDto.FromUID - reqDto.FromType, reqDto.ToType = reqDto.ToType, reqDto.FromType - } - - if d.Protocol == l7_req.L7_PROTOCOL_HTTP && d.Tls { - reqDto.Protocol = "HTTPS" + if d.Method == l7_req.DELIVER { + reqDto.ReverseDirection() } - err = a.ds.PersistRequest(&reqDto) + err = a.ds.PersistRequest(reqDto) if err != nil { log.Logger.Error().Err(err).Msg("error persisting request") } - } -// reverse dns lookup -func getHostnameFromIP(ipAddr string) (string, error) { - // return from cache, if exists - // consumes too much memory otherwise - if host, ok := reverseDnsCache.Get(ipAddr); ok { - return host.(string), nil - } else { - addrs, err := net.LookupAddr(ipAddr) - if err != nil { - return "", err - } +func (a *Aggregator) processRedisEvent(ctx context.Context, d *l7_req.L7Event) { + query := string(d.Payload[0:d.PayloadSize]) - // The reverse DNS lookup can return multiple names for the same IP. - // In this example, we return the first name found. - if len(addrs) > 0 { - reverseDnsCache.Set(ipAddr, addrs[0], 0) - return addrs[0], nil + skInfo, err := a.findRelatedSocket(ctx, d) + if skInfo == nil || err != nil { + // requeue event if this is its first time + if !d.PutBack { + d.PutBack = true + a.ebpfChan <- d + return } - return "", fmt.Errorf("no hostname found for IP address: %s", ipAddr) + log.Logger.Debug(). + Ctx(ctx). + Err(err). + Uint32("pid", d.Pid). + Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs). 
+ Str("protocol", d.Protocol).Any("payload", string(d.Payload[:d.PayloadSize])).Msg("discarding redis event, socket not found") + return } -} -// get all tcp sockets for the pid -// iterate through all sockets -// create a new socket line for each socket -// add it to the socket map -func (a *Aggregator) getAlreadyExistingSockets(pid uint32) { - // no need for locking because this is called firstmost and no other goroutine is running - - socks := map[string]sock{} - sockMap := a.fetchSocketMap(pid) - - // Get the sockets for the process. - var err error - for _, f := range []string{"tcp", "tcp6"} { - sockPath := strings.Join([]string{"/proc", fmt.Sprint(pid), "net", f}, "/") - - ss, err := readSockets(sockPath) - if err != nil { - continue - } - - for _, s := range ss { - socks[s.Inode] = sock{TcpSocket: s} - } + reqDto := &datastore.Request{ + StartTime: int64(convertKernelTimeToUserspaceTime(d.WriteTimeNs) / 1e6), + Latency: d.Duration, + FromIP: skInfo.Saddr, + ToIP: skInfo.Daddr, + Protocol: d.Protocol, + Tls: d.Tls, + Completed: true, + StatusCode: d.Status, + FailReason: "", + Method: d.Method, + Path: query, + Tid: d.Tid, + Seq: d.Seq, } - // Get the file descriptors for the process. - fdDir := strings.Join([]string{"/proc", fmt.Sprint(pid), "fd"}, "/") - fdEntries, err := os.ReadDir(fdDir) + err = a.setFromTo(skInfo, d, reqDto, "") if err != nil { return } - fds := make([]Fd, 0, len(fdEntries)) - for _, entry := range fdEntries { - fd, err := strconv.ParseUint(entry.Name(), 10, 64) - if err != nil { - continue - } - dest, err := os.Readlink(path.Join(fdDir, entry.Name())) - if err != nil { - continue - } - var socketInode string - if strings.HasPrefix(dest, "socket:[") && strings.HasSuffix(dest, "]") { - socketInode = dest[len("socket:[") : len(dest)-1] - } - fds = append(fds, Fd{Fd: fd, Dest: dest, SocketInode: socketInode}) + // REDIS-PUSHED_EVENT event, we are capturing from read syscall, + // exchange sockets + // In Alaz context, From is always the one that makes the write + // and To is the one that makes the read + if d.Method == l7_req.REDIS_PUSHED_EVENT { + reqDto.ReverseDirection() } - // Match the sockets to the file descriptors. - for _, fd := range fds { - if fd.SocketInode != "" { - // add to values - s := socks[fd.SocketInode].TcpSocket - sockInfo := &SockInfo{ - Pid: pid, - Fd: fd.Fd, - Saddr: s.SAddr.IP().String(), - Sport: s.SAddr.Port(), - Daddr: s.DAddr.IP().String(), - Dport: s.DAddr.Port(), - } + err = a.ds.PersistRequest(reqDto) + if err != nil { + log.Logger.Error().Ctx(ctx). 
+ Err(err).Msg("error persisting request") + } +} - if sockInfo.Saddr == "zero IP" || sockInfo.Daddr == "zero IP" || sockInfo.Sport == 0 || sockInfo.Dport == 0 { - continue +func (a *Aggregator) AdvertiseDebugData() { + http.HandleFunc("/pid-sock-map", + func(w http.ResponseWriter, r *http.Request) { + queryParam := r.URL.Query().Get("number") + if queryParam == "" { + http.Error(w, "Missing query parameter 'number'", http.StatusBadRequest) + return } - - skLine := NewSocketLine(pid, fd.Fd) - skLine.AddValue(0, sockInfo) - - if sockMap.mu == nil { + number, err := strconv.ParseUint(queryParam, 10, 32) + if err != nil { + http.Error(w, "Invalid query parameter 'number'", http.StatusBadRequest) return } + pid := uint32(number) - sockMap.mu.Lock() - if sockMap.M == nil { - sockMap.M = make(map[uint64]*SocketLine) + sockMap := a.clusterInfo.SocketMaps[pid] + if sockMap == nil { + http.Error(w, "Pid not found", http.StatusNotFound) + return + } else { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(sockMap) + return } - sockMap.M[fd.Fd] = skLine - sockMap.mu.Unlock() - } - } - + }, + ) + + // http.HandleFunc("/process-latency", + // func(w http.ResponseWriter, r *http.Request) { + // latency := a.totalLatency.Load() + // count := a.latencyCount.Load() + // if count == 0 { + // http.Error(w, "No data available", http.StatusNotFound) + // return + // } + // avgLatency := float64(latency) / float64(count) + // w.Header().Set("Content-Type", "application/json") + // w.WriteHeader(http.StatusOK) + // _ = json.NewEncoder(w).Encode(map[string]float64{ + // "average_latency_in_ns": avgLatency, + // }) + // return + // }) } -func (a *Aggregator) fetchSkInfo(ctx context.Context, skLine *SocketLine, d *l7_req.L7Event) *SockInfo { - rc := attemptLimit - rt := retryInterval - var skInfo *SockInfo - var err error - - for { - skInfo, err = skLine.GetValue(d.WriteTimeNs) - if err == nil && skInfo != nil { - break - } - // log.Logger.Debug().Err(err).Uint32("pid", d.Pid).Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs).Msg("retry to get skInfo...") - rc-- - if rc == 0 { - break - } - time.Sleep(rt) - rt *= 2 // exponential backoff +func (a *Aggregator) processHttpEvent(ctx context.Context, d *l7_req.L7Event) { + var reqHostHeader string + var path string + // parse http payload, extract path, query params, headers + if d.Protocol == l7_req.L7_PROTOCOL_HTTP { + _, path, _, reqHostHeader = parseHttpPayload(string(d.Payload[0:d.PayloadSize])) + } - select { - case <-ctx.Done(): - log.Logger.Debug().Msg("processL7 exiting, stop retrying...") - return nil - default: - continue + skInfo, err := a.findRelatedSocket(ctx, d) + if skInfo == nil || err != nil { + // requeue event if this is its first time + if !d.PutBack { + d.PutBack = true + a.ebpfChan <- d + return } + log.Logger.Debug().Uint32("pid", d.Pid). + Err(err). + Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs). + Str("protocol", d.Protocol). + Any("payload", string(d.Payload[:d.PayloadSize])). 
+ Msg("discarding http event, socket not found") + return } - return skInfo -} + reqDto := &datastore.Request{ + StartTime: int64(convertKernelTimeToUserspaceTime(d.WriteTimeNs) / 1e6), + Latency: d.Duration, + FromIP: skInfo.Saddr, + ToIP: skInfo.Daddr, + Protocol: d.Protocol, + Tls: d.Tls, + Completed: true, + StatusCode: d.Status, + FailReason: "", + Method: d.Method, + Path: path, + Tid: d.Tid, + Seq: d.Seq, + } -func (a *Aggregator) removeFromClusterInfo(pid uint32) { - sockMap := a.clusterInfo.SocketMaps[pid] - if sockMap.mu == nil { + err = a.setFromTo(skInfo, d, reqDto, reqHostHeader) + if err != nil { return } - sockMap.mu.Lock() - sockMap.M = nil - sockMap.mu.Unlock() -} - -func (a *Aggregator) fetchSocketMap(pid uint32) *SocketMap { - sockMap := a.clusterInfo.SocketMaps[pid] - if sockMap.mu == nil { - return nil + if d.Protocol == l7_req.L7_PROTOCOL_HTTP && d.Tls { + reqDto.Protocol = "HTTPS" } - sockMap.mu.Lock() - if sockMap.M == nil { - sockMap.M = make(map[uint64]*SocketLine) + err = a.ds.PersistRequest(reqDto) + if err != nil { + log.Logger.Error().Err(err).Msg("error persisting request") } - sockMap.mu.Unlock() - return sockMap } -// This is a mitigation for the case a tcp event is missed -func (a *Aggregator) updateSocketMap(ctx context.Context) { - ticker := time.NewTicker(3 * time.Minute) +func (a *Aggregator) processPostgresEvent(ctx context.Context, d *l7_req.L7Event) { + // parse sql command from payload + // path = sql command + // method = sql message type - f := func() { - a.liveProcessesMu.RLock() - defer a.liveProcessesMu.RUnlock() - for pid := range a.liveProcesses { - sockMap := a.clusterInfo.SocketMaps[pid] - if sockMap.mu == nil { - continue - } - - sockMap.mu.Lock() - for _, skLine := range sockMap.M { - skLine.getConnectionInfo() - } - sockMap.mu.Unlock() - } + query, err := a.parseSqlCommand(d) + if err != nil { + log.Logger.Error().AnErr("err", err) + return } - for { - select { - case <-ticker.C: - f() - case <-ctx.Done(): + skInfo, err := a.findRelatedSocket(ctx, d) + if skInfo == nil { + // requeue event if this is its first time + if !d.PutBack { + d.PutBack = true + a.ebpfChan <- d return } + + log.Logger.Debug().Uint32("pid", d.Pid). + Err(err). + Uint64("fd", d.Fd).Uint64("writeTime", d.WriteTimeNs). 
+ Str("protocol", d.Protocol).Any("payload", string(d.Payload[:d.PayloadSize])).Msg("discarding postgres event, socket not found") + + return } -} -func (a *Aggregator) fetchSocketOnNotFound(ctx context.Context, d *l7_req.L7Event) bool { - a.liveProcessesMu.Lock() + reqDto := &datastore.Request{ + StartTime: int64(convertKernelTimeToUserspaceTime(d.WriteTimeNs) / 1e6), + Latency: d.Duration, + FromIP: skInfo.Saddr, + ToIP: skInfo.Daddr, + Protocol: d.Protocol, + Tls: d.Tls, + Completed: true, + StatusCode: d.Status, + FailReason: "", + Method: d.Method, + Path: query, + Tid: d.Tid, + Seq: d.Seq, + } - a.liveProcesses[d.Pid] = struct{}{} - sockMap := a.clusterInfo.SocketMaps[d.Pid] - // pid does not exists - // acquire sockMap lock + err = a.setFromTo(skInfo, d, reqDto, "") + if err != nil { + return + } - // in case of reference to mu is nil, pid exec event did not come yet - // create a new mutex for the pid - // to avoid race around the mutex, we need to lock the liveProcessesMu - if sockMap.mu == nil { - log.Logger.Debug().Uint32("pid", d.Pid).Uint64("fd", d.Fd).Msg("fetchSocketOnNotFound: pid not found") + err = a.ds.PersistRequest(reqDto) + if err != nil { + log.Logger.Error().Err(err).Msg("error persisting request") + } +} - a.muIndex.Add(1) - a.muArray[(a.muIndex.Load())%uint64(len(a.muArray))] = &sync.RWMutex{} - a.clusterInfo.SocketMaps[d.Pid].mu = a.muArray[(a.muIndex.Load())%uint64(len(a.muArray))] +func (a *Aggregator) processL7(ctx context.Context, d *l7_req.L7Event) { + switch d.Protocol { + case l7_req.L7_PROTOCOL_HTTP2: + a.processHttp2Event(d) + case l7_req.L7_PROTOCOL_POSTGRES: + a.processPostgresEvent(ctx, d) + case l7_req.L7_PROTOCOL_HTTP: + a.processHttpEvent(ctx, d) + case l7_req.L7_PROTOCOL_REDIS: + a.processRedisEvent(ctx, d) + case l7_req.L7_PROTOCOL_AMQP: + a.processAmqpEvent(ctx, d) + case l7_req.L7_PROTOCOL_KAFKA: + a.processKafkaEvent(ctx, d) } - a.liveProcessesMu.Unlock() +} - // creates sockMap.M - skInfo := a.findRelatedSocket(ctx, d) - if skInfo == nil { - // go try reading from kernel files - err := sockMap.M[d.Fd].getConnectionInfo() +// reverse dns lookup +func getHostnameFromIP(ipAddr string) (string, error) { + // return from cache, if exists + // consumes too much memory otherwise + if host, ok := reverseDnsCache.Get(ipAddr); ok { + return host.(string), nil + } else { + addrs, err := net.LookupAddr(ipAddr) if err != nil { - log.Logger.Debug().Uint32("pid", d.Pid).Uint64("fd", d.Fd).Err(err).Msg("fetchSocketOnNotFound: failed to get connection info") - return false - } else { - log.Logger.Debug().Uint32("pid", d.Pid).Uint64("fd", d.Fd).Msg("fetchSocketOnNotFound: connection info found") - return true + return "", err } + + // The reverse DNS lookup can return multiple names for the same IP. + // In this example, we return the first name found. 
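+		// Note: assuming the `cache` import is patrickmn/go-cache, the 0 duration passed to Set
+		// means "use the cache's default expiration", i.e. the defaultExpiration given to
+		// cache.New in init(), so cached PTR names eventually expire instead of growing unbounded.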
+ if len(addrs) > 0 { + reverseDnsCache.Set(ipAddr, addrs[0], 0) + return addrs[0], nil + } + return "", fmt.Errorf("no hostname found for IP address: %s", ipAddr) } - return true } -func (a *Aggregator) findRelatedSocket(ctx context.Context, d *l7_req.L7Event) *SockInfo { +func (a *Aggregator) findRelatedSocket(ctx context.Context, d *l7_req.L7Event) (*SockInfo, error) { sockMap := a.clusterInfo.SocketMaps[d.Pid] // acquire sockMap lock - - if sockMap.mu == nil { - return nil - } - - sockMap.mu.Lock() - - if sockMap.M == nil { - sockMap.M = make(map[uint64]*SocketLine) + if sockMap == nil { + go a.clusterInfo.SignalSocketMapCreation(d.Pid) + return nil, fmt.Errorf("socket map not initialized for pid=%d, fd=%d", d.Pid, d.Fd) } + sockMap.mu.RLock() skLine, ok := sockMap.M[d.Fd] + sockMap.mu.RUnlock() if !ok { - log.Logger.Debug().Uint32("pid", d.Pid).Uint64("fd", d.Fd).Msg("create skLine...") // start new socket line, find already established connections - skLine = NewSocketLine(d.Pid, d.Fd) - sockMap.M[d.Fd] = skLine + go sockMap.SignalSocketLine(ctx, d.Fd) + return nil, fmt.Errorf("socket line not initialized for fd=%d, pid=%d", d.Fd, d.Pid) } - // release sockMap lock - sockMap.mu.Unlock() - - skInfo := a.fetchSkInfo(ctx, skLine, d) - if skInfo == nil { - return nil + skInfo, err := skLine.GetValue(d.WriteTimeNs) + if err != nil { + return nil, fmt.Errorf("could not find remote peer from given timestamp, err=%v, fd=%d, pid=%d", err, d.Fd, d.Pid) } - - return skInfo + return skInfo, nil } func (a *Aggregator) parseSqlCommand(d *l7_req.L7Event) (string, error) { @@ -1354,6 +1375,10 @@ func (a *Aggregator) parseSqlCommand(d *l7_req.L7Event) (string, error) { if d.Method == l7_req.SIMPLE_QUERY { // Q, 4 bytes of length, sql command + if len(r) < 5 { + return "", fmt.Errorf("too short for a sql query") + } + // skip Q, (simple query) r = r[1:] @@ -1490,6 +1515,7 @@ func (a *Aggregator) sendOpenConnection(sl *SocketLine) { } } +// TODO: connection send is made here, sendOpenConnection must be called, refactor this func and its calling place func (a *Aggregator) clearSocketLines(ctx context.Context) { ticker := time.NewTicker(120 * time.Second) skLineCh := make(chan *SocketLine, 1000) @@ -1510,7 +1536,7 @@ func (a *Aggregator) clearSocketLines(ctx context.Context) { for range ticker.C { for _, sockMap := range a.clusterInfo.SocketMaps { - if sockMap.mu == nil { + if sockMap == nil { continue } sockMap.mu.Lock() diff --git a/aggregator/kafka/crc32_field.go b/aggregator/kafka/crc32_field.go new file mode 100644 index 0000000..163e39a --- /dev/null +++ b/aggregator/kafka/crc32_field.go @@ -0,0 +1,78 @@ +package kafka + +import ( + "encoding/binary" + "fmt" + "hash/crc32" + "sync" +) + +type crcPolynomial int8 + +const ( + crcIEEE crcPolynomial = iota + crcCastagnoli +) + +var crc32FieldPool = sync.Pool{} + +func acquireCrc32Field(polynomial crcPolynomial) *crc32Field { + val := crc32FieldPool.Get() + if val != nil { + c := val.(*crc32Field) + c.polynomial = polynomial + return c + } + return newCRC32Field(polynomial) +} + +func releaseCrc32Field(c *crc32Field) { + crc32FieldPool.Put(c) +} + +var castagnoliTable = crc32.MakeTable(crc32.Castagnoli) + +// crc32Field implements the pushEncoder and pushDecoder interfaces for calculating CRC32s. 
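+// A decoder push()es this field at the position of the 4-byte CRC slot: saveOffset records where
+// that slot starts, and when the field is pop()ed, check() recomputes the checksum over
+// buf[startOffset+4:curOffset] and compares it with the big-endian uint32 stored at startOffset.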
+type crc32Field struct { + startOffset int + polynomial crcPolynomial +} + +func (c *crc32Field) saveOffset(in int) { + c.startOffset = in +} + +func (c *crc32Field) reserveLength() int { + return 4 +} + +func newCRC32Field(polynomial crcPolynomial) *crc32Field { + return &crc32Field{polynomial: polynomial} +} + +func (c *crc32Field) check(curOffset int, buf []byte) error { + crc, err := c.crc(curOffset, buf) + if err != nil { + return err + } + + expected := binary.BigEndian.Uint32(buf[c.startOffset:]) + if crc != expected { + return PacketDecodingError{fmt.Sprintf("CRC didn't match expected %#x got %#x", expected, crc)} + } + + return nil +} + +func (c *crc32Field) crc(curOffset int, buf []byte) (uint32, error) { + var tab *crc32.Table + switch c.polynomial { + case crcIEEE: + tab = crc32.IEEETable + case crcCastagnoli: + tab = castagnoliTable + default: + return 0, PacketDecodingError{"invalid CRC type"} + } + return crc32.Checksum(buf[c.startOffset+4:curOffset], tab), nil +} diff --git a/aggregator/kafka/decoder.go b/aggregator/kafka/decoder.go new file mode 100644 index 0000000..be896cd --- /dev/null +++ b/aggregator/kafka/decoder.go @@ -0,0 +1,119 @@ +package kafka + +type versionedDecoder interface { + decode(pd packetDecoder, version int16) error +} + +type packetDecoder interface { + // Primitives + getInt8() (int8, error) + getInt16() (int16, error) + getInt32() (int32, error) + getInt64() (int64, error) + getVarint() (int64, error) + getUVarint() (uint64, error) + getFloat64() (float64, error) + getArrayLength() (int, error) + getCompactArrayLength() (int, error) + getBool() (bool, error) + getEmptyTaggedFieldArray() (int, error) + + // Collections + getBytes() ([]byte, error) + getVarintBytes() ([]byte, error) + getCompactBytes() ([]byte, error) + getRawBytes(length int) ([]byte, error) + getString() (string, error) + getNullableString() (*string, error) + getCompactString() (string, error) + getCompactNullableString() (*string, error) + getCompactInt32Array() ([]int32, error) + getInt32Array() ([]int32, error) + getInt64Array() ([]int64, error) + getStringArray() ([]string, error) + + // Subsets + remaining() int + getSubset(length int) (packetDecoder, error) + peek(offset, length int) (packetDecoder, error) // similar to getSubset, but it doesn't advance the offset + peekInt8(offset int) (int8, error) // similar to peek, but just one byte + + // Stacks, see PushDecoder + push(in pushDecoder) error + pop() error +} + +// PushDecoder is the interface for decoding fields like CRCs and lengths where the validity +// of the field depends on what is after it in the packet. Start them with PacketDecoder.Push() where +// the actual value is located in the packet, then PacketDecoder.Pop() them when all the bytes they +// depend upon have been decoded. +type pushDecoder interface { + // Saves the offset into the input buffer as the location to actually read the calculated value when able. + saveOffset(in int) + + // Returns the length of data to reserve for the input of this encoder (e.g. 4 bytes for a CRC32). + reserveLength() int + + // Indicates that all required data is now available to calculate and check the field. + // SaveOffset is guaranteed to have been called first. The implementation should read ReserveLength() bytes + // of data from the saved offset, and verify it based on the data between the saved offset and curOffset. 
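+	// (crc32Field in crc32_field.go is the pushDecoder implementation added in this change.)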
+ check(curOffset int, buf []byte) error +} + +// dynamicPushDecoder extends the interface of pushDecoder for uses cases where the length of the +// fields itself is unknown until its value was decoded (for instance varint encoded length +// fields). +// During push, dynamicPushDecoder.decode() method will be called instead of reserveLength() +type dynamicPushDecoder interface { + pushDecoder + decoder +} + +type decoder interface { + decode(pd packetDecoder) error +} + +// decode takes bytes and a decoder and fills the fields of the decoder from the bytes, +// interpreted using Kafka's encoding rules. +func decode(buf []byte, in decoder) error { + if buf == nil { + return nil + } + + helper := realDecoder{ + raw: buf, + } + err := in.decode(&helper) + if err != nil { + return err + } + + if helper.off != len(buf) { + return PacketDecodingError{"invalid length"} + } + + return nil +} + +func VersionedDecode(buf []byte, in versionedDecoder, version int16) (int, error) { + if buf == nil { + return 0, nil + } + + helper := realDecoder{ + raw: buf, + } + + err := in.decode(&helper, version) + if err != nil { + return helper.off, err + } + + // if helper.off != len(buf) { + // return helper.off, PacketDecodingError{ + // Info: fmt.Sprintf("invalid length (off=%d, len=%d)", helper.off, len(buf)), + // } + // } + + return helper.off, nil +} diff --git a/aggregator/kafka/decompress.go b/aggregator/kafka/decompress.go new file mode 100644 index 0000000..07d9dce --- /dev/null +++ b/aggregator/kafka/decompress.go @@ -0,0 +1,98 @@ +package kafka + +import ( + "bytes" + "fmt" + "sync" + + snappy "github.com/eapache/go-xerial-snappy" + "github.com/klauspost/compress/gzip" + "github.com/pierrec/lz4/v4" +) + +var ( + lz4ReaderPool = sync.Pool{ + New: func() interface{} { + return lz4.NewReader(nil) + }, + } + + gzipReaderPool sync.Pool + + bufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, + } + + bytesPool = sync.Pool{ + New: func() interface{} { + res := make([]byte, 0, 4096) + return &res + }, + } +) + +func decompress(cc CompressionCodec, data []byte) ([]byte, error) { + switch cc { + case CompressionNone: + return data, nil + case CompressionGZIP: + var err error + reader, ok := gzipReaderPool.Get().(*gzip.Reader) + if !ok { + reader, err = gzip.NewReader(bytes.NewReader(data)) + } else { + err = reader.Reset(bytes.NewReader(data)) + } + + if err != nil { + return nil, err + } + + buffer := bufferPool.Get().(*bytes.Buffer) + _, err = buffer.ReadFrom(reader) + // copy the buffer to a new slice with the correct length + // reuse gzipReader and buffer + gzipReaderPool.Put(reader) + res := make([]byte, buffer.Len()) + copy(res, buffer.Bytes()) + buffer.Reset() + bufferPool.Put(buffer) + + return res, err + case CompressionSnappy: + return snappy.Decode(data) + case CompressionLZ4: + reader, ok := lz4ReaderPool.Get().(*lz4.Reader) + if !ok { + reader = lz4.NewReader(bytes.NewReader(data)) + } else { + reader.Reset(bytes.NewReader(data)) + } + buffer := bufferPool.Get().(*bytes.Buffer) + _, err := buffer.ReadFrom(reader) + // copy the buffer to a new slice with the correct length + // reuse lz4Reader and buffer + lz4ReaderPool.Put(reader) + res := make([]byte, buffer.Len()) + copy(res, buffer.Bytes()) + buffer.Reset() + bufferPool.Put(buffer) + + return res, err + case CompressionZSTD: + buffer := *bytesPool.Get().(*[]byte) + var err error + buffer, err = zstdDecompress(ZstdDecoderParams{}, buffer, data) + // copy the buffer to a new slice with the 
correct length and reuse buffer + res := make([]byte, len(buffer)) + copy(res, buffer) + buffer = buffer[:0] + bytesPool.Put(&buffer) + + return res, err + default: + return nil, PacketDecodingError{fmt.Sprintf("invalid compression specified (%d)", cc)} + } +} diff --git a/aggregator/kafka/errors.go b/aggregator/kafka/errors.go new file mode 100644 index 0000000..24a852d --- /dev/null +++ b/aggregator/kafka/errors.go @@ -0,0 +1,403 @@ +package kafka + +import ( + "errors" + "fmt" +) + +// ErrOutOfBrokers is the error returned when the client has run out of brokers to talk to because all of them errored +// or otherwise failed to respond. +var ErrOutOfBrokers = errors.New("kafka: client has run out of available brokers to talk to") + +// ErrBrokerNotFound is the error returned when there's no broker found for the requested ID. +var ErrBrokerNotFound = errors.New("kafka: broker for ID is not found") + +// ErrClosedClient is the error returned when a method is called on a client that has been closed. +var ErrClosedClient = errors.New("kafka: tried to use a client that was closed") + +// ErrIncompleteResponse is the error returned when the server returns a syntactically valid response, but it does +// not contain the expected information. +var ErrIncompleteResponse = errors.New("kafka: response did not contain all the expected topic/partition blocks") + +// ErrInvalidPartition is the error returned when a partitioner returns an invalid partition index +// (meaning one outside of the range [0...numPartitions-1]). +var ErrInvalidPartition = errors.New("kafka: partitioner returned an invalid partition index") + +// ErrAlreadyConnected is the error returned when calling Open() on a Broker that is already connected or connecting. +var ErrAlreadyConnected = errors.New("kafka: broker connection already initiated") + +// ErrNotConnected is the error returned when trying to send or call Close() on a Broker that is not connected. +var ErrNotConnected = errors.New("kafka: broker not connected") + +// ErrInsufficientData is returned when decoding and the packet is truncated. This can be expected +// when requesting messages, since as an optimization the server is allowed to return a partial message at the end +// of the message set. +var ErrInsufficientData = errors.New("kafka: insufficient data to decode packet, more bytes expected") + +// ErrShuttingDown is returned when a producer receives a message during shutdown. +var ErrShuttingDown = errors.New("kafka: message received by producer in process of shutting down") + +// ErrMessageTooLarge is returned when the next message to consume is larger than the configured Consumer.Fetch.Max +var ErrMessageTooLarge = errors.New("kafka: message is larger than Consumer.Fetch.Max") + +// ErrConsumerOffsetNotAdvanced is returned when a partition consumer didn't advance its offset after parsing +// a RecordBatch. +var ErrConsumerOffsetNotAdvanced = errors.New("kafka: consumer offset was not advanced after a RecordBatch") + +// ErrControllerNotAvailable is returned when server didn't give correct controller id. May be kafka server's version +// is lower than 0.10.0.0. +var ErrControllerNotAvailable = errors.New("kafka: controller is not available") + +// ErrNoTopicsToUpdateMetadata is returned when Meta.Full is set to false but no specific topics were found to update +// the metadata. 
+var ErrNoTopicsToUpdateMetadata = errors.New("kafka: no specific topics to update metadata") + +// ErrUnknownScramMechanism is returned when user tries to AlterUserScramCredentials with unknown SCRAM mechanism +var ErrUnknownScramMechanism = errors.New("kafka: unknown SCRAM mechanism provided") + +// ErrReassignPartitions is returned when altering partition assignments for a topic fails +var ErrReassignPartitions = errors.New("failed to reassign partitions for topic") + +// ErrDeleteRecords is the type of error returned when fail to delete the required records +var ErrDeleteRecords = errors.New("kafka server: failed to delete records") + +// ErrCreateACLs is the type of error returned when ACL creation failed +var ErrCreateACLs = errors.New("kafka server: failed to create one or more ACL rules") + +// ErrAddPartitionsToTxn is returned when AddPartitionsToTxn failed multiple times +var ErrAddPartitionsToTxn = errors.New("transaction manager: failed to send partitions to transaction") + +// ErrTxnOffsetCommit is returned when TxnOffsetCommit failed multiple times +var ErrTxnOffsetCommit = errors.New("transaction manager: failed to send offsets to transaction") + +// ErrTransactionNotReady when transaction status is invalid for the current action. +var ErrTransactionNotReady = errors.New("transaction manager: transaction is not ready") + +// ErrNonTransactedProducer when calling BeginTxn, CommitTxn or AbortTxn on a non transactional producer. +var ErrNonTransactedProducer = errors.New("transaction manager: you need to add TransactionalID to producer") + +// ErrTransitionNotAllowed when txnmgr state transition is not valid. +var ErrTransitionNotAllowed = errors.New("transaction manager: invalid transition attempted") + +// ErrCannotTransitionNilError when transition is attempted with an nil error. +var ErrCannotTransitionNilError = errors.New("transaction manager: cannot transition with a nil error") + +// ErrTxnUnableToParseResponse when response is nil +var ErrTxnUnableToParseResponse = errors.New("transaction manager: unable to parse response") + +// PacketEncodingError is returned from a failure while encoding a Kafka packet. This can happen, for example, +// if you try to encode a string over 2^15 characters in length, since Kafka's encoding rules do not permit that. +type PacketEncodingError struct { + Info string +} + +func (err PacketEncodingError) Error() string { + return fmt.Sprintf("kafka: error encoding packet: %s", err.Info) +} + +// PacketDecodingError is returned when there was an error (other than truncated data) decoding the Kafka broker's response. +// This can be a bad CRC or length field, or any other invalid value. +type PacketDecodingError struct { + Info string +} + +func (err PacketDecodingError) Error() string { + return fmt.Sprintf("kafka: error decoding packet: %s", err.Info) +} + +// ConfigurationError is the type of error returned from a constructor (e.g. NewClient, or NewConsumer) +// when the specified configuration is invalid. +type ConfigurationError string + +func (err ConfigurationError) Error() string { + return "kafka: invalid configuration (" + string(err) + ")" +} + +// KError is the type of error that can be returned directly by the Kafka broker. +// See https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ErrorCodes +type KError int16 + +// Numeric error codes returned by the Kafka server. 
+const ( + ErrUnknown KError = -1 // Errors.UNKNOWN_SERVER_ERROR + ErrNoError KError = 0 // Errors.NONE + ErrOffsetOutOfRange KError = 1 // Errors.OFFSET_OUT_OF_RANGE + ErrInvalidMessage KError = 2 // Errors.CORRUPT_MESSAGE + ErrUnknownTopicOrPartition KError = 3 // Errors.UNKNOWN_TOPIC_OR_PARTITION + ErrInvalidMessageSize KError = 4 // Errors.INVALID_FETCH_SIZE + ErrLeaderNotAvailable KError = 5 // Errors.LEADER_NOT_AVAILABLE + ErrNotLeaderForPartition KError = 6 // Errors.NOT_LEADER_OR_FOLLOWER + ErrRequestTimedOut KError = 7 // Errors.REQUEST_TIMED_OUT + ErrBrokerNotAvailable KError = 8 // Errors.BROKER_NOT_AVAILABLE + ErrReplicaNotAvailable KError = 9 // Errors.REPLICA_NOT_AVAILABLE + ErrMessageSizeTooLarge KError = 10 // Errors.MESSAGE_TOO_LARGE + ErrStaleControllerEpochCode KError = 11 // Errors.STALE_CONTROLLER_EPOCH + ErrOffsetMetadataTooLarge KError = 12 // Errors.OFFSET_METADATA_TOO_LARGE + ErrNetworkException KError = 13 // Errors.NETWORK_EXCEPTION + ErrOffsetsLoadInProgress KError = 14 // Errors.COORDINATOR_LOAD_IN_PROGRESS + ErrConsumerCoordinatorNotAvailable KError = 15 // Errors.COORDINATOR_NOT_AVAILABLE + ErrNotCoordinatorForConsumer KError = 16 // Errors.NOT_COORDINATOR + ErrInvalidTopic KError = 17 // Errors.INVALID_TOPIC_EXCEPTION + ErrMessageSetSizeTooLarge KError = 18 // Errors.RECORD_LIST_TOO_LARGE + ErrNotEnoughReplicas KError = 19 // Errors.NOT_ENOUGH_REPLICAS + ErrNotEnoughReplicasAfterAppend KError = 20 // Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND + ErrInvalidRequiredAcks KError = 21 // Errors.INVALID_REQUIRED_ACKS + ErrIllegalGeneration KError = 22 // Errors.ILLEGAL_GENERATION + ErrInconsistentGroupProtocol KError = 23 // Errors.INCONSISTENT_GROUP_PROTOCOL + ErrInvalidGroupId KError = 24 // Errors.INVALID_GROUP_ID + ErrUnknownMemberId KError = 25 // Errors.UNKNOWN_MEMBER_ID + ErrInvalidSessionTimeout KError = 26 // Errors.INVALID_SESSION_TIMEOUT + ErrRebalanceInProgress KError = 27 // Errors.REBALANCE_IN_PROGRESS + ErrInvalidCommitOffsetSize KError = 28 // Errors.INVALID_COMMIT_OFFSET_SIZE + ErrTopicAuthorizationFailed KError = 29 // Errors.TOPIC_AUTHORIZATION_FAILED + ErrGroupAuthorizationFailed KError = 30 // Errors.GROUP_AUTHORIZATION_FAILED + ErrClusterAuthorizationFailed KError = 31 // Errors.CLUSTER_AUTHORIZATION_FAILED + ErrInvalidTimestamp KError = 32 // Errors.INVALID_TIMESTAMP + ErrUnsupportedSASLMechanism KError = 33 // Errors.UNSUPPORTED_SASL_MECHANISM + ErrIllegalSASLState KError = 34 // Errors.ILLEGAL_SASL_STATE + ErrUnsupportedVersion KError = 35 // Errors.UNSUPPORTED_VERSION + ErrTopicAlreadyExists KError = 36 // Errors.TOPIC_ALREADY_EXISTS + ErrInvalidPartitions KError = 37 // Errors.INVALID_PARTITIONS + ErrInvalidReplicationFactor KError = 38 // Errors.INVALID_REPLICATION_FACTOR + ErrInvalidReplicaAssignment KError = 39 // Errors.INVALID_REPLICA_ASSIGNMENT + ErrInvalidConfig KError = 40 // Errors.INVALID_CONFIG + ErrNotController KError = 41 // Errors.NOT_CONTROLLER + ErrInvalidRequest KError = 42 // Errors.INVALID_REQUEST + ErrUnsupportedForMessageFormat KError = 43 // Errors.UNSUPPORTED_FOR_MESSAGE_FORMAT + ErrPolicyViolation KError = 44 // Errors.POLICY_VIOLATION + ErrOutOfOrderSequenceNumber KError = 45 // Errors.OUT_OF_ORDER_SEQUENCE_NUMBER + ErrDuplicateSequenceNumber KError = 46 // Errors.DUPLICATE_SEQUENCE_NUMBER + ErrInvalidProducerEpoch KError = 47 // Errors.INVALID_PRODUCER_EPOCH + ErrInvalidTxnState KError = 48 // Errors.INVALID_TXN_STATE + ErrInvalidProducerIDMapping KError = 49 // Errors.INVALID_PRODUCER_ID_MAPPING + 
ErrInvalidTransactionTimeout KError = 50 // Errors.INVALID_TRANSACTION_TIMEOUT + ErrConcurrentTransactions KError = 51 // Errors.CONCURRENT_TRANSACTIONS + ErrTransactionCoordinatorFenced KError = 52 // Errors.TRANSACTION_COORDINATOR_FENCED + ErrTransactionalIDAuthorizationFailed KError = 53 // Errors.TRANSACTIONAL_ID_AUTHORIZATION_FAILED + ErrSecurityDisabled KError = 54 // Errors.SECURITY_DISABLED + ErrOperationNotAttempted KError = 55 // Errors.OPERATION_NOT_ATTEMPTED + ErrKafkaStorageError KError = 56 // Errors.KAFKA_STORAGE_ERROR + ErrLogDirNotFound KError = 57 // Errors.LOG_DIR_NOT_FOUND + ErrSASLAuthenticationFailed KError = 58 // Errors.SASL_AUTHENTICATION_FAILED + ErrUnknownProducerID KError = 59 // Errors.UNKNOWN_PRODUCER_ID + ErrReassignmentInProgress KError = 60 // Errors.REASSIGNMENT_IN_PROGRESS + ErrDelegationTokenAuthDisabled KError = 61 // Errors.DELEGATION_TOKEN_AUTH_DISABLED + ErrDelegationTokenNotFound KError = 62 // Errors.DELEGATION_TOKEN_NOT_FOUND + ErrDelegationTokenOwnerMismatch KError = 63 // Errors.DELEGATION_TOKEN_OWNER_MISMATCH + ErrDelegationTokenRequestNotAllowed KError = 64 // Errors.DELEGATION_TOKEN_REQUEST_NOT_ALLOWED + ErrDelegationTokenAuthorizationFailed KError = 65 // Errors.DELEGATION_TOKEN_AUTHORIZATION_FAILED + ErrDelegationTokenExpired KError = 66 // Errors.DELEGATION_TOKEN_EXPIRED + ErrInvalidPrincipalType KError = 67 // Errors.INVALID_PRINCIPAL_TYPE + ErrNonEmptyGroup KError = 68 // Errors.NON_EMPTY_GROUP + ErrGroupIDNotFound KError = 69 // Errors.GROUP_ID_NOT_FOUND + ErrFetchSessionIDNotFound KError = 70 // Errors.FETCH_SESSION_ID_NOT_FOUND + ErrInvalidFetchSessionEpoch KError = 71 // Errors.INVALID_FETCH_SESSION_EPOCH + ErrListenerNotFound KError = 72 // Errors.LISTENER_NOT_FOUND + ErrTopicDeletionDisabled KError = 73 // Errors.TOPIC_DELETION_DISABLED + ErrFencedLeaderEpoch KError = 74 // Errors.FENCED_LEADER_EPOCH + ErrUnknownLeaderEpoch KError = 75 // Errors.UNKNOWN_LEADER_EPOCH + ErrUnsupportedCompressionType KError = 76 // Errors.UNSUPPORTED_COMPRESSION_TYPE + ErrStaleBrokerEpoch KError = 77 // Errors.STALE_BROKER_EPOCH + ErrOffsetNotAvailable KError = 78 // Errors.OFFSET_NOT_AVAILABLE + ErrMemberIdRequired KError = 79 // Errors.MEMBER_ID_REQUIRED + ErrPreferredLeaderNotAvailable KError = 80 // Errors.PREFERRED_LEADER_NOT_AVAILABLE + ErrGroupMaxSizeReached KError = 81 // Errors.GROUP_MAX_SIZE_REACHED + ErrFencedInstancedId KError = 82 // Errors.FENCED_INSTANCE_ID + ErrEligibleLeadersNotAvailable KError = 83 // Errors.ELIGIBLE_LEADERS_NOT_AVAILABLE + ErrElectionNotNeeded KError = 84 // Errors.ELECTION_NOT_NEEDED + ErrNoReassignmentInProgress KError = 85 // Errors.NO_REASSIGNMENT_IN_PROGRESS + ErrGroupSubscribedToTopic KError = 86 // Errors.GROUP_SUBSCRIBED_TO_TOPIC + ErrInvalidRecord KError = 87 // Errors.INVALID_RECORD + ErrUnstableOffsetCommit KError = 88 // Errors.UNSTABLE_OFFSET_COMMIT + ErrThrottlingQuotaExceeded KError = 89 // Errors.THROTTLING_QUOTA_EXCEEDED + ErrProducerFenced KError = 90 // Errors.PRODUCER_FENCED +) + +func (err KError) Error() string { + // Error messages stolen/adapted from + // https://kafka.apache.org/protocol#protocol_error_codes + switch err { + case ErrNoError: + return "kafka server: Not an error, why are you printing me?" + case ErrUnknown: + return "kafka server: Unexpected (unknown?) 
server error" + case ErrOffsetOutOfRange: + return "kafka server: The requested offset is outside the range of offsets maintained by the server for the given topic/partition" + case ErrInvalidMessage: + return "kafka server: Message contents does not match its CRC" + case ErrUnknownTopicOrPartition: + return "kafka server: Request was for a topic or partition that does not exist on this broker" + case ErrInvalidMessageSize: + return "kafka server: The message has a negative size" + case ErrLeaderNotAvailable: + return "kafka server: In the middle of a leadership election, there is currently no leader for this partition and hence it is unavailable for writes" + case ErrNotLeaderForPartition: + return "kafka server: Tried to send a message to a replica that is not the leader for some partition. Your metadata is out of date" + case ErrRequestTimedOut: + return "kafka server: Request exceeded the user-specified time limit in the request" + case ErrBrokerNotAvailable: + return "kafka server: Broker not available. Not a client facing error, we should never receive this!!!" + case ErrReplicaNotAvailable: + return "kafka server: Replica information not available, one or more brokers are down" + case ErrMessageSizeTooLarge: + return "kafka server: Message was too large, server rejected it to avoid allocation error" + case ErrStaleControllerEpochCode: + return "kafka server: StaleControllerEpochCode (internal error code for broker-to-broker communication)" + case ErrOffsetMetadataTooLarge: + return "kafka server: Specified a string larger than the configured maximum for offset metadata" + case ErrNetworkException: + return "kafka server: The server disconnected before a response was received" + case ErrOffsetsLoadInProgress: + return "kafka server: The coordinator is still loading offsets and cannot currently process requests" + case ErrConsumerCoordinatorNotAvailable: + return "kafka server: Offset's topic has not yet been created" + case ErrNotCoordinatorForConsumer: + return "kafka server: Request was for a consumer group that is not coordinated by this broker" + case ErrInvalidTopic: + return "kafka server: The request attempted to perform an operation on an invalid topic" + case ErrMessageSetSizeTooLarge: + return "kafka server: The request included message batch larger than the configured segment size on the server" + case ErrNotEnoughReplicas: + return "kafka server: Messages are rejected since there are fewer in-sync replicas than required" + case ErrNotEnoughReplicasAfterAppend: + return "kafka server: Messages are written to the log, but to fewer in-sync replicas than required" + case ErrInvalidRequiredAcks: + return "kafka server: The number of required acks is invalid (should be either -1, 0, or 1)" + case ErrIllegalGeneration: + return "kafka server: The provided generation id is not the current generation" + case ErrInconsistentGroupProtocol: + return "kafka server: The provider group protocol type is incompatible with the other members" + case ErrInvalidGroupId: + return "kafka server: The provided group id was empty" + case ErrUnknownMemberId: + return "kafka server: The provided member is not known in the current generation" + case ErrInvalidSessionTimeout: + return "kafka server: The provided session timeout is outside the allowed range" + case ErrRebalanceInProgress: + return "kafka server: A rebalance for the group is in progress. 
Please re-join the group" + case ErrInvalidCommitOffsetSize: + return "kafka server: The provided commit metadata was too large" + case ErrTopicAuthorizationFailed: + return "kafka server: The client is not authorized to access this topic" + case ErrGroupAuthorizationFailed: + return "kafka server: The client is not authorized to access this group" + case ErrClusterAuthorizationFailed: + return "kafka server: The client is not authorized to send this request type" + case ErrInvalidTimestamp: + return "kafka server: The timestamp of the message is out of acceptable range" + case ErrUnsupportedSASLMechanism: + return "kafka server: The broker does not support the requested SASL mechanism" + case ErrIllegalSASLState: + return "kafka server: Request is not valid given the current SASL state" + case ErrUnsupportedVersion: + return "kafka server: The version of API is not supported" + case ErrTopicAlreadyExists: + return "kafka server: Topic with this name already exists" + case ErrInvalidPartitions: + return "kafka server: Number of partitions is invalid" + case ErrInvalidReplicationFactor: + return "kafka server: Replication-factor is invalid" + case ErrInvalidReplicaAssignment: + return "kafka server: Replica assignment is invalid" + case ErrInvalidConfig: + return "kafka server: Configuration is invalid" + case ErrNotController: + return "kafka server: This is not the correct controller for this cluster" + case ErrInvalidRequest: + return "kafka server: This most likely occurs because of a request being malformed by the client library or the message was sent to an incompatible broker. See the broker logs for more details" + case ErrUnsupportedForMessageFormat: + return "kafka server: The requested operation is not supported by the message format version" + case ErrPolicyViolation: + return "kafka server: Request parameters do not satisfy the configured policy" + case ErrOutOfOrderSequenceNumber: + return "kafka server: The broker received an out of order sequence number" + case ErrDuplicateSequenceNumber: + return "kafka server: The broker received a duplicate sequence number" + case ErrInvalidProducerEpoch: + return "kafka server: Producer attempted an operation with an old epoch" + case ErrInvalidTxnState: + return "kafka server: The producer attempted a transactional operation in an invalid state" + case ErrInvalidProducerIDMapping: + return "kafka server: The producer attempted to use a producer id which is not currently assigned to its transactional id" + case ErrInvalidTransactionTimeout: + return "kafka server: The transaction timeout is larger than the maximum value allowed by the broker (as configured by max.transaction.timeout.ms)" + case ErrConcurrentTransactions: + return "kafka server: The producer attempted to update a transaction while another concurrent operation on the same transaction was ongoing" + case ErrTransactionCoordinatorFenced: + return "kafka server: The transaction coordinator sending a WriteTxnMarker is no longer the current coordinator for a given producer" + case ErrTransactionalIDAuthorizationFailed: + return "kafka server: Transactional ID authorization failed" + case ErrSecurityDisabled: + return "kafka server: Security features are disabled" + case ErrOperationNotAttempted: + return "kafka server: The broker did not attempt to execute this operation" + case ErrKafkaStorageError: + return "kafka server: Disk error when trying to access log file on the disk" + case ErrLogDirNotFound: + return "kafka server: The specified log directory is not found in the 
broker config" + case ErrSASLAuthenticationFailed: + return "kafka server: SASL Authentication failed" + case ErrUnknownProducerID: + return "kafka server: The broker could not locate the producer metadata associated with the Producer ID" + case ErrReassignmentInProgress: + return "kafka server: A partition reassignment is in progress" + case ErrDelegationTokenAuthDisabled: + return "kafka server: Delegation Token feature is not enabled" + case ErrDelegationTokenNotFound: + return "kafka server: Delegation Token is not found on server" + case ErrDelegationTokenOwnerMismatch: + return "kafka server: Specified Principal is not valid Owner/Renewer" + case ErrDelegationTokenRequestNotAllowed: + return "kafka server: Delegation Token requests are not allowed on PLAINTEXT/1-way SSL channels and on delegation token authenticated channels" + case ErrDelegationTokenAuthorizationFailed: + return "kafka server: Delegation Token authorization failed" + case ErrDelegationTokenExpired: + return "kafka server: Delegation Token is expired" + case ErrInvalidPrincipalType: + return "kafka server: Supplied principalType is not supported" + case ErrNonEmptyGroup: + return "kafka server: The group is not empty" + case ErrGroupIDNotFound: + return "kafka server: The group id does not exist" + case ErrFetchSessionIDNotFound: + return "kafka server: The fetch session ID was not found" + case ErrInvalidFetchSessionEpoch: + return "kafka server: The fetch session epoch is invalid" + case ErrListenerNotFound: + return "kafka server: There is no listener on the leader broker that matches the listener on which metadata request was processed" + case ErrTopicDeletionDisabled: + return "kafka server: Topic deletion is disabled" + case ErrFencedLeaderEpoch: + return "kafka server: The leader epoch in the request is older than the epoch on the broker" + case ErrUnknownLeaderEpoch: + return "kafka server: The leader epoch in the request is newer than the epoch on the broker" + case ErrUnsupportedCompressionType: + return "kafka server: The requesting client does not support the compression type of given partition" + case ErrStaleBrokerEpoch: + return "kafka server: Broker epoch has changed" + case ErrOffsetNotAvailable: + return "kafka server: The leader high watermark has not caught up from a recent leader election so the offsets cannot be guaranteed to be monotonically increasing" + case ErrMemberIdRequired: + return "kafka server: The group member needs to have a valid member id before actually entering a consumer group" + case ErrPreferredLeaderNotAvailable: + return "kafka server: The preferred leader was not available" + case ErrGroupMaxSizeReached: + return "kafka server: Consumer group The consumer group has reached its max size. 
already has the configured maximum number of members" + case ErrFencedInstancedId: + return "kafka server: The broker rejected this static consumer since another consumer with the same group.instance.id has registered with a different member.id" + case ErrEligibleLeadersNotAvailable: + return "kafka server: Eligible topic partition leaders are not available" + case ErrElectionNotNeeded: + return "kafka server: Leader election not needed for topic partition" + case ErrNoReassignmentInProgress: + return "kafka server: No partition reassignment is in progress" + case ErrGroupSubscribedToTopic: + return "kafka server: Deleting offsets of a topic is forbidden while the consumer group is actively subscribed to it" + case ErrInvalidRecord: + return "kafka server: This record has failed the validation on broker and hence will be rejected" + case ErrUnstableOffsetCommit: + return "kafka server: There are unstable offsets that need to be cleared" + } + + return fmt.Sprintf("Unknown error, how did this happen? Error code = %d", err) +} diff --git a/aggregator/kafka/fetch_response.go b/aggregator/kafka/fetch_response.go new file mode 100644 index 0000000..9d0b1d0 --- /dev/null +++ b/aggregator/kafka/fetch_response.go @@ -0,0 +1,288 @@ +package kafka + +import ( + "errors" + + "github.com/ddosify/alaz/log" + + "time" +) + +type AbortedTransaction struct { + // ProducerID contains the producer id associated with the aborted transaction. + ProducerID int64 + // FirstOffset contains the first offset in the aborted transaction. + FirstOffset int64 +} + +func (t *AbortedTransaction) decode(pd packetDecoder) (err error) { + if t.ProducerID, err = pd.getInt64(); err != nil { + return err + } + + if t.FirstOffset, err = pd.getInt64(); err != nil { + return err + } + + return nil +} + +type FetchResponseBlock struct { + // Err contains the error code, or 0 if there was no fetch error. + Err KError + // HighWatermarkOffset contains the current high water mark. + HighWaterMarkOffset int64 + // LastStableOffset contains the last stable offset (or LSO) of the + // partition. This is the last offset such that the state of all + // transactional records prior to this offset have been decided (ABORTED or + // COMMITTED) + LastStableOffset int64 + LastRecordsBatchOffset *int64 + // LogStartOffset contains the current log start offset. + LogStartOffset int64 + // AbortedTransactions contains the aborted transactions. + AbortedTransactions []*AbortedTransaction + // PreferredReadReplica contains the preferred read replica for the + // consumer to use on its next fetch request + PreferredReadReplica int32 + // RecordsSet contains the record data. 
+ RecordsSet []*Records + + Partial bool + Records *Records // deprecated: use FetchResponseBlock.RecordsSet +} + +func (b *FetchResponseBlock) decode(pd packetDecoder, version int16) (err error) { + tmp, err := pd.getInt16() + if err != nil { + return err + } + b.Err = KError(tmp) + + b.HighWaterMarkOffset, err = pd.getInt64() + if err != nil { + return err + } + + if version >= 4 { + b.LastStableOffset, err = pd.getInt64() + if err != nil { + return err + } + + if version >= 5 { + b.LogStartOffset, err = pd.getInt64() + if err != nil { + return err + } + } + + numTransact, err := pd.getArrayLength() + if err != nil { + return err + } + + if numTransact >= 0 { + b.AbortedTransactions = make([]*AbortedTransaction, numTransact) + } + + for i := 0; i < numTransact; i++ { + transact := new(AbortedTransaction) + if err = transact.decode(pd); err != nil { + return err + } + b.AbortedTransactions[i] = transact + } + } + + if version >= 11 { + b.PreferredReadReplica, err = pd.getInt32() + if err != nil { + return err + } + } else { + b.PreferredReadReplica = -1 + } + + recordsSize, err := pd.getInt32() + if err != nil { + return err + } + + recordsDecoder, err := pd.getSubset(int(recordsSize)) + if err != nil { + return err + } + + b.RecordsSet = []*Records{} + + for recordsDecoder.remaining() > 0 { + records := &Records{} + if err := records.decode(recordsDecoder); err != nil { + // If we have at least one decoded records, this is not an error + if errors.Is(err, ErrInsufficientData) { + if len(b.RecordsSet) == 0 { + b.Partial = true + } + break + } + return err + } + + b.LastRecordsBatchOffset, err = records.recordsOffset() + if err != nil { + return err + } + + partial, err := records.isPartial() + if err != nil { + return err + } + + n, err := records.numRecords() + if err != nil { + return err + } + + if n > 0 || (partial && len(b.RecordsSet) == 0) { + b.RecordsSet = append(b.RecordsSet, records) + + if b.Records == nil { + b.Records = records + } + } + + overflow, err := records.isOverflow() + if err != nil { + return err + } + + if partial || overflow { + break + } + } + + return nil +} + +type FetchResponse struct { + // Version defines the protocol version to use for encode and decode + Version int16 + // ThrottleTime contains the duration in milliseconds for which the request + // was throttled due to a quota violation, or zero if the request did not + // violate any quota. + ThrottleTime time.Duration + // ErrorCode contains the top level response error code. + ErrorCode int16 + // SessionID contains the fetch session ID, or 0 if this is not part of a fetch session. + SessionID int32 + // Blocks contains the response topics. 
+ Blocks map[string]map[int32]*FetchResponseBlock + + LogAppendTime bool + Timestamp time.Time +} + +func (r *FetchResponse) decode(pd packetDecoder, version int16) (err error) { + r.Version = version + + if r.Version >= 1 { + throttle, err := pd.getInt32() + if err != nil { + return err + } + r.ThrottleTime = time.Duration(throttle) * time.Millisecond + } + + if r.Version >= 7 { + r.ErrorCode, err = pd.getInt16() + if err != nil { + return err + } + r.SessionID, err = pd.getInt32() + if err != nil { + return err + } + } + + numTopics, err := pd.getArrayLength() + if err != nil { + return err + } + + log.Logger.Warn().Msgf("sarama-numTopics: %d", numTopics) + + r.Blocks = make(map[string]map[int32]*FetchResponseBlock, numTopics) + for i := 0; i < numTopics; i++ { + name, err := pd.getString() + if err != nil { + return err + } + + numBlocks, err := pd.getArrayLength() + if err != nil { + return err + } + + r.Blocks[name] = make(map[int32]*FetchResponseBlock, numBlocks) + + for j := 0; j < numBlocks; j++ { + id, err := pd.getInt32() + if err != nil { + return err + } + + block := new(FetchResponseBlock) + err = block.decode(pd, version) + if err != nil { + return err + } + r.Blocks[name][id] = block + } + } + + return nil +} + +func (r *FetchResponse) key() int16 { + return 1 +} + +func (r *FetchResponse) version() int16 { + return r.Version +} + +func (r *FetchResponse) headerVersion() int16 { + return 0 +} + +func (r *FetchResponse) isValidVersion() bool { + return r.Version >= 0 && r.Version <= 11 +} + +func (r *FetchResponse) requiredVersion() KafkaVersion { + switch r.Version { + case 11: + return V2_3_0_0 + case 9, 10: + return V2_1_0_0 + case 8: + return V2_0_0_0 + case 7: + return V1_1_0_0 + case 6: + return V1_0_0_0 + case 4, 5: + return V0_11_0_0 + case 3: + return V0_10_1_0 + case 2: + return V0_10_0_0 + case 1: + return V0_9_0_0 + case 0: + return V0_8_2_0 + default: + return V2_3_0_0 + } +} diff --git a/aggregator/kafka/length_field.go b/aggregator/kafka/length_field.go new file mode 100644 index 0000000..f6119b2 --- /dev/null +++ b/aggregator/kafka/length_field.go @@ -0,0 +1,82 @@ +package kafka + +import ( + "encoding/binary" + "sync" +) + +// LengthField implements the PushEncoder and PushDecoder interfaces for calculating 4-byte lengths. 
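FetchResponse.decode above fills Blocks as a topic to partition to block map; an in-package, illustrative sketch of walking it (summarizeFetch is not part of the patch):

package kafka

import "fmt"

// summarizeFetch walks the nested Blocks map produced by FetchResponse.decode
// and prints per-partition errors and record counts.
func summarizeFetch(resp *FetchResponse) {
    for topic, partitions := range resp.Blocks {
        for partition, block := range partitions {
            if block.Err != ErrNoError {
                fmt.Printf("topic=%s partition=%d err=%v\n", topic, partition, block.Err)
                continue
            }
            for _, recs := range block.RecordsSet {
                if n, err := recs.numRecords(); err == nil {
                    fmt.Printf("topic=%s partition=%d records=%d\n", topic, partition, n)
                }
            }
        }
    }
}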
+type lengthField struct { + startOffset int + length int32 +} + +var lengthFieldPool = sync.Pool{} + +func acquireLengthField() *lengthField { + val := lengthFieldPool.Get() + if val != nil { + return val.(*lengthField) + } + return &lengthField{} +} + +func releaseLengthField(m *lengthField) { + lengthFieldPool.Put(m) +} + +func (l *lengthField) decode(pd packetDecoder) error { + var err error + l.length, err = pd.getInt32() + if err != nil { + return err + } + if l.length > int32(pd.remaining()) { + return ErrInsufficientData + } + return nil +} + +func (l *lengthField) saveOffset(in int) { + l.startOffset = in +} + +func (l *lengthField) reserveLength() int { + return 4 +} + +func (l *lengthField) check(curOffset int, buf []byte) error { + if int32(curOffset-l.startOffset-4) != l.length { + return PacketDecodingError{"length field invalid"} + } + + return nil +} + +type varintLengthField struct { + startOffset int + length int64 +} + +func (l *varintLengthField) decode(pd packetDecoder) error { + var err error + l.length, err = pd.getVarint() + return err +} + +func (l *varintLengthField) saveOffset(in int) { + l.startOffset = in +} + +func (l *varintLengthField) reserveLength() int { + var tmp [binary.MaxVarintLen64]byte + return binary.PutVarint(tmp[:], l.length) +} + +func (l *varintLengthField) check(curOffset int, buf []byte) error { + if int64(curOffset-l.startOffset-l.reserveLength()) != l.length { + return PacketDecodingError{"length field invalid"} + } + + return nil +} diff --git a/aggregator/kafka/message.go b/aggregator/kafka/message.go new file mode 100644 index 0000000..a019494 --- /dev/null +++ b/aggregator/kafka/message.go @@ -0,0 +1,146 @@ +package kafka + +import ( + "fmt" + "time" +) + +const ( + // CompressionNone no compression + CompressionNone CompressionCodec = iota + // CompressionGZIP compression using GZIP + CompressionGZIP + // CompressionSnappy compression using snappy + CompressionSnappy + // CompressionLZ4 compression using LZ4 + CompressionLZ4 + // CompressionZSTD compression using ZSTD + CompressionZSTD + + // The lowest 3 bits contain the compression codec used for the message + compressionCodecMask int8 = 0x07 + + // Bit 3 set for "LogAppend" timestamps + timestampTypeMask = 0x08 + + // CompressionLevelDefault is the constant to use in CompressionLevel + // to have the default compression level for any codec. The value is picked + // that we don't use any existing compression levels. + CompressionLevelDefault = -1000 +) + +// CompressionCodec represents the various compression codecs recognized by Kafka in messages. +type CompressionCodec int8 + +func (cc CompressionCodec) String() string { + return []string{ + "none", + "gzip", + "snappy", + "lz4", + "zstd", + }[int(cc)] +} + +// UnmarshalText returns a CompressionCodec from its string representation. +func (cc *CompressionCodec) UnmarshalText(text []byte) error { + codecs := map[string]CompressionCodec{ + "none": CompressionNone, + "gzip": CompressionGZIP, + "snappy": CompressionSnappy, + "lz4": CompressionLZ4, + "zstd": CompressionZSTD, + } + codec, ok := codecs[string(text)] + if !ok { + return fmt.Errorf("cannot parse %q as a compression codec", string(text)) + } + *cc = codec + return nil +} + +// MarshalText transforms a CompressionCodec into its string representation. 
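Because UnmarshalText above rejects unknown codec names, parsing user-supplied configuration is straightforward; an illustrative round-trip (codecExample is not part of the patch):

package kafka

import "fmt"

// codecExample parses a codec name with UnmarshalText and prints it back via
// String; unknown names such as "brotli" return an error.
func codecExample() error {
    var cc CompressionCodec
    if err := cc.UnmarshalText([]byte("snappy")); err != nil {
        return err
    }
    fmt.Println(cc.String()) // prints "snappy"
    return nil
}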
+func (cc CompressionCodec) MarshalText() ([]byte, error) {
+    return []byte(cc.String()), nil
+}
+
+// Message is a kafka message type
+type Message struct {
+    Codec            CompressionCodec // codec used to compress the message contents
+    CompressionLevel int              // compression level
+    LogAppendTime    bool             // the used timestamp is LogAppendTime
+    Key              []byte           // the message key, may be nil
+    Value            []byte           // the message contents
+    Set              *MessageSet      // the message set a message might wrap
+    Version          int8             // v1 requires Kafka 0.10
+    Timestamp        time.Time        // the timestamp of the message (version 1+ only)
+
+    // compressedCache []byte
+    // compressedSize int // used for computing the compression ratio metrics
+}
+
+func (m *Message) decode(pd packetDecoder) (err error) {
+    crc32Decoder := acquireCrc32Field(crcIEEE)
+    defer releaseCrc32Field(crc32Decoder)
+
+    err = pd.push(crc32Decoder)
+    if err != nil {
+        return err
+    }
+
+    m.Version, err = pd.getInt8()
+    if err != nil {
+        return err
+    }
+
+    if m.Version > 1 {
+        return PacketDecodingError{fmt.Sprintf("unknown magic byte (%v)", m.Version)}
+    }
+
+    attribute, err := pd.getInt8()
+    if err != nil {
+        return err
+    }
+    m.Codec = CompressionCodec(attribute & compressionCodecMask)
+    m.LogAppendTime = attribute&timestampTypeMask == timestampTypeMask
+
+    if m.Version == 1 {
+        if err := (Timestamp{&m.Timestamp}).decode(pd); err != nil {
+            return err
+        }
+    }
+
+    m.Key, err = pd.getBytes()
+    if err != nil {
+        return err
+    }
+
+    m.Value, err = pd.getBytes()
+    if err != nil {
+        return err
+    }
+
+    // Required for deep equal assertion during tests but might be useful
+    // for future metrics about the compression ratio in fetch requests
+    // m.compressedSize = len(m.Value)
+
+    if m.Value != nil && m.Codec != CompressionNone {
+        m.Value, err = decompress(m.Codec, m.Value)
+        if err != nil {
+            return err
+        }
+
+        if err := m.decodeSet(); err != nil {
+            return err
+        }
+    }
+
+    return pd.pop()
+}
+
+// decodes a message set from a previously encoded bulk-message
+func (m *Message) decodeSet() (err error) {
+    pd := realDecoder{raw: m.Value}
+    m.Set = &MessageSet{}
+    return m.Set.decode(&pd)
+}
diff --git a/aggregator/kafka/message_set.go b/aggregator/kafka/message_set.go
new file mode 100644
index 0000000..0c20568
--- /dev/null
+++ b/aggregator/kafka/message_set.go
@@ -0,0 +1,86 @@
+package kafka
+
+import "errors"
+
+type MessageBlock struct {
+    Offset int64
+    Msg    *Message
+}
+
+// Messages convenience helper which returns either all the
+// messages that are wrapped in this block
+func (msb *MessageBlock) Messages() []*MessageBlock {
+    if msb.Msg.Set != nil {
+        return msb.Msg.Set.Messages
+    }
+    return []*MessageBlock{msb}
+}
+
+func (msb *MessageBlock) decode(pd packetDecoder) (err error) {
+    if msb.Offset, err = pd.getInt64(); err != nil {
+        return err
+    }
+
+    lengthDecoder := acquireLengthField()
+    defer releaseLengthField(lengthDecoder)
+
+    if err = pd.push(lengthDecoder); err != nil {
+        return err
+    }
+
+    msb.Msg = new(Message)
+    if err = msb.Msg.decode(pd); err != nil {
+        return err
+    }
+
+    if err = pd.pop(); err != nil {
+        return err
+    }
+
+    return nil
+}
+
+type MessageSet struct {
+    PartialTrailingMessage bool // whether the set on the wire contained an incomplete trailing MessageBlock
+    OverflowMessage        bool // whether the set on the wire contained an overflow message
+    Messages               []*MessageBlock
+}
+
+func (ms *MessageSet) decode(pd packetDecoder) (err error) {
+    ms.Messages = nil
+
+    for pd.remaining() > 0 {
+        magic, err := magicValue(pd)
+        if err != nil {
+            if errors.Is(err,
ErrInsufficientData) { + ms.PartialTrailingMessage = true + return nil + } + return err + } + + if magic > 1 { + return nil + } + + msb := new(MessageBlock) + err = msb.decode(pd) + if err == nil { + ms.Messages = append(ms.Messages, msb) + } else if errors.Is(err, ErrInsufficientData) { + // As an optimization the server is allowed to return a partial message at the + // end of the message set. Clients should handle this case. So we just ignore such things. + if msb.Offset == -1 { + // This is an overflow message caused by chunked down conversion + ms.OverflowMessage = true + } else { + ms.PartialTrailingMessage = true + } + return nil + } else { + return err + } + } + + return nil +} diff --git a/aggregator/kafka/produce_request.go b/aggregator/kafka/produce_request.go new file mode 100644 index 0000000..08b7002 --- /dev/null +++ b/aggregator/kafka/produce_request.go @@ -0,0 +1,126 @@ +package kafka + +// RequiredAcks is used in Produce Requests to tell the broker how many replica acknowledgements +// it must see before responding. Any of the constants defined here are valid. On broker versions +// prior to 0.8.2.0 any other positive int16 is also valid (the broker will wait for that many +// acknowledgements) but in 0.8.2.0 and later this will raise an exception (it has been replaced +// by setting the `min.isr` value in the brokers configuration). +type RequiredAcks int16 + +const ( + // NoResponse doesn't send any response, the TCP ACK is all you get. + NoResponse RequiredAcks = 0 + // WaitForLocal waits for only the local commit to succeed before responding. + WaitForLocal RequiredAcks = 1 + // WaitForAll waits for all in-sync replicas to commit before responding. + // The minimum number of in-sync replicas is configured on the broker via + // the `min.insync.replicas` configuration key. 
+ WaitForAll RequiredAcks = -1 +) + +type ProduceRequest struct { + TransactionalID *string + RequiredAcks RequiredAcks + Timeout int32 + Version int16 // v1 requires Kafka 0.9, v2 requires Kafka 0.10, v3 requires Kafka 0.11 + Records map[string]map[int32]Records +} + +func (r *ProduceRequest) decode(pd packetDecoder, version int16) error { + r.Version = version + + if version >= 3 { + id, err := pd.getNullableString() + if err != nil { + return err + } + r.TransactionalID = id + } + requiredAcks, err := pd.getInt16() + if err != nil { + return err + } + r.RequiredAcks = RequiredAcks(requiredAcks) + if r.Timeout, err = pd.getInt32(); err != nil { + return err + } + topicCount, err := pd.getArrayLength() + if err != nil { + return err + } + if topicCount == 0 { + return nil + } + + r.Records = make(map[string]map[int32]Records) + for i := 0; i < topicCount; i++ { + topic, err := pd.getString() + if err != nil { + return err + } + partitionCount, err := pd.getArrayLength() + if err != nil { + return err + } + r.Records[topic] = make(map[int32]Records) + + for j := 0; j < partitionCount; j++ { + partition, err := pd.getInt32() + if err != nil { + return err + } + size, err := pd.getInt32() + if err != nil { + return err + } + recordsDecoder, err := pd.getSubset(int(size)) + if err != nil { + return err + } + var records Records + if err := records.decode(recordsDecoder); err != nil { + return err + } + r.Records[topic][partition] = records + } + } + + return nil +} + +func (r *ProduceRequest) key() int16 { + return 0 +} + +func (r *ProduceRequest) version() int16 { + return r.Version +} + +func (r *ProduceRequest) headerVersion() int16 { + return 1 +} + +func (r *ProduceRequest) isValidVersion() bool { + return r.Version >= 0 && r.Version <= 7 +} + +func (r *ProduceRequest) requiredVersion() KafkaVersion { + switch r.Version { + case 7: + return V2_1_0_0 + case 6: + return V2_0_0_0 + case 4, 5: + return V1_0_0_0 + case 3: + return V0_11_0_0 + case 2: + return V0_10_0_0 + case 1: + return V0_9_0_0 + case 0: + return V0_8_2_0 + default: + return V2_1_0_0 + } +} diff --git a/aggregator/kafka/real_decoder.go b/aggregator/kafka/real_decoder.go new file mode 100644 index 0000000..a5bd8c4 --- /dev/null +++ b/aggregator/kafka/real_decoder.go @@ -0,0 +1,461 @@ +package kafka + +import ( + "encoding/binary" + "math" +) + +var ( + errInvalidArrayLength = PacketDecodingError{"invalid array length"} + errInvalidByteSliceLength = PacketDecodingError{"invalid byteslice length"} + errInvalidStringLength = PacketDecodingError{"invalid string length"} + errVarintOverflow = PacketDecodingError{"varint overflow"} + errUVarintOverflow = PacketDecodingError{"uvarint overflow"} + errInvalidBool = PacketDecodingError{"invalid bool"} +) + +type realDecoder struct { + raw []byte + off int + stack []pushDecoder +} + +// primitives + +func (rd *realDecoder) getInt8() (int8, error) { + if rd.remaining() < 1 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := int8(rd.raw[rd.off]) + rd.off++ + return tmp, nil +} + +func (rd *realDecoder) getInt16() (int16, error) { + if rd.remaining() < 2 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := int16(binary.BigEndian.Uint16(rd.raw[rd.off:])) + rd.off += 2 + return tmp, nil +} + +func (rd *realDecoder) getInt32() (int32, error) { + if rd.remaining() < 4 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := int32(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + return tmp, nil +} + +func (rd *realDecoder) getInt64() 
(int64, error) { + if rd.remaining() < 8 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := int64(binary.BigEndian.Uint64(rd.raw[rd.off:])) + rd.off += 8 + return tmp, nil +} + +func (rd *realDecoder) getVarint() (int64, error) { + tmp, n := binary.Varint(rd.raw[rd.off:]) + if n == 0 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + if n < 0 { + rd.off -= n + return -1, errVarintOverflow + } + rd.off += n + return tmp, nil +} + +func (rd *realDecoder) getUVarint() (uint64, error) { + tmp, n := binary.Uvarint(rd.raw[rd.off:]) + if n == 0 { + rd.off = len(rd.raw) + return 0, ErrInsufficientData + } + + if n < 0 { + rd.off -= n + return 0, errUVarintOverflow + } + + rd.off += n + return tmp, nil +} + +func (rd *realDecoder) getFloat64() (float64, error) { + if rd.remaining() < 8 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := math.Float64frombits(binary.BigEndian.Uint64(rd.raw[rd.off:])) + rd.off += 8 + return tmp, nil +} + +func (rd *realDecoder) getArrayLength() (int, error) { + if rd.remaining() < 4 { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } + tmp := int(int32(binary.BigEndian.Uint32(rd.raw[rd.off:]))) + rd.off += 4 + if tmp > rd.remaining() { + rd.off = len(rd.raw) + return -1, ErrInsufficientData + } else if tmp > 2*math.MaxUint16 { + return -1, errInvalidArrayLength + } + return tmp, nil +} + +func (rd *realDecoder) getCompactArrayLength() (int, error) { + n, err := rd.getUVarint() + if err != nil { + return 0, err + } + + if n == 0 { + return 0, nil + } + + return int(n) - 1, nil +} + +func (rd *realDecoder) getBool() (bool, error) { + b, err := rd.getInt8() + if err != nil || b == 0 { + return false, err + } + if b != 1 { + return false, errInvalidBool + } + return true, nil +} + +func (rd *realDecoder) getEmptyTaggedFieldArray() (int, error) { + tagCount, err := rd.getUVarint() + if err != nil { + return 0, err + } + + // skip over any tagged fields without deserializing them + // as we don't currently support doing anything with them + for i := uint64(0); i < tagCount; i++ { + // fetch and ignore tag identifier + _, err := rd.getUVarint() + if err != nil { + return 0, err + } + length, err := rd.getUVarint() + if err != nil { + return 0, err + } + if _, err := rd.getRawBytes(int(length)); err != nil { + return 0, err + } + } + + return 0, nil +} + +// collections + +func (rd *realDecoder) getBytes() ([]byte, error) { + tmp, err := rd.getInt32() + if err != nil { + return nil, err + } + if tmp == -1 { + return nil, nil + } + + return rd.getRawBytes(int(tmp)) +} + +func (rd *realDecoder) getVarintBytes() ([]byte, error) { + tmp, err := rd.getVarint() + if err != nil { + return nil, err + } + if tmp == -1 { + return nil, nil + } + + return rd.getRawBytes(int(tmp)) +} + +func (rd *realDecoder) getCompactBytes() ([]byte, error) { + n, err := rd.getUVarint() + if err != nil { + return nil, err + } + + length := int(n - 1) + return rd.getRawBytes(length) +} + +func (rd *realDecoder) getStringLength() (int, error) { + length, err := rd.getInt16() + if err != nil { + return 0, err + } + + n := int(length) + + switch { + case n < -1: + return 0, errInvalidStringLength + case n > rd.remaining(): + rd.off = len(rd.raw) + return 0, ErrInsufficientData + } + + return n, nil +} + +func (rd *realDecoder) getString() (string, error) { + n, err := rd.getStringLength() + if err != nil || n == -1 { + return "", err + } + + tmpStr := string(rd.raw[rd.off : rd.off+n]) + rd.off += n + return tmpStr, nil +} + +func (rd 
*realDecoder) getNullableString() (*string, error) { + n, err := rd.getStringLength() + if err != nil || n == -1 { + return nil, err + } + + tmpStr := string(rd.raw[rd.off : rd.off+n]) + rd.off += n + return &tmpStr, err +} + +func (rd *realDecoder) getCompactString() (string, error) { + n, err := rd.getUVarint() + if err != nil { + return "", err + } + + length := int(n - 1) + if length < 0 { + return "", errInvalidByteSliceLength + } + tmpStr := string(rd.raw[rd.off : rd.off+length]) + rd.off += length + return tmpStr, nil +} + +func (rd *realDecoder) getCompactNullableString() (*string, error) { + n, err := rd.getUVarint() + if err != nil { + return nil, err + } + + length := int(n - 1) + + if length < 0 { + return nil, err + } + + tmpStr := string(rd.raw[rd.off : rd.off+length]) + rd.off += length + return &tmpStr, err +} + +func (rd *realDecoder) getCompactInt32Array() ([]int32, error) { + n, err := rd.getUVarint() + if err != nil { + return nil, err + } + + if n == 0 { + return nil, nil + } + + arrayLength := int(n) - 1 + + ret := make([]int32, arrayLength) + + for i := range ret { + ret[i] = int32(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + } + return ret, nil +} + +func (rd *realDecoder) getInt32Array() ([]int32, error) { + if rd.remaining() < 4 { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + n := int(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + + if rd.remaining() < 4*n { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + + if n == 0 { + return nil, nil + } + + if n < 0 { + return nil, errInvalidArrayLength + } + + ret := make([]int32, n) + for i := range ret { + ret[i] = int32(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + } + return ret, nil +} + +func (rd *realDecoder) getInt64Array() ([]int64, error) { + if rd.remaining() < 4 { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + n := int(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + + if rd.remaining() < 8*n { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + + if n == 0 { + return nil, nil + } + + if n < 0 { + return nil, errInvalidArrayLength + } + + ret := make([]int64, n) + for i := range ret { + ret[i] = int64(binary.BigEndian.Uint64(rd.raw[rd.off:])) + rd.off += 8 + } + return ret, nil +} + +func (rd *realDecoder) getStringArray() ([]string, error) { + if rd.remaining() < 4 { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + n := int(binary.BigEndian.Uint32(rd.raw[rd.off:])) + rd.off += 4 + + if n == 0 { + return nil, nil + } + + if n < 0 { + return nil, errInvalidArrayLength + } + + ret := make([]string, n) + for i := range ret { + str, err := rd.getString() + if err != nil { + return nil, err + } + + ret[i] = str + } + return ret, nil +} + +// subsets + +func (rd *realDecoder) remaining() int { + return len(rd.raw) - rd.off +} + +func (rd *realDecoder) getSubset(length int) (packetDecoder, error) { + buf, err := rd.getRawBytes(length) + if err != nil { + return nil, err + } + return &realDecoder{raw: buf}, nil +} + +func (rd *realDecoder) getRawBytes(length int) ([]byte, error) { + if length < 0 { + return nil, errInvalidByteSliceLength + } else if length > rd.remaining() { + rd.off = len(rd.raw) + return nil, ErrInsufficientData + } + + start := rd.off + rd.off += length + return rd.raw[start:rd.off], nil +} + +func (rd *realDecoder) peek(offset, length int) (packetDecoder, error) { + if rd.remaining() < offset+length { + return nil, ErrInsufficientData + } + off := rd.off + offset + return 
&realDecoder{raw: rd.raw[off : off+length]}, nil +} + +func (rd *realDecoder) peekInt8(offset int) (int8, error) { + const byteLen = 1 + if rd.remaining() < offset+byteLen { + return -1, ErrInsufficientData + } + return int8(rd.raw[rd.off+offset]), nil +} + +// stacks + +func (rd *realDecoder) push(in pushDecoder) error { + in.saveOffset(rd.off) + + var reserve int + if dpd, ok := in.(dynamicPushDecoder); ok { + if err := dpd.decode(rd); err != nil { + return err + } + } else { + reserve = in.reserveLength() + if rd.remaining() < reserve { + rd.off = len(rd.raw) + return ErrInsufficientData + } + } + + rd.stack = append(rd.stack, in) + + rd.off += reserve + + return nil +} + +func (rd *realDecoder) pop() error { + // this is go's ugly pop pattern (the inverse of append) + in := rd.stack[len(rd.stack)-1] + rd.stack = rd.stack[:len(rd.stack)-1] + + return in.check(rd.off, rd.raw) +} diff --git a/aggregator/kafka/record.go b/aggregator/kafka/record.go new file mode 100644 index 0000000..ea968bb --- /dev/null +++ b/aggregator/kafka/record.go @@ -0,0 +1,87 @@ +package kafka + +import ( + "encoding/binary" + "time" +) + +const ( + isTransactionalMask = 0x10 + controlMask = 0x20 + maximumRecordOverhead = 5*binary.MaxVarintLen32 + binary.MaxVarintLen64 + 1 +) + +// RecordHeader stores key and value for a record header +type RecordHeader struct { + Key []byte + Value []byte +} + +func (h *RecordHeader) decode(pd packetDecoder) (err error) { + if h.Key, err = pd.getVarintBytes(); err != nil { + return err + } + + if h.Value, err = pd.getVarintBytes(); err != nil { + return err + } + return nil +} + +// Record is kafka record type +type Record struct { + Headers []*RecordHeader + + Attributes int8 + TimestampDelta time.Duration + OffsetDelta int64 + Key []byte + Value []byte + length varintLengthField +} + +func (r *Record) decode(pd packetDecoder) (err error) { + if err = pd.push(&r.length); err != nil { + return err + } + + if r.Attributes, err = pd.getInt8(); err != nil { + return err + } + + timestamp, err := pd.getVarint() + if err != nil { + return err + } + r.TimestampDelta = time.Duration(timestamp) * time.Millisecond + + if r.OffsetDelta, err = pd.getVarint(); err != nil { + return err + } + + if r.Key, err = pd.getVarintBytes(); err != nil { + return err + } + + if r.Value, err = pd.getVarintBytes(); err != nil { + return err + } + + numHeaders, err := pd.getVarint() + if err != nil { + return err + } + + if numHeaders >= 0 { + r.Headers = make([]*RecordHeader, numHeaders) + } + for i := int64(0); i < numHeaders; i++ { + hdr := new(RecordHeader) + if err := hdr.decode(pd); err != nil { + return err + } + r.Headers[i] = hdr + } + + return pd.pop() +} diff --git a/aggregator/kafka/record_batch.go b/aggregator/kafka/record_batch.go new file mode 100644 index 0000000..2a700e4 --- /dev/null +++ b/aggregator/kafka/record_batch.go @@ -0,0 +1,147 @@ +package kafka + +import ( + "errors" + "time" +) + +const recordBatchOverhead = 49 + +type recordsArray []*Record + +func (e recordsArray) decode(pd packetDecoder) error { + records := make([]Record, len(e)) + for i := range e { + if err := records[i].decode(pd); err != nil { + return err + } + e[i] = &records[i] + } + return nil +} + +type RecordBatch struct { + FirstOffset int64 + PartitionLeaderEpoch int32 + Version int8 + Codec CompressionCodec + CompressionLevel int + Control bool + LogAppendTime bool + LastOffsetDelta int32 + FirstTimestamp time.Time + MaxTimestamp time.Time + ProducerID int64 + ProducerEpoch int16 + FirstSequence int32 + Records 
[]*Record
+    PartialTrailingRecord bool
+    IsTransactional       bool
+
+    compressedRecords []byte
+    recordsLen        int // uncompressed records size
+}
+
+func (b *RecordBatch) LastOffset() int64 {
+    return b.FirstOffset + int64(b.LastOffsetDelta)
+}
+
+func (b *RecordBatch) decode(pd packetDecoder) (err error) {
+    if b.FirstOffset, err = pd.getInt64(); err != nil {
+        return err
+    }
+
+    batchLen, err := pd.getInt32()
+    if err != nil {
+        return err
+    }
+
+    if b.PartitionLeaderEpoch, err = pd.getInt32(); err != nil {
+        return err
+    }
+
+    if b.Version, err = pd.getInt8(); err != nil {
+        return err
+    }
+
+    crc32Decoder := acquireCrc32Field(crcCastagnoli)
+    defer releaseCrc32Field(crc32Decoder)
+
+    if err = pd.push(crc32Decoder); err != nil {
+        return err
+    }
+
+    attributes, err := pd.getInt16()
+    if err != nil {
+        return err
+    }
+    b.Codec = CompressionCodec(int8(attributes) & compressionCodecMask)
+    b.Control = attributes&controlMask == controlMask
+    b.LogAppendTime = attributes&timestampTypeMask == timestampTypeMask
+    b.IsTransactional = attributes&isTransactionalMask == isTransactionalMask
+
+    if b.LastOffsetDelta, err = pd.getInt32(); err != nil {
+        return err
+    }
+
+    if err = (Timestamp{&b.FirstTimestamp}).decode(pd); err != nil {
+        return err
+    }
+
+    if err = (Timestamp{&b.MaxTimestamp}).decode(pd); err != nil {
+        return err
+    }
+
+    if b.ProducerID, err = pd.getInt64(); err != nil {
+        return err
+    }
+
+    if b.ProducerEpoch, err = pd.getInt16(); err != nil {
+        return err
+    }
+
+    if b.FirstSequence, err = pd.getInt32(); err != nil {
+        return err
+    }
+
+    numRecs, err := pd.getArrayLength()
+    if err != nil {
+        return err
+    }
+    if numRecs >= 0 {
+        b.Records = make([]*Record, numRecs)
+    }
+
+    bufSize := int(batchLen) - recordBatchOverhead
+    recBuffer, err := pd.getRawBytes(bufSize)
+    if err != nil {
+        if errors.Is(err, ErrInsufficientData) {
+            b.PartialTrailingRecord = true
+            b.Records = nil
+            return nil
+        }
+        return err
+    }
+
+    if err = pd.pop(); err != nil {
+        return err
+    }
+
+    recBuffer, err = decompress(b.Codec, recBuffer)
+    if err != nil {
+        return err
+    }
+
+    b.recordsLen = len(recBuffer)
+    err = decode(recBuffer, recordsArray(b.Records))
+    if errors.Is(err, ErrInsufficientData) {
+        b.PartialTrailingRecord = true
+        b.Records = nil
+        return nil
+    }
+    return err
+}
+
+func (b *RecordBatch) addRecord(r *Record) {
+    b.Records = append(b.Records, r)
+}
diff --git a/aggregator/kafka/records.go b/aggregator/kafka/records.go
new file mode 100644
index 0000000..78c009a
--- /dev/null
+++ b/aggregator/kafka/records.go
@@ -0,0 +1,153 @@
+package kafka
+
+import "fmt"
+
+const (
+    unknownRecords = iota
+    legacyRecords
+    defaultRecords
+
+    magicOffset = 16
+)
+
+// Records implements a union type containing either a RecordBatch or a legacy MessageSet.
+type Records struct {
+    recordsType int
+    MsgSet      *MessageSet
+    RecordBatch *RecordBatch
+}
+
+// setTypeFromFields sets type of Records depending on which of MsgSet or RecordBatch is not nil.
+// The first return value indicates whether both fields are nil (and the type is not set).
+// If both fields are not nil, it returns an error.
+func (r *Records) setTypeFromFields() (bool, error) { + if r.MsgSet == nil && r.RecordBatch == nil { + return true, nil + } + if r.MsgSet != nil && r.RecordBatch != nil { + return false, fmt.Errorf("both MsgSet and RecordBatch are set, but record type is unknown") + } + r.recordsType = defaultRecords + if r.MsgSet != nil { + r.recordsType = legacyRecords + } + return false, nil +} + +func (r *Records) setTypeFromMagic(pd packetDecoder) error { + magic, err := magicValue(pd) + if err != nil { + return err + } + + r.recordsType = defaultRecords + if magic < 2 { + r.recordsType = legacyRecords + } + + return nil +} + +func (r *Records) decode(pd packetDecoder) error { + if r.recordsType == unknownRecords { + if err := r.setTypeFromMagic(pd); err != nil { + return err + } + } + + switch r.recordsType { + case legacyRecords: + r.MsgSet = &MessageSet{} + return r.MsgSet.decode(pd) + case defaultRecords: + r.RecordBatch = &RecordBatch{} + return r.RecordBatch.decode(pd) + } + return fmt.Errorf("unknown records type: %v", r.recordsType) +} + +func (r *Records) numRecords() (int, error) { + if r.recordsType == unknownRecords { + if empty, err := r.setTypeFromFields(); err != nil || empty { + return 0, err + } + } + + switch r.recordsType { + case legacyRecords: + if r.MsgSet == nil { + return 0, nil + } + return len(r.MsgSet.Messages), nil + case defaultRecords: + if r.RecordBatch == nil { + return 0, nil + } + return len(r.RecordBatch.Records), nil + } + return 0, fmt.Errorf("unknown records type: %v", r.recordsType) +} + +func (r *Records) isPartial() (bool, error) { + if r.recordsType == unknownRecords { + if empty, err := r.setTypeFromFields(); err != nil || empty { + return false, err + } + } + + switch r.recordsType { + case unknownRecords: + return false, nil + case legacyRecords: + if r.MsgSet == nil { + return false, nil + } + return r.MsgSet.PartialTrailingMessage, nil + case defaultRecords: + if r.RecordBatch == nil { + return false, nil + } + return r.RecordBatch.PartialTrailingRecord, nil + } + return false, fmt.Errorf("unknown records type: %v", r.recordsType) +} + +func (r *Records) isOverflow() (bool, error) { + if r.recordsType == unknownRecords { + if empty, err := r.setTypeFromFields(); err != nil || empty { + return false, err + } + } + + switch r.recordsType { + case unknownRecords: + return false, nil + case legacyRecords: + if r.MsgSet == nil { + return false, nil + } + return r.MsgSet.OverflowMessage, nil + case defaultRecords: + return false, nil + } + return false, fmt.Errorf("unknown records type: %v", r.recordsType) +} + +func (r *Records) recordsOffset() (*int64, error) { + switch r.recordsType { + case unknownRecords: + return nil, nil + case legacyRecords: + return nil, nil + case defaultRecords: + if r.RecordBatch == nil { + return nil, nil + } + return &r.RecordBatch.FirstOffset, nil + } + return nil, fmt.Errorf("unknown records type: %v", r.recordsType) +} + +func magicValue(pd packetDecoder) (int8, error) { + return pd.peekInt8(magicOffset) +} diff --git a/aggregator/kafka/request.go b/aggregator/kafka/request.go new file mode 100644 index 0000000..d88b695 --- /dev/null +++ b/aggregator/kafka/request.go @@ -0,0 +1,216 @@ +package kafka + +import ( + "encoding/binary" + "fmt" + "io" +) + +// KafkaVersion instances represent versions of the upstream Kafka broker. 
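Records.decode above peeks the magic byte at magicOffset and dispatches to the legacy MessageSet or the RecordBatch path; an in-package, illustrative sketch of driving it over a raw payload (decodeRecords is not part of the patch):

package kafka

// decodeRecords feeds a raw record payload to Records.decode, which peeks the
// magic byte and picks the legacy MessageSet or RecordBatch decoding path.
func decodeRecords(raw []byte) (*Records, error) {
    r := &Records{}
    if err := r.decode(&realDecoder{raw: raw}); err != nil {
        return nil, err
    }
    return r, nil
}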
+type KafkaVersion struct { + // it's a struct rather than just typing the array directly to make it opaque and stop people + // generating their own arbitrary versions + version [4]uint +} + +type ProtocolBody interface { + // encoder + versionedDecoder + key() int16 + version() int16 + headerVersion() int16 + isValidVersion() bool + requiredVersion() KafkaVersion +} + +const MaxRequestSize int32 = 100 * 1024 * 1024 + +func (r *Request) decode(pd packetDecoder) (err error) { + key, err := pd.getInt16() + if err != nil { + return err + } + + version, err := pd.getInt16() + if err != nil { + return err + } + + r.CorrelationID, err = pd.getInt32() + if err != nil { + return err + } + + r.ClientID, err = pd.getString() + if err != nil { + return err + } + + r.Body = allocateBody(key, version) + if r.Body == nil { + return fmt.Errorf(fmt.Sprintf("unknown request key (%d)", key)) + } + + if r.Body.headerVersion() >= 2 { + // tagged field + _, err = pd.getUVarint() + if err != nil { + return err + } + } + + return r.Body.decode(pd, version) +} + +type Request struct { + CorrelationID int32 + ClientID string + Body ProtocolBody +} + +func allocateBody(key, version int16) ProtocolBody { + switch key { + case 0: + return &ProduceRequest{Version: version} + // case 1: + // return &FetchRequest{Version: version} + // case 2: + // return &OffsetRequest{Version: version} + // case 3: + // return &MetadataRequest{Version: version} + // // 4: LeaderAndIsrRequest + // // 5: StopReplicaRequest + // // 6: UpdateMetadataRequest + // // 7: ControlledShutdownRequest + // case 8: + // return &OffsetCommitRequest{Version: version} + // case 9: + // return &OffsetFetchRequest{Version: version} + // case 10: + // return &FindCoordinatorRequest{Version: version} + // case 11: + // return &JoinGroupRequest{Version: version} + // case 12: + // return &HeartbeatRequest{Version: version} + // case 13: + // return &LeaveGroupRequest{Version: version} + // case 14: + // return &SyncGroupRequest{Version: version} + // case 15: + // return &DescribeGroupsRequest{Version: version} + // case 16: + // return &ListGroupsRequest{Version: version} + // case 17: + // return &SaslHandshakeRequest{Version: version} + // case 18: + // return &ApiVersionsRequest{Version: version} + // case 19: + // return &CreateTopicsRequest{Version: version} + // case 20: + // return &DeleteTopicsRequest{Version: version} + // case 21: + // return &DeleteRecordsRequest{Version: version} + // case 22: + // return &InitProducerIDRequest{Version: version} + // // 23: OffsetForLeaderEpochRequest + // case 24: + // return &AddPartitionsToTxnRequest{Version: version} + // case 25: + // return &AddOffsetsToTxnRequest{Version: version} + // case 26: + // return &EndTxnRequest{Version: version} + // // 27: WriteTxnMarkersRequest + // case 28: + // return &TxnOffsetCommitRequest{Version: version} + // case 29: + // return &DescribeAclsRequest{Version: int(version)} + // case 30: + // return &CreateAclsRequest{Version: version} + // case 31: + // return &DeleteAclsRequest{Version: int(version)} + // case 32: + // return &DescribeConfigsRequest{Version: version} + // case 33: + // return &AlterConfigsRequest{Version: version} + // // 34: AlterReplicaLogDirsRequest + // case 35: + // return &DescribeLogDirsRequest{Version: version} + // case 36: + // return &SaslAuthenticateRequest{Version: version} + // case 37: + // return &CreatePartitionsRequest{Version: version} + // // 38: CreateDelegationTokenRequest + // // 39: RenewDelegationTokenRequest + // // 40: 
ExpireDelegationTokenRequest + // // 41: DescribeDelegationTokenRequest + // case 42: + // return &DeleteGroupsRequest{Version: version} + // // 43: ElectLeadersRequest + // case 44: + // return &IncrementalAlterConfigsRequest{Version: version} + // case 45: + // return &AlterPartitionReassignmentsRequest{Version: version} + // case 46: + // return &ListPartitionReassignmentsRequest{Version: version} + // case 47: + // return &DeleteOffsetsRequest{Version: version} + // case 48: + // return &DescribeClientQuotasRequest{Version: version} + // case 49: + // return &AlterClientQuotasRequest{Version: version} + // case 50: + // return &DescribeUserScramCredentialsRequest{Version: version} + // case 51: + // return &AlterUserScramCredentialsRequest{Version: version} + // 52: VoteRequest + // 53: BeginQuorumEpochRequest + // 54: EndQuorumEpochRequest + // 55: DescribeQuorumRequest + // 56: AlterPartitionRequest + // 57: UpdateFeaturesRequest + // 58: EnvelopeRequest + // 59: FetchSnapshotRequest + // 60: DescribeClusterRequest + // 61: DescribeProducersRequest + // 62: BrokerRegistrationRequest + // 63: BrokerHeartbeatRequest + // 64: UnregisterBrokerRequest + // 65: DescribeTransactionsRequest + // 66: ListTransactionsRequest + // 67: AllocateProducerIdsRequest + // 68: ConsumerGroupHeartbeatRequest + } + return nil +} + +func DecodeRequest(r io.Reader) (*Request, int, error) { + var ( + bytesRead int + lengthBytes = make([]byte, 4) + ) + + if _, err := io.ReadFull(r, lengthBytes); err != nil { + return nil, bytesRead, err + } + + bytesRead += len(lengthBytes) + length := int32(binary.BigEndian.Uint32(lengthBytes)) + + if length <= 4 || length > MaxRequestSize { + return nil, bytesRead, PacketDecodingError{fmt.Sprintf("message of length %d too large or too small", length)} + } + + encodedReq := make([]byte, length) + if _, err := io.ReadFull(r, encodedReq); err != nil { + return nil, bytesRead, err + } + + bytesRead += len(encodedReq) + + req := &Request{} + if err := decode(encodedReq, req); err != nil { + return nil, bytesRead, err + } + + return req, bytesRead, nil +} diff --git a/aggregator/kafka/response_header.go b/aggregator/kafka/response_header.go new file mode 100644 index 0000000..384e027 --- /dev/null +++ b/aggregator/kafka/response_header.go @@ -0,0 +1,313 @@ +package kafka + +import "fmt" + +const MaxResponseSize int32 = 100 * 1024 * 1024 + +// headerVersion derives the header version from the request api key and request api version +// +//nolint:funlen,gocognit,gocyclo,cyclop,maintidx +func ResponseHeaderVersion(apiKey, apiVersion int16) int16 { + switch apiKey { + case 0: // Produce + if apiVersion >= 9 { + return 1 + } + return 0 + case 1: // Fetch + if apiVersion >= 12 { + return 1 + } + return 0 + case 2: // ListOffsets + if apiVersion >= 6 { + return 1 + } + return 0 + case 3: // Metadata + if apiVersion >= 9 { + return 1 + } + return 0 + case 4: // LeaderAndIsr + if apiVersion >= 4 { + return 1 + } + return 0 + case 5: // StopReplica + if apiVersion >= 2 { + return 1 + } + return 0 + case 6: // UpdateMetadata + if apiVersion >= 6 { + return 1 + } + return 0 + case 7: // ControlledShutdown + if apiVersion >= 3 { + return 1 + } + return 0 + case 8: // OffsetCommit + if apiVersion >= 8 { + return 1 + } + return 0 + case 9: // OffsetFetch + if apiVersion >= 6 { + return 1 + } + return 0 + case 10: // FindCoordinator + if apiVersion >= 3 { + return 1 + } + return 0 + case 11: // JoinGroup + if apiVersion >= 6 { + return 1 + } + return 0 + case 12: // Heartbeat + if apiVersion >= 
4 { + return 1 + } + return 0 + case 13: // LeaveGroup + if apiVersion >= 4 { + return 1 + } + return 0 + case 14: // SyncGroup + if apiVersion >= 4 { + return 1 + } + return 0 + case 15: // DescribeGroups + if apiVersion >= 5 { + return 1 + } + return 0 + case 16: // ListGroups + if apiVersion >= 3 { + return 1 + } + return 0 + case 17: // SaslHandshake + return 0 + case 18: // ApiVersions + // ApiVersionsResponse always includes a v0 header. + // See KIP-511 for details. + return 0 + case 19: // CreateTopics + if apiVersion >= 5 { + return 1 + } + return 0 + case 20: // DeleteTopics + if apiVersion >= 4 { + return 1 + } + return 0 + case 21: // DeleteRecords + if apiVersion >= 2 { + return 1 + } + return 0 + case 22: // InitProducerId + if apiVersion >= 2 { + return 1 + } + return 0 + case 23: // OffsetForLeaderEpoch + if apiVersion >= 4 { + return 1 + } + return 0 + case 24: // AddPartitionsToTxn + if apiVersion >= 3 { + return 1 + } + return 0 + case 25: // AddOffsetsToTxn + if apiVersion >= 3 { + return 1 + } + return 0 + case 26: // EndTxn + if apiVersion >= 3 { + return 1 + } + return 0 + case 27: // WriteTxnMarkers + if apiVersion >= 1 { + return 1 + } + return 0 + case 28: // TxnOffsetCommit + if apiVersion >= 3 { + return 1 + } + return 0 + case 29: // DescribeAcls + if apiVersion >= 2 { + return 1 + } + return 0 + case 30: // CreateAcls + if apiVersion >= 2 { + return 1 + } + return 0 + case 31: // DeleteAcls + if apiVersion >= 2 { + return 1 + } + return 0 + case 32: // DescribeConfigs + if apiVersion >= 4 { + return 1 + } + return 0 + case 33: // AlterConfigs + if apiVersion >= 2 { + return 1 + } + return 0 + case 34: // AlterReplicaLogDirs + if apiVersion >= 2 { + return 1 + } + return 0 + case 35: // DescribeLogDirs + if apiVersion >= 2 { + return 1 + } + return 0 + case 36: // SaslAuthenticate + if apiVersion >= 2 { + return 1 + } + return 0 + case 37: // CreatePartitions + if apiVersion >= 2 { + return 1 + } + return 0 + case 38: // CreateDelegationToken + if apiVersion >= 2 { + return 1 + } + return 0 + case 39: // RenewDelegationToken + if apiVersion >= 2 { + return 1 + } + return 0 + case 40: // ExpireDelegationToken + if apiVersion >= 2 { + return 1 + } + return 0 + case 41: // DescribeDelegationToken + if apiVersion >= 2 { + return 1 + } + return 0 + case 42: // DeleteGroups + if apiVersion >= 2 { + return 1 + } + return 0 + case 43: // ElectLeaders + if apiVersion >= 2 { + return 1 + } + return 0 + case 44: // IncrementalAlterConfigs + if apiVersion >= 1 { + return 1 + } + return 0 + case 45: // AlterPartitionReassignments + return 1 + case 46: // ListPartitionReassignments + return 1 + case 47: // OffsetDelete + return 0 + case 48: // DescribeClientQuotas + if apiVersion >= 1 { + return 1 + } + return 0 + case 49: // AlterClientQuotas + if apiVersion >= 1 { + return 1 + } + return 0 + case 50: // DescribeUserScramCredentials + return 1 + case 51: // AlterUserScramCredentials + return 1 + case 52: // Vote + return 1 + case 53: // BeginQuorumEpoch + return 0 + case 54: // EndQuorumEpoch + return 0 + case 55: // DescribeQuorum + return 1 + case 56: // AlterIsr + return 1 + case 57: // UpdateFeatures + return 1 + case 58: // Envelope + return 1 + case 59: // FetchSnapshot + return 1 + case 60: // DescribeCluster + return 1 + case 61: // DescribeProducers + return 1 + case 62: // BrokerRegistration + return 1 + case 63: // BrokerHeartbeat + return 1 + case 64: // UnregisterBroker + return 1 + case 65: // DescribeTransactions + return 1 + case 66: // ListTransactions + 
return 1 + case 67: // AllocateProducerIds + return 1 + default: + return -1 + } +} + +type ResponseHeader struct { + Length int32 + CorrelationID int32 +} + +func (r *ResponseHeader) decode(pd packetDecoder, version int16) (err error) { + r.Length, err = pd.getInt32() + if err != nil { + return err + } + if r.Length <= 4 || r.Length > MaxResponseSize { + return PacketDecodingError{fmt.Sprintf("message of length %d too large or too small", r.Length)} + } + + r.CorrelationID, err = pd.getInt32() + + if version >= 1 { + if _, err := pd.getEmptyTaggedFieldArray(); err != nil { + return err + } + } + + return err +} diff --git a/aggregator/kafka/timestamp.go b/aggregator/kafka/timestamp.go new file mode 100644 index 0000000..456d396 --- /dev/null +++ b/aggregator/kafka/timestamp.go @@ -0,0 +1,26 @@ +package kafka + +import ( + "time" +) + +type Timestamp struct { + *time.Time +} + +func (t Timestamp) decode(pd packetDecoder) error { + millis, err := pd.getInt64() + if err != nil { + return err + } + + // negative timestamps are invalid, in these cases we should return + // a zero time + timestamp := time.Time{} + if millis >= 0 { + timestamp = time.Unix(millis/1000, (millis%1000)*int64(time.Millisecond)) + } + + *t.Time = timestamp + return nil +} diff --git a/aggregator/kafka/versions.go b/aggregator/kafka/versions.go new file mode 100644 index 0000000..7cb18dd --- /dev/null +++ b/aggregator/kafka/versions.go @@ -0,0 +1,294 @@ +package kafka + +import ( + "bufio" + "fmt" + "net" + "regexp" +) + +type none struct{} + +// make []int32 sortable so we can sort partition numbers +type int32Slice []int32 + +func (slice int32Slice) Len() int { + return len(slice) +} + +func (slice int32Slice) Less(i, j int) bool { + return slice[i] < slice[j] +} + +func (slice int32Slice) Swap(i, j int) { + slice[i], slice[j] = slice[j], slice[i] +} + +func dupInt32Slice(input []int32) []int32 { + ret := make([]int32, 0, len(input)) + ret = append(ret, input...) + return ret +} + +// Encoder is a simple interface for any type that can be encoded as an array of bytes +// in order to be sent as the key or value of a Kafka message. Length() is provided as an +// optimization, and must return the same as len() on the result of Encode(). +type Encoder interface { + Encode() ([]byte, error) + Length() int +} + +// make strings and byte slices encodable for convenience so they can be used as keys +// and/or values in kafka messages + +// StringEncoder implements the Encoder interface for Go strings so that they can be used +// as the Key or Value in a ProducerMessage. +type StringEncoder string + +func (s StringEncoder) Encode() ([]byte, error) { + return []byte(s), nil +} + +func (s StringEncoder) Length() int { + return len(s) +} + +// ByteEncoder implements the Encoder interface for Go byte slices so that they can be used +// as the Key or Value in a ProducerMessage. +type ByteEncoder []byte + +func (b ByteEncoder) Encode() ([]byte, error) { + return b, nil +} + +func (b ByteEncoder) Length() int { + return len(b) +} + +// bufConn wraps a net.Conn with a buffer for reads to reduce the number of +// reads that trigger syscalls. 
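The Timestamp decoder above reads Kafka's signed 64-bit millisecond timestamp and maps negative values (brokers use -1 for "no timestamp") to the zero time.Time. A small sketch of the same conversion in isolation, with hypothetical inputs:

```go
// Sketch of the conversion performed by Timestamp.decode: milliseconds since
// the Unix epoch become a time.Time, and negative values become the zero time.
package main

import (
	"fmt"
	"time"
)

func kafkaMillisToTime(millis int64) time.Time {
	if millis < 0 {
		return time.Time{} // invalid / unset timestamp
	}
	// Equivalent to time.UnixMilli(millis) for non-negative inputs.
	return time.Unix(millis/1000, (millis%1000)*int64(time.Millisecond))
}

func main() {
	fmt.Println(kafkaMillisToTime(1700000000123).UTC()) // 2023-11-14 22:13:20.123 +0000 UTC
	fmt.Println(kafkaMillisToTime(-1).IsZero())         // true
}
```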
+type bufConn struct { + net.Conn + buf *bufio.Reader +} + +func newBufConn(conn net.Conn) *bufConn { + return &bufConn{ + Conn: conn, + buf: bufio.NewReader(conn), + } +} + +func (bc *bufConn) Read(b []byte) (n int, err error) { + return bc.buf.Read(b) +} + +func newKafkaVersion(major, minor, veryMinor, patch uint) KafkaVersion { + return KafkaVersion{ + version: [4]uint{major, minor, veryMinor, patch}, + } +} + +// IsAtLeast return true if and only if the version it is called on is +// greater than or equal to the version passed in: +// +// V1.IsAtLeast(V2) // false +// V2.IsAtLeast(V1) // true +func (v KafkaVersion) IsAtLeast(other KafkaVersion) bool { + for i := range v.version { + if v.version[i] > other.version[i] { + return true + } else if v.version[i] < other.version[i] { + return false + } + } + return true +} + +// Effective constants defining the supported kafka versions. +var ( + V0_8_2_0 = newKafkaVersion(0, 8, 2, 0) + V0_8_2_1 = newKafkaVersion(0, 8, 2, 1) + V0_8_2_2 = newKafkaVersion(0, 8, 2, 2) + V0_9_0_0 = newKafkaVersion(0, 9, 0, 0) + V0_9_0_1 = newKafkaVersion(0, 9, 0, 1) + V0_10_0_0 = newKafkaVersion(0, 10, 0, 0) + V0_10_0_1 = newKafkaVersion(0, 10, 0, 1) + V0_10_1_0 = newKafkaVersion(0, 10, 1, 0) + V0_10_1_1 = newKafkaVersion(0, 10, 1, 1) + V0_10_2_0 = newKafkaVersion(0, 10, 2, 0) + V0_10_2_1 = newKafkaVersion(0, 10, 2, 1) + V0_10_2_2 = newKafkaVersion(0, 10, 2, 2) + V0_11_0_0 = newKafkaVersion(0, 11, 0, 0) + V0_11_0_1 = newKafkaVersion(0, 11, 0, 1) + V0_11_0_2 = newKafkaVersion(0, 11, 0, 2) + V1_0_0_0 = newKafkaVersion(1, 0, 0, 0) + V1_0_1_0 = newKafkaVersion(1, 0, 1, 0) + V1_0_2_0 = newKafkaVersion(1, 0, 2, 0) + V1_1_0_0 = newKafkaVersion(1, 1, 0, 0) + V1_1_1_0 = newKafkaVersion(1, 1, 1, 0) + V2_0_0_0 = newKafkaVersion(2, 0, 0, 0) + V2_0_1_0 = newKafkaVersion(2, 0, 1, 0) + V2_1_0_0 = newKafkaVersion(2, 1, 0, 0) + V2_1_1_0 = newKafkaVersion(2, 1, 1, 0) + V2_2_0_0 = newKafkaVersion(2, 2, 0, 0) + V2_2_1_0 = newKafkaVersion(2, 2, 1, 0) + V2_2_2_0 = newKafkaVersion(2, 2, 2, 0) + V2_3_0_0 = newKafkaVersion(2, 3, 0, 0) + V2_3_1_0 = newKafkaVersion(2, 3, 1, 0) + V2_4_0_0 = newKafkaVersion(2, 4, 0, 0) + V2_4_1_0 = newKafkaVersion(2, 4, 1, 0) + V2_5_0_0 = newKafkaVersion(2, 5, 0, 0) + V2_5_1_0 = newKafkaVersion(2, 5, 1, 0) + V2_6_0_0 = newKafkaVersion(2, 6, 0, 0) + V2_6_1_0 = newKafkaVersion(2, 6, 1, 0) + V2_6_2_0 = newKafkaVersion(2, 6, 2, 0) + V2_6_3_0 = newKafkaVersion(2, 6, 3, 0) + V2_7_0_0 = newKafkaVersion(2, 7, 0, 0) + V2_7_1_0 = newKafkaVersion(2, 7, 1, 0) + V2_7_2_0 = newKafkaVersion(2, 7, 2, 0) + V2_8_0_0 = newKafkaVersion(2, 8, 0, 0) + V2_8_1_0 = newKafkaVersion(2, 8, 1, 0) + V2_8_2_0 = newKafkaVersion(2, 8, 2, 0) + V3_0_0_0 = newKafkaVersion(3, 0, 0, 0) + V3_0_1_0 = newKafkaVersion(3, 0, 1, 0) + V3_0_2_0 = newKafkaVersion(3, 0, 2, 0) + V3_1_0_0 = newKafkaVersion(3, 1, 0, 0) + V3_1_1_0 = newKafkaVersion(3, 1, 1, 0) + V3_1_2_0 = newKafkaVersion(3, 1, 2, 0) + V3_2_0_0 = newKafkaVersion(3, 2, 0, 0) + V3_2_1_0 = newKafkaVersion(3, 2, 1, 0) + V3_2_2_0 = newKafkaVersion(3, 2, 2, 0) + V3_2_3_0 = newKafkaVersion(3, 2, 3, 0) + V3_3_0_0 = newKafkaVersion(3, 3, 0, 0) + V3_3_1_0 = newKafkaVersion(3, 3, 1, 0) + V3_3_2_0 = newKafkaVersion(3, 3, 2, 0) + V3_4_0_0 = newKafkaVersion(3, 4, 0, 0) + V3_4_1_0 = newKafkaVersion(3, 4, 1, 0) + V3_5_0_0 = newKafkaVersion(3, 5, 0, 0) + V3_5_1_0 = newKafkaVersion(3, 5, 1, 0) + V3_6_0_0 = newKafkaVersion(3, 6, 0, 0) + + SupportedVersions = []KafkaVersion{ + V0_8_2_0, + V0_8_2_1, + V0_8_2_2, + V0_9_0_0, + V0_9_0_1, + V0_10_0_0, + V0_10_0_1, + 
V0_10_1_0, + V0_10_1_1, + V0_10_2_0, + V0_10_2_1, + V0_10_2_2, + V0_11_0_0, + V0_11_0_1, + V0_11_0_2, + V1_0_0_0, + V1_0_1_0, + V1_0_2_0, + V1_1_0_0, + V1_1_1_0, + V2_0_0_0, + V2_0_1_0, + V2_1_0_0, + V2_1_1_0, + V2_2_0_0, + V2_2_1_0, + V2_2_2_0, + V2_3_0_0, + V2_3_1_0, + V2_4_0_0, + V2_4_1_0, + V2_5_0_0, + V2_5_1_0, + V2_6_0_0, + V2_6_1_0, + V2_6_2_0, + V2_7_0_0, + V2_7_1_0, + V2_8_0_0, + V2_8_1_0, + V2_8_2_0, + V3_0_0_0, + V3_0_1_0, + V3_0_2_0, + V3_1_0_0, + V3_1_1_0, + V3_1_2_0, + V3_2_0_0, + V3_2_1_0, + V3_2_2_0, + V3_2_3_0, + V3_3_0_0, + V3_3_1_0, + V3_3_2_0, + V3_4_0_0, + V3_4_1_0, + V3_5_0_0, + V3_5_1_0, + V3_6_0_0, + } + MinVersion = V0_8_2_0 + MaxVersion = V3_6_0_0 + DefaultVersion = V2_1_0_0 + + // reduced set of protocol versions to matrix test + fvtRangeVersions = []KafkaVersion{ + V0_8_2_2, + V0_10_2_2, + V1_0_2_0, + V1_1_1_0, + V2_0_1_0, + V2_2_2_0, + V2_4_1_0, + V2_6_2_0, + V2_8_2_0, + V3_1_2_0, + V3_3_2_0, + V3_6_0_0, + } +) + +var ( + // This regex validates that a string complies with the pre kafka 1.0.0 format for version strings, for example 0.11.0.3 + validPreKafka1Version = regexp.MustCompile(`^0\.\d+\.\d+\.\d+$`) + + // This regex validates that a string complies with the post Kafka 1.0.0 format, for example 1.0.0 + validPostKafka1Version = regexp.MustCompile(`^\d+\.\d+\.\d+$`) +) + +// ParseKafkaVersion parses and returns kafka version or error from a string +func ParseKafkaVersion(s string) (KafkaVersion, error) { + if len(s) < 5 { + return DefaultVersion, fmt.Errorf("invalid version `%s`", s) + } + var major, minor, veryMinor, patch uint + var err error + if s[0] == '0' { + err = scanKafkaVersion(s, validPreKafka1Version, "0.%d.%d.%d", [3]*uint{&minor, &veryMinor, &patch}) + } else { + err = scanKafkaVersion(s, validPostKafka1Version, "%d.%d.%d", [3]*uint{&major, &minor, &veryMinor}) + } + if err != nil { + return DefaultVersion, err + } + return newKafkaVersion(major, minor, veryMinor, patch), nil +} + +func scanKafkaVersion(s string, pattern *regexp.Regexp, format string, v [3]*uint) error { + if !pattern.MatchString(s) { + return fmt.Errorf("invalid version `%s`", s) + } + _, err := fmt.Sscanf(s, format, v[0], v[1], v[2]) + return err +} + +func (v KafkaVersion) String() string { + if v.version[0] == 0 { + return fmt.Sprintf("0.%d.%d.%d", v.version[1], v.version[2], v.version[3]) + } + + return fmt.Sprintf("%d.%d.%d", v.version[0], v.version[1], v.version[2]) +} diff --git a/aggregator/kafka/ztsd.go b/aggregator/kafka/ztsd.go new file mode 100644 index 0000000..2d0523f --- /dev/null +++ b/aggregator/kafka/ztsd.go @@ -0,0 +1,67 @@ +package kafka + +import ( + "sync" + + "github.com/klauspost/compress/zstd" +) + +// zstdMaxBufferedEncoders maximum number of not-in-use zstd encoders +// If the pool of encoders is exhausted then new encoders will be created on the fly +const zstdMaxBufferedEncoders = 1 + +type ZstdEncoderParams struct { + Level int +} +type ZstdDecoderParams struct { +} + +var zstdDecMap sync.Map + +var zstdAvailableEncoders sync.Map + +func getZstdEncoderChannel(params ZstdEncoderParams) chan *zstd.Encoder { + if c, ok := zstdAvailableEncoders.Load(params); ok { + return c.(chan *zstd.Encoder) + } + c, _ := zstdAvailableEncoders.LoadOrStore(params, make(chan *zstd.Encoder, zstdMaxBufferedEncoders)) + return c.(chan *zstd.Encoder) +} + +func getZstdEncoder(params ZstdEncoderParams) *zstd.Encoder { + select { + case enc := <-getZstdEncoderChannel(params): + return enc + default: + encoderLevel := zstd.SpeedDefault + if params.Level != 
CompressionLevelDefault { + encoderLevel = zstd.EncoderLevelFromZstd(params.Level) + } + zstdEnc, _ := zstd.NewWriter(nil, zstd.WithZeroFrames(true), + zstd.WithEncoderLevel(encoderLevel), + zstd.WithEncoderConcurrency(1)) + return zstdEnc + } +} + +func releaseEncoder(params ZstdEncoderParams, enc *zstd.Encoder) { + select { + case getZstdEncoderChannel(params) <- enc: + default: + } +} + +func getDecoder(params ZstdDecoderParams) *zstd.Decoder { + if ret, ok := zstdDecMap.Load(params); ok { + return ret.(*zstd.Decoder) + } + // It's possible to race and create multiple new readers. + // Only one will survive GC after use. + zstdDec, _ := zstd.NewReader(nil, zstd.WithDecoderConcurrency(0)) + zstdDecMap.Store(params, zstdDec) + return zstdDec +} + +func zstdDecompress(params ZstdDecoderParams, dst, src []byte) ([]byte, error) { + return getDecoder(params).DecodeAll(src, dst) +} diff --git a/aggregator/sock_line_test.go b/aggregator/sock_line_test.go index adbd53f..fce5289 100644 --- a/aggregator/sock_line_test.go +++ b/aggregator/sock_line_test.go @@ -1,6 +1,7 @@ package aggregator import ( + "context" "fmt" "sync" "testing" @@ -9,7 +10,7 @@ import ( func TestSocketLine(t *testing.T) { sockLine := &SocketLine{ - Values: []TimestampedSocket{}, + Values: []*TimestampedSocket{}, } tsList := []uint64{ @@ -349,7 +350,7 @@ func TestSocketLine(t *testing.T) { func TestXxx(t *testing.T) { assumedInterval := uint64(2 * time.Second) - nl := NewSocketLine(1, 0) + nl := NewSocketLine(context.Background(), 1, 0, false) wg := sync.WaitGroup{} wg.Add(1) @@ -440,7 +441,7 @@ func TestXxx(t *testing.T) { } func TestXxx2(t *testing.T) { - nl := NewSocketLine(1, 0) + nl := NewSocketLine(context.Background(), 1, 0, false) s1 := &SockInfo{ Pid: 0, @@ -472,7 +473,7 @@ func TestXxx2(t *testing.T) { } func TestAlreadyEstablishCanBeFound(t *testing.T) { - nl := NewSocketLine(1, 0) + nl := NewSocketLine(context.Background(), 1, 0, false) s1 := &SockInfo{ Pid: 0, diff --git a/aggregator/sock_num_line.go b/aggregator/sock_num_line.go index 6e91250..4459b9d 100644 --- a/aggregator/sock_num_line.go +++ b/aggregator/sock_num_line.go @@ -3,6 +3,7 @@ package aggregator import ( "bufio" "bytes" + "context" "encoding/binary" "encoding/hex" "fmt" @@ -29,38 +30,53 @@ type SocketLine struct { mu sync.RWMutex pid uint32 fd uint64 - Values []TimestampedSocket + Values []*TimestampedSocket + + ctx context.Context } -func NewSocketLine(pid uint32, fd uint64) *SocketLine { +func NewSocketLine(ctx context.Context, pid uint32, fd uint64, fetch bool) *SocketLine { skLine := &SocketLine{ mu: sync.RWMutex{}, pid: pid, fd: fd, - Values: make([]TimestampedSocket, 0), + Values: make([]*TimestampedSocket, 0), + ctx: ctx, } + if fetch { + err := skLine.getConnectionInfo() // populate + if err != nil { + log.Logger.Error().Ctx(ctx).Err(err).Msg("getConnectionInfo failed") + } + } return skLine } +// clears all socket history +func (nl *SocketLine) ClearAll() { + clear(nl.Values) // sets all values to zero values (nil in this case), we do this for garbage collection + nl.Values = nl.Values[:0] // change len +} + func (nl *SocketLine) AddValue(timestamp uint64, sockInfo *SockInfo) { + // // ignore close events + // if sockInfo == nil { + // return + // } + nl.mu.Lock() defer nl.mu.Unlock() - // ignore close events - if sockInfo == nil { - return - } - // if last element is equal to the current element, ignore if len(nl.Values) > 0 { last := nl.Values[len(nl.Values)-1].SockInfo - if last != nil && last.Saddr == sockInfo.Saddr && last.Sport == 
sockInfo.Sport && last.Daddr == sockInfo.Daddr && last.Dport == sockInfo.Dport { + if last != nil && sockInfo != nil && last.Saddr == sockInfo.Saddr && last.Sport == sockInfo.Sport && last.Daddr == sockInfo.Daddr && last.Dport == sockInfo.Dport { return } } - nl.Values = insertIntoSortedSlice(nl.Values, TimestampedSocket{Timestamp: timestamp, SockInfo: sockInfo}) + nl.Values = insertIntoSortedSlice(nl.Values, &TimestampedSocket{Timestamp: timestamp, SockInfo: sockInfo}) } func (nl *SocketLine) GetValue(timestamp uint64) (*SockInfo, error) { @@ -78,10 +94,26 @@ func (nl *SocketLine) GetValue(timestamp uint64) (*SockInfo, error) { if index == len(nl.Values) { // The timestamp is after the last entry, so return the last value nl.Values[index-1].LastMatch = uint64(time.Now().UnixNano()) + if nl.Values[len(nl.Values)-1].SockInfo == nil { + if index-2 >= 0 && index-2 < len(nl.Values) && nl.Values[index-2].SockInfo != nil && + (timestamp-nl.Values[index-2].Timestamp) < uint64(1*time.Minute.Nanoseconds()) { // processing latency matters + return nl.Values[index-2].SockInfo, nil + } + return nil, fmt.Errorf("closed socket on last entry") + } return nl.Values[len(nl.Values)-1].SockInfo, nil } if index == 0 { + // In case of tcp established event read from user-space on event of socket not found, + // timestamp is set from userspace. + // and timestamps belonging to requests waiting to be processed becomes smaller. + // on that case, select first socket open, avoiding data loss. + + if nl.Values[0].SockInfo != nil { + return nl.Values[0].SockInfo, nil + } + // The timestamp is before or equal to the first entry, so return an error return nil, fmt.Errorf("no smaller value found") } @@ -89,14 +121,40 @@ func (nl *SocketLine) GetValue(timestamp uint64) (*SockInfo, error) { si := nl.Values[index-1].SockInfo if si == nil { - // The timestamp is exactly on a socket close + // The timestamp is matched on a socket close + // Check closest open sockets and if daddr+dport's are same, send one of them. + + prev := index - 2 + var prevSock *TimestampedSocket + if prev >= 0 && prev < len(nl.Values) { + prevSock = nl.Values[prev] + } + + after := index + var afterSock *TimestampedSocket + if after >= 0 && after < len(nl.Values) { + afterSock = nl.Values[after] + } + + if prevSock != nil && prevSock.SockInfo != nil && + afterSock != nil && afterSock.SockInfo != nil { + if prevSock.SockInfo.Daddr == afterSock.SockInfo.Daddr && + prevSock.SockInfo.Dport == afterSock.SockInfo.Dport { + // pick the closest one. 
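The close-record handling in GetValue checks the open sockets on either side of the matched close event and, when both point at the same destination, returns whichever is closer in time instead of failing the lookup. A hedged sketch of the underlying timestamped lookup, using simplified stand-in types rather than TimestampedSocket/SockInfo:

```go
// Simplified sketch (stand-in types, not the aggregator's structs) of the
// timestamped lookup behind SocketLine.GetValue: binary-search the sorted
// timestamps, then resolve against the closest entry at or before the request.
package main

import (
	"fmt"
	"sort"
)

type entry struct {
	ts   uint64
	addr string // stands in for *SockInfo; "" models a close event
}

func lookup(values []entry, ts uint64) (string, bool) {
	idx := sort.Search(len(values), func(i int) bool { return values[i].ts > ts })
	if idx == 0 {
		return "", false // nothing recorded at or before ts
	}
	e := values[idx-1]
	if e.addr == "" {
		return "", false // the socket was closed at that point
	}
	return e.addr, true
}

func main() {
	vals := []entry{{10, "10.0.0.1:80"}, {20, ""}, {30, "10.0.0.2:80"}}
	fmt.Println(lookup(vals, 15)) // 10.0.0.1:80 true
	fmt.Println(lookup(vals, 25)) // "" false (landed on a close record)
}
```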
+ if timestamp-prevSock.Timestamp < afterSock.Timestamp-timestamp { + return prevSock.SockInfo, nil + } else { + return afterSock.SockInfo, nil + } + } + } + return nil, fmt.Errorf("closed socket") } // Return the value associated with the closest previous timestamp - nl.Values[index-1].LastMatch = uint64(time.Now().UnixNano()) - return nl.Values[index-1].SockInfo, nil + return si, nil } func (nl *SocketLine) DeleteUnused() { @@ -110,7 +168,7 @@ func (nl *SocketLine) DeleteUnused() { // if two open sockets are alined, delete the first one // in case first ones close event did not arrive - result := make([]TimestampedSocket, 0) + result := make([]*TimestampedSocket, 0) i := 0 for i < len(nl.Values)-1 { if nl.Values[i].SockInfo != nil && nl.Values[i+1].SockInfo != nil { @@ -251,13 +309,13 @@ const ( stateListen = "0A" ) -func insertIntoSortedSlice(sortedSlice []TimestampedSocket, newItem TimestampedSocket) []TimestampedSocket { +func insertIntoSortedSlice(sortedSlice []*TimestampedSocket, newItem *TimestampedSocket) []*TimestampedSocket { idx := sort.Search(len(sortedSlice), func(i int) bool { return sortedSlice[i].Timestamp >= newItem.Timestamp }) // Insert the new item at the correct position. - sortedSlice = append(sortedSlice, TimestampedSocket{}) + sortedSlice = append(sortedSlice, &TimestampedSocket{}) copy(sortedSlice[idx+1:], sortedSlice[idx:]) sortedSlice[idx] = newItem @@ -340,6 +398,8 @@ func parseTcpLine(line string) (localIP string, localPort int, remoteIP string, } func (nl *SocketLine) getConnectionInfo() error { + now := time.Now() + inode, err := getInodeFromFD(fmt.Sprintf("%d", nl.pid), fmt.Sprintf("%d", nl.fd)) if err != nil { return err @@ -363,7 +423,8 @@ func (nl *SocketLine) getConnectionInfo() error { // add to socket line // convert to bpf time - log.Logger.Debug().Msgf("Adding socket line read from user space %v", skInfo) - nl.AddValue(convertUserTimeToKernelTime(uint64(time.Now().UnixNano())), skInfo) + log.Logger.Debug().Ctx(nl.ctx).Msgf("Adding socket line read from user space %v", skInfo) + nl.ClearAll() // clear all previous records + nl.AddValue(convertUserTimeToKernelTime(uint64(now.UnixNano())), skInfo) return nil } diff --git a/aggregator/socket.go b/aggregator/socket.go new file mode 100644 index 0000000..da86682 --- /dev/null +++ b/aggregator/socket.go @@ -0,0 +1,75 @@ +package aggregator + +import ( + "context" + "sync" + + "github.com/ddosify/alaz/log" +) + +// We need to keep track of the following +// in order to build find relationships between +// connections and pods/services +type SockInfo struct { + Pid uint32 `json:"pid"` + Fd uint64 `json:"fd"` + Saddr string `json:"saddr"` + Sport uint16 `json:"sport"` + Daddr string `json:"daddr"` + Dport uint16 `json:"dport"` +} + +// type SocketMap +type SocketMap struct { + mu *sync.RWMutex + pid uint32 + M map[uint64]*SocketLine `json:"fdToSockLine"` // fd -> SockLine + waitingFds chan uint64 + + processedFds map[uint64]struct{} + processedFdsmu sync.RWMutex + closeCh chan struct{} + ctx context.Context +} + +// only one worker can create socket lines for a particular process(socketmap) +func (sm *SocketMap) ProcessSocketLineCreationRequests() { + for { + select { + case <-sm.closeCh: + sm.M = nil + return + case fd := <-sm.waitingFds: + if _, ok := sm.M[fd]; !ok { + sm.createSocketLine(fd, true) + log.Logger.Debug().Ctx(sm.ctx). + Uint32("pid", sm.pid). + Uint64("fd", fd). 
+ Msgf("created socket line for fd:%d", fd) + } + } + } +} + +func (sm *SocketMap) SignalSocketLine(ctx context.Context, fd uint64) { + sm.processedFdsmu.RLock() + if _, ok := sm.processedFds[fd]; ok { + sm.processedFdsmu.RUnlock() + return + } else { + sm.processedFdsmu.RUnlock() + + sm.processedFdsmu.Lock() + sm.processedFds[fd] = struct{}{} + sm.processedFdsmu.Unlock() + } + + sm.waitingFds <- fd +} + +func (sm *SocketMap) createSocketLine(fd uint64, fetch bool) { + skLine := NewSocketLine(sm.ctx, sm.pid, fd, fetch) + sm.mu.Lock() + sm.M[fd] = skLine + sm.mu.Unlock() +} diff --git a/config/db.go b/config/db.go index db003c6..3475ff4 100644 --- a/config/db.go +++ b/config/db.go @@ -14,6 +14,7 @@ type BackendDSConfig struct { GpuMetricsExport bool MetricsExportInterval int // in seconds - ReqBufferSize int - ConnBufferSize int + ReqBufferSize int + ConnBufferSize int + KafkaEventBufferSize int } diff --git a/datastore/backend.go b/datastore/backend.go index 8bb503a..a68a40d 100644 --- a/datastore/backend.go +++ b/datastore/backend.go @@ -140,10 +140,13 @@ type BackendDS struct { c *http.Client batchSize uint64 - reqChanBuffer chan *ReqInfo - connChanBuffer chan *ConnInfo - reqInfoPool *poolutil.Pool[*ReqInfo] - aliveConnPool *poolutil.Pool[*ConnInfo] + reqChanBuffer chan *ReqInfo + connChanBuffer chan *ConnInfo + kafkaChanBuffer chan *KafkaEventInfo + + reqInfoPool *poolutil.Pool[*ReqInfo] + aliveConnPool *poolutil.Pool[*ConnInfo] + kafkaEventInfoPool *poolutil.Pool[*KafkaEventInfo] traceEventQueue *list.List traceEventMu sync.RWMutex @@ -169,16 +172,17 @@ type BackendDS struct { } const ( - podEndpoint = "/pod/" - svcEndpoint = "/svc/" - rsEndpoint = "/replicaset/" - depEndpoint = "/deployment/" - epEndpoint = "/endpoint/" - containerEndpoint = "/container/" - dsEndpoint = "/daemonset/" - ssEndpoint = "/statefulset/" - reqEndpoint = "/requests/" - connEndpoint = "/connections/" + podEndpoint = "/pod/" + svcEndpoint = "/svc/" + rsEndpoint = "/replicaset/" + depEndpoint = "/deployment/" + epEndpoint = "/endpoint/" + containerEndpoint = "/container/" + dsEndpoint = "/daemonset/" + ssEndpoint = "/statefulset/" + reqEndpoint = "/requests/" + connEndpoint = "/connections/" + kafkaEventEndpoint = "/events/kafka/" traceEventEndpoint = "/dist_tracing/traffic/" @@ -291,9 +295,11 @@ func NewBackendDS(parentCtx context.Context, conf config.BackendDSConfig) *Backe batchSize: bs, reqInfoPool: newReqInfoPool(func() *ReqInfo { return &ReqInfo{} }, func(r *ReqInfo) {}), aliveConnPool: newAliveConnPool(func() *ConnInfo { return &ConnInfo{} }, func(r *ConnInfo) {}), + kafkaEventInfoPool: newKafkaEventPool(func() *KafkaEventInfo { return &KafkaEventInfo{} }, func(r *KafkaEventInfo) {}), traceInfoPool: newTraceInfoPool(func() *TraceInfo { return &TraceInfo{} }, func(r *TraceInfo) {}), reqChanBuffer: make(chan *ReqInfo, conf.ReqBufferSize), connChanBuffer: make(chan *ConnInfo, conf.ConnBufferSize), + kafkaChanBuffer: make(chan *KafkaEventInfo, conf.ReqBufferSize), podEventChan: make(chan interface{}, 5*resourceChanSize), svcEventChan: make(chan interface{}, 2*resourceChanSize), rsEventChan: make(chan interface{}, 2*resourceChanSize), @@ -313,7 +319,8 @@ func NewBackendDS(parentCtx context.Context, conf config.BackendDSConfig) *Backe func (ds *BackendDS) Start() { go ds.sendReqsInBatch(ds.batchSize) - go ds.sendConnsInBatch(ds.batchSize) + go ds.sendConnsInBatch(ds.batchSize / 2) + go ds.sendKafkaEventsInBatch(ds.batchSize / 2) go ds.sendTraceEventsInBatch(10 * ds.batchSize) // events are resynced every 60 
seconds on k8s informers @@ -458,6 +465,18 @@ func convertReqsToPayload(batch []*ReqInfo) RequestsPayload { } } +func convertKafkaEventsToPayload(batch []*KafkaEventInfo) KafkaEventInfoPayload { + return KafkaEventInfoPayload{ + Metadata: Metadata{ + MonitoringID: MonitoringID, + IdempotencyKey: string(uuid.NewUUID()), + NodeID: NodeID, + AlazVersion: tag, + }, + KafkaEvents: batch, + } +} + func convertConnsToPayload(batch []*ConnInfo) ConnInfoPayload { return ConnInfoPayload{ Metadata: Metadata{ @@ -528,7 +547,9 @@ func (b *BackendDS) sendToBackend(method string, payload interface{}, endpoint s return } - log.Logger.Debug().Str("endpoint", endpoint).Any("payload", payload).Msg("sending batch to backend") + // if endpoint == reqEndpoint { + // log.Logger.Debug().Str("endpoint", endpoint).Any("payload", payload).Msg("sending batch to backend") + // } err = b.DoRequest(httpReq) if err != nil { log.Logger.Error().Msgf("backend persist error at ep %s : %v", endpoint, err) @@ -609,6 +630,48 @@ func (b *BackendDS) sendReqsInBatch(batchSize uint64) { } +func (b *BackendDS) sendKafkaEventsInBatch(batchSize uint64) { + t := time.NewTicker(5 * time.Second) + defer t.Stop() + + send := func() { + batch := make([]*KafkaEventInfo, 0, batchSize) + loop := true + + for i := 0; (i < int(batchSize)) && loop; i++ { + select { + case req := <-b.kafkaChanBuffer: + batch = append(batch, req) + case <-time.After(50 * time.Millisecond): + loop = false + } + } + + if len(batch) == 0 { + return + } + + log.Logger.Debug().Any("batch", batch).Msg("sending batch of kafka events") + kEventsPayload := convertKafkaEventsToPayload(batch) + go b.sendToBackend(http.MethodPost, kEventsPayload, kafkaEventEndpoint) + + for _, req := range batch { + b.kafkaEventInfoPool.Put(req) + } + } + + for { + select { + case <-b.ctx.Done(): + log.Logger.Info().Msg("stopping sending kafka events to backend") + return + case <-t.C: + send() + } + } + +} + func (b *BackendDS) sendConnsInBatch(batchSize uint64) { t := time.NewTicker(30 * time.Second) defer t.Stop() @@ -724,6 +787,14 @@ func newTraceInfoPool(factory func() *TraceInfo, close func(*TraceInfo)) *poolut } } +func newKafkaEventPool(factory func() *KafkaEventInfo, close func(*KafkaEventInfo)) *poolutil.Pool[*KafkaEventInfo] { + return &poolutil.Pool[*KafkaEventInfo]{ + Items: make(chan *KafkaEventInfo, 1000), + Factory: factory, + Close: close, + } +} + func (b *BackendDS) PersistAliveConnection(aliveConn *AliveConnection) error { // get a connInfo from the pool oc := b.aliveConnPool.Get() @@ -773,6 +844,35 @@ func (b *BackendDS) PersistRequest(request *Request) error { return nil } +func (b *BackendDS) PersistKafkaEvent(ke *KafkaEvent) error { + // get a reqInfo from the pool + kafkaInfo := b.kafkaEventInfoPool.Get() + + // overwrite the reqInfo, all fields must be set in order to avoid conflict + kafkaInfo[0] = ke.StartTime + kafkaInfo[1] = ke.Latency + kafkaInfo[2] = ke.FromIP + kafkaInfo[3] = ke.FromType + kafkaInfo[4] = ke.FromUID + kafkaInfo[5] = ke.FromPort + kafkaInfo[6] = ke.ToIP + kafkaInfo[7] = ke.ToType + kafkaInfo[8] = ke.ToUID + kafkaInfo[9] = ke.ToPort + kafkaInfo[10] = ke.Topic + kafkaInfo[11] = ke.Partition + kafkaInfo[12] = ke.Key + kafkaInfo[13] = ke.Value + kafkaInfo[14] = ke.Type + kafkaInfo[15] = ke.Tls + kafkaInfo[16] = ke.Seq + kafkaInfo[17] = ke.Tid + + b.kafkaChanBuffer <- kafkaInfo + + return nil +} + func (b *BackendDS) PersistTraceEvent(trace *l7_req.TraceEvent) error { if trace == nil { return fmt.Errorf("trace event is nil") diff --git 
a/datastore/datastore.go b/datastore/datastore.go index d43d77a..5de1829 100644 --- a/datastore/datastore.go +++ b/datastore/datastore.go @@ -16,6 +16,8 @@ type DataStore interface { PersistRequest(request *Request) error + PersistKafkaEvent(request *KafkaEvent) error + PersistTraceEvent(trace *l7_req.TraceEvent) error PersistAliveConnection(trace *AliveConnection) error diff --git a/datastore/dto.go b/datastore/dto.go index 7ae5e28..d03a1a7 100644 --- a/datastore/dto.go +++ b/datastore/dto.go @@ -105,6 +105,74 @@ type AliveConnection struct { ToPort uint16 } +type DirectionalEvent interface { + SetFromUID(string) + SetFromIP(string) + SetFromType(string) + SetFromPort(uint16) + + SetToUID(string) + SetToIP(string) + SetToType(string) + SetToPort(uint16) + + ReverseDirection() +} + +type KafkaEvent struct { + StartTime int64 + Latency uint64 // in ns + FromIP string + FromType string + FromUID string + FromPort uint16 + ToIP string + ToType string + ToUID string + ToPort uint16 + Topic string + Partition uint32 + Key string + Value string + Type string // PUBLISH or CONSUME + Tls bool + Tid uint32 + Seq uint32 +} + +func (ke *KafkaEvent) SetFromUID(uid string) { + ke.FromUID = uid +} +func (ke *KafkaEvent) SetFromIP(ip string) { + ke.FromIP = ip +} +func (ke *KafkaEvent) SetFromType(typ string) { + ke.FromType = typ +} +func (ke *KafkaEvent) SetFromPort(port uint16) { + ke.FromPort = port +} + +func (ke *KafkaEvent) SetToUID(uid string) { + ke.ToUID = uid +} +func (ke *KafkaEvent) SetToIP(ip string) { + ke.ToIP = ip +} +func (ke *KafkaEvent) SetToType(typ string) { + ke.ToType = typ +} +func (ke *KafkaEvent) SetToPort(port uint16) { + ke.ToPort = port +} + +func (req *KafkaEvent) ReverseDirection() { + req.FromIP, req.ToIP = req.ToIP, req.FromIP + req.FromPort, req.ToPort = req.ToPort, req.FromPort + req.FromUID, req.ToUID = req.ToUID, req.FromUID + req.FromType, req.ToType = req.ToType, req.FromType +} + type Request struct { StartTime int64 Latency uint64 // in ns @@ -127,6 +195,39 @@ type Request struct { Seq uint32 } +func (r *Request) SetFromUID(uid string) { + r.FromUID = uid +} +func (r *Request) SetFromIP(ip string) { + r.FromIP = ip +} +func (r *Request) SetFromType(typ string) { + r.FromType = typ +} +func (r *Request) SetFromPort(port uint16) { + r.FromPort = port +} + +func (r *Request) SetToUID(uid string) { + r.ToUID = uid +} +func (r *Request) SetToIP(ip string) { + r.ToIP = ip +} +func (r *Request) SetToType(typ string) { + r.ToType = typ +} +func (r *Request) SetToPort(port uint16) { + r.ToPort = port +} + +func (req *Request) ReverseDirection() { + req.FromIP, req.ToIP = req.ToIP, req.FromIP + req.FromPort, req.ToPort = req.ToPort, req.FromPort + req.FromUID, req.ToUID = req.ToUID, req.FromUID + req.FromType, req.ToType = req.ToType, req.FromType +} + type BackendResponse struct { Msg string `json:"msg"` Errors []struct { diff --git a/datastore/payload.go b/datastore/payload.go index 17e90ee..85c351f 100644 --- a/datastore/payload.go +++ b/datastore/payload.go @@ -158,6 +158,31 @@ type TracePayload struct { Traces []*TraceInfo `json:"traffic"` } +// 0) StartTime +// 1) Latency +// 2) Source IP +// 3) Source Type +// 4) Source ID +// 5) Source Port +// 6) Destination IP +// 7) Destination Type +// 8) Destination ID +// 9) Destination Port +// 10) Topic +// 11) Partition +// 12) Key +// 13) Value +// 14) Type +// 15) Encrypted (bool) +// 16) Seq +// 17) Tid +type KafkaEventInfo [18]interface{} + +type KafkaEventInfoPayload struct { + Metadata Metadata `json:"metadata"` + 
KafkaEvents []*KafkaEventInfo `json:"kafka_events"` +} + func convertPodToPodEvent(pod Pod, eventType string) PodEvent { return PodEvent{ UID: pod.UID, diff --git a/ebpf/c/bpf.c b/ebpf/c/bpf.c index ef937c0..618b336 100644 --- a/ebpf/c/bpf.c +++ b/ebpf/c/bpf.c @@ -34,6 +34,7 @@ #include "amqp.c" #include "postgres.c" #include "redis.c" +#include "kafka.c" #include "openssl.c" #include "http2.c" #include "tcp_sock.c" diff --git a/ebpf/c/bpf_bpfeb.go b/ebpf/c/bpf_bpfeb.go index 5644d36..971c0d3 100644 --- a/ebpf/c/bpf_bpfeb.go +++ b/ebpf/c/bpf_bpfeb.go @@ -50,7 +50,8 @@ type bpfL7Event struct { _ [1]byte Seq uint32 Tid uint32 - _ [4]byte + KafkaApiVersion int16 + _ [2]byte } type bpfL7Request struct { @@ -65,6 +66,9 @@ type bpfL7Request struct { _ [2]byte Seq uint32 Tid uint32 + CorrelationId int32 + ApiKey int16 + ApiVersion int16 _ [4]byte } diff --git a/ebpf/c/bpf_bpfeb.o b/ebpf/c/bpf_bpfeb.o index 983b28c..e9eb9ae 100644 Binary files a/ebpf/c/bpf_bpfeb.o and b/ebpf/c/bpf_bpfeb.o differ diff --git a/ebpf/c/bpf_bpfel.go b/ebpf/c/bpf_bpfel.go index d7789f8..31e8dd1 100644 --- a/ebpf/c/bpf_bpfel.go +++ b/ebpf/c/bpf_bpfel.go @@ -50,7 +50,8 @@ type bpfL7Event struct { _ [1]byte Seq uint32 Tid uint32 - _ [4]byte + KafkaApiVersion int16 + _ [2]byte } type bpfL7Request struct { @@ -65,6 +66,9 @@ type bpfL7Request struct { _ [2]byte Seq uint32 Tid uint32 + CorrelationId int32 + ApiKey int16 + ApiVersion int16 _ [4]byte } diff --git a/ebpf/c/bpf_bpfel.o b/ebpf/c/bpf_bpfel.o index 42c345a..83084b6 100644 Binary files a/ebpf/c/bpf_bpfel.o and b/ebpf/c/bpf_bpfel.o differ diff --git a/ebpf/c/kafka.c b/ebpf/c/kafka.c new file mode 100644 index 0000000..4e2acc9 --- /dev/null +++ b/ebpf/c/kafka.c @@ -0,0 +1,81 @@ +//go:build ignore +// https://kafka.apache.org/protocol.html + +// RequestOrResponse => Size (RequestMessage | ResponseMessage) +// Size => int32 + + +// Request Header v0 => request_api_key request_api_version correlation_id +// request_api_key => INT16 +// request_api_version => INT16 +// correlation_id => INT32 +// client_id => NULLABLE_STRING // added in v1 + +// method will be decoded in user space +#define METHOD_KAFKA_PRODUCE_REQUEST 1 +#define METHOD_KAFKA_FETCH_RESPONSE 2 + + +#define KAFKA_API_KEY_PRODUCE_API 0 +#define KAFKA_API_KEY_FETCH_API 1 + +struct kafka_request_header { + __s32 size; + __s16 api_key; + __s16 api_version; + __s32 correlation_id; +}; + +// Response Header v1 => correlation_id TAG_BUFFER +// correlation_id => INT32 + +struct kafka_response_header { + __s32 size; + __s32 correlation_id; +}; + +static __always_inline +int is_kafka_request_header(char *buf, __u64 buf_size, __s32 *request_id, __s16 *api_key, __s16 *api_version) { + struct kafka_request_header h = {}; + if (buf_size < sizeof(h)) { + return 0; + } + + if (bpf_probe_read(&h, sizeof(h), buf) < 0) { + return 0; + } + + h.size = bpf_htonl(h.size); + + // we parse only one message in one write syscall for now. + // batch publish is not supported. + if (h.size+4 != buf_size) { + return 0; + } + + h.api_key = bpf_htons(h.api_key); // determines message api, ProduceAPI, FetchAPI, etc. + h.api_version = bpf_htons(h.api_version); // version of the API, v8, v9, etc. 
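is_kafka_request_header above recognizes a candidate Kafka request in kernel space by decoding the fixed 12-byte prefix (size, api_key, api_version, correlation_id, all big-endian) and sanity-checking it. A user-space Go sketch of the same layout check, shown only to illustrate the format; it is not part of the agent:

```go
// Illustrative user-space equivalent of the header check in kafka.c: a Kafka
// request starts with a 4-byte big-endian size that counts everything after
// itself, followed by api_key (int16), api_version (int16), correlation_id (int32).
package main

import (
	"encoding/binary"
	"fmt"
)

type kafkaRequestHeader struct {
	Size          int32
	APIKey        int16
	APIVersion    int16
	CorrelationID int32
}

func parseKafkaRequestHeader(buf []byte) (kafkaRequestHeader, bool) {
	var h kafkaRequestHeader
	if len(buf) < 12 {
		return h, false
	}
	h.Size = int32(binary.BigEndian.Uint32(buf[0:4]))
	h.APIKey = int16(binary.BigEndian.Uint16(buf[4:6]))
	h.APIVersion = int16(binary.BigEndian.Uint16(buf[6:8]))
	h.CorrelationID = int32(binary.BigEndian.Uint32(buf[8:12]))

	// Same sanity checks as the probe: one whole message per write, a known
	// api_key (0..74 at the time of writing) and a positive correlation id.
	if int(h.Size)+4 != len(buf) || h.APIKey < 0 || h.APIKey > 74 || h.CorrelationID <= 0 {
		return h, false
	}
	return h, true
}

func main() {
	buf := make([]byte, 16)
	binary.BigEndian.PutUint32(buf[0:4], 12)  // size = total length - 4
	binary.BigEndian.PutUint16(buf[4:6], 0)   // api_key 0 = Produce
	binary.BigEndian.PutUint16(buf[6:8], 9)   // api_version
	binary.BigEndian.PutUint32(buf[8:12], 42) // correlation_id
	fmt.Println(parseKafkaRequestHeader(buf)) // {12 0 9 42} true
}
```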
+ h.correlation_id = bpf_htonl(h.correlation_id); + if (h.correlation_id > 0 && (h.api_key >= 0 && h.api_key <= 74)) { // https://kafka.apache.org/protocol.html#protocol_api_keys + *request_id = h.correlation_id; + *api_key = h.api_key; + *api_version = h.api_version; + return 1; + } + return 0; +} + +static __always_inline +int is_kafka_response_header(char *buf, __s32 correlation_id) { + struct kafka_response_header h = {}; + if (bpf_probe_read(&h, sizeof(h), buf) < 0) { + return 0; + } + // correlation_id match + if (bpf_htonl(h.correlation_id) == correlation_id) { + return 1; + } + return 0; +} + + diff --git a/ebpf/c/l7.c b/ebpf/c/l7.c index 15c2128..873c8f6 100644 --- a/ebpf/c/l7.c +++ b/ebpf/c/l7.c @@ -5,6 +5,8 @@ #define PROTOCOL_POSTGRES 3 #define PROTOCOL_HTTP2 4 #define PROTOCOL_REDIS 5 +#define PROTOCOL_KAFKA 6 + #define MAX_PAYLOAD_SIZE 1024 #define PAYLOAD_PREFIX_SIZE 16 @@ -28,6 +30,8 @@ struct l7_event { __u32 seq; // tcp sequence number __u32 tid; + + __s16 kafka_api_version; // used only for kafka }; struct l7_request { @@ -40,6 +44,9 @@ struct l7_request { __u8 request_type; __u32 seq; __u32 tid; + __s32 correlation_id; // used only for kafka + __s16 api_key; // used only for kafka + __s16 api_version; // used only for kafka }; struct socket_key { @@ -177,9 +184,6 @@ int process_enter_of_syscalls_write_sendto(void* ctx, __u64 fd, __u8 is_tls, cha return 0; // not a container process, ignore } #endif - - __u32 tid = id & 0xFFFFFFFF; - __u32 seq = process_for_dist_trace_write(ctx,fd); int zero = 0; struct l7_request *req = bpf_map_lookup_elem(&l7_request_heap, &zero); @@ -235,6 +239,25 @@ int process_enter_of_syscalls_write_sendto(void* ctx, __u64 fd, __u8 is_tls, cha }else if (!is_redis_pong(buf,count) && is_redis_command(buf,count)){ req->protocol = PROTOCOL_REDIS; req->method = METHOD_UNKNOWN; + }else if (is_kafka_request_header(buf, count, &req->correlation_id, &req->api_key, &req->api_version)){ + // request pipelining, batch publish + // if multiple writes are done subsequently over the same connection + // do not change record in active_l7_requests + // correlation ids can mismatch + + // order is guaranteed over the same socket on kafka. 
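Pairing a Kafka request with its response rests on the correlation id: is_kafka_request_header records it, and is_kafka_response_header accepts a read only when the broker echoes the same id back, which is also why the pipelining comment above keeps only the first in-flight Kafka request per socket. A purely hypothetical user-space sketch of that request/response pairing (not the project's code):

```go
// Hypothetical sketch of correlation-id based request/response matching:
// remember request metadata per correlation id, resolve it when the response
// header carries the same id.
package main

import "fmt"

type pendingRequest struct {
	apiKey     int16
	apiVersion int16
}

type connTracker struct {
	pending map[int32]pendingRequest // correlation_id -> request metadata
}

func (c *connTracker) onRequest(correlationID int32, apiKey, apiVersion int16) {
	c.pending[correlationID] = pendingRequest{apiKey: apiKey, apiVersion: apiVersion}
}

func (c *connTracker) onResponse(correlationID int32) (pendingRequest, bool) {
	req, ok := c.pending[correlationID]
	if ok {
		delete(c.pending, correlationID)
	}
	return req, ok
}

func main() {
	t := &connTracker{pending: map[int32]pendingRequest{}}
	t.onRequest(42, 1, 11)        // Fetch v11
	fmt.Println(t.onResponse(42)) // {1 11} true
	fmt.Println(t.onResponse(43)) // {0 0} false
}
```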
+ + // write(first_part_of_batch_req_corr1 + // write(second_part_of_batch_req_corr2 ----> do not write to active_l7_requests, wait for the response + // read(first_part_of_batch_resp_corr1 + // read(second_part_of_batch_resp_corr2 + + struct l7_request *prev_req = bpf_map_lookup_elem(&active_l7_requests, &k); + if (prev_req && prev_req->protocol == PROTOCOL_KAFKA) { + return 0; + } + req->protocol = PROTOCOL_KAFKA; + req->method = METHOD_UNKNOWN; }else if (is_rabbitmq_publish(buf,count)){ req->protocol = PROTOCOL_AMQP; req->method = METHOD_PUBLISH; @@ -295,6 +318,9 @@ int process_enter_of_syscalls_write_sendto(void* ctx, __u64 fd, __u8 is_tls, cha req->payload_read_complete = 1; } + __u32 tid = id & 0xFFFFFFFF; + __u32 seq = process_for_dist_trace_write(ctx,fd); + // for distributed tracing req->seq = seq; req->tid = tid; @@ -671,6 +697,23 @@ int process_exit_of_syscalls_read_recvfrom(void* ctx, __u64 id, __u32 pid, __s64 e->status = parse_redis_response(read_info->buf, ret); e->method = METHOD_REDIS_COMMAND; } + }else if (e->protocol == PROTOCOL_KAFKA){ + e->status = is_kafka_response_header(read_info->buf, active_req->correlation_id); + if (active_req->api_key == KAFKA_API_KEY_PRODUCE_API){ + e->method = METHOD_KAFKA_PRODUCE_REQUEST; + }else if (active_req->api_key == KAFKA_API_KEY_FETCH_API){ + e->method = METHOD_KAFKA_FETCH_RESPONSE; + // send the response to userspace + // copy req payload + e->payload_size = ret; + bpf_probe_read(e->payload, MAX_PAYLOAD_SIZE, read_info->buf); + if(ret > MAX_PAYLOAD_SIZE){ + e->payload_read_complete = 0; + }else{ + e->payload_read_complete = 1; + } + e->kafka_api_version = active_req->api_version; + } } }else{ bpf_map_delete_elem(&active_reads, &id); @@ -1066,10 +1109,7 @@ int process_enter_of_go_conn_write(void *ctx, __u32 pid, __u32 fd, char *buf_ptr req->payload_read_complete = 0; req->write_time_ns = timestamp; req->request_type = 0; - req->seq = process_for_dist_trace_write(ctx,fd); - - __u32 tid = bpf_get_current_pid_tgid() & 0xFFFFFFFF; - req->tid = tid; + if(buf_ptr){ @@ -1127,6 +1167,10 @@ int process_enter_of_go_conn_write(void *ctx, __u32 pid, __u32 fd, char *buf_ptr req->payload_read_complete = 1; } + req->seq = process_for_dist_trace_write(ctx,fd); + __u32 tid = bpf_get_current_pid_tgid() & 0xFFFFFFFF; + req->tid = tid; + long res = bpf_map_update_elem(&go_active_l7_requests, &k, req, BPF_ANY); if(res < 0) { diff --git a/ebpf/c/proc.c b/ebpf/c/proc.c index c2eff87..b8ff805 100644 --- a/ebpf/c/proc.c +++ b/ebpf/c/proc.c @@ -29,7 +29,7 @@ int sched_process_exec(struct trace_event_raw_sched_process_exec* ctx) pid = id >> 32; tid = (__u32)id; - /* ignore thread exits */ + /* ignore thread exec */ if (pid != tid) return 0; diff --git a/ebpf/collector.go b/ebpf/collector.go index aea4ed9..9f9dc36 100644 --- a/ebpf/collector.go +++ b/ebpf/collector.go @@ -76,9 +76,9 @@ func NewEbpfCollector(parentCtx context.Context, ct *cri.CRITool) *EbpfCollector return &EbpfCollector{ ctx: ctx, done: make(chan struct{}), - ebpfEvents: make(chan interface{}, 100000), // interface is 16 bytes, 16 * 100000 = 8 Megabytes - ebpfProcEvents: make(chan interface{}, 2000), - ebpfTcpEvents: make(chan interface{}, 1000), + ebpfEvents: make(chan interface{}, 200000), // interface is 16 bytes, 16 * 200000 + ebpfProcEvents: make(chan interface{}, 20000), + ebpfTcpEvents: make(chan interface{}, 100000), tlsPidMap: make(map[uint32]struct{}), sslWriteUprobes: make(map[uint32]link.Link), sslReadEnterUprobes: make(map[uint32]link.Link), @@ -142,6 +142,8 @@ func (e 
*EbpfCollector) Init() { return } + log.Logger.Debug().Int("len", len(currentPids)).Msg("len of current container pids") + // find new pids newPids := make([]uint32, 0) for pid, _ := range currentPids { @@ -166,7 +168,6 @@ func (e *EbpfCollector) Init() { } // update oldstate - for k := range oldState { delete(oldState, k) } @@ -175,6 +176,20 @@ func (e *EbpfCollector) Init() { oldState[pid] = struct{}{} } + // send events to aggregator + for _, pid := range newPids { + e.ebpfProcEvents <- &proc.ProcEvent{ + Pid: pid, + Type_: proc.EVENT_PROC_EXEC, + } + } + for _, pid := range removedPids { + e.ebpfProcEvents <- &proc.ProcEvent{ + Pid: pid, + Type_: proc.EVENT_PROC_EXIT, + } + } + err = tcpProg.PopulateContainerPidsMap(newPids, removedPids) if err != nil { log.Logger.Error().Err(err).Msg("failed populating container pids map") diff --git a/ebpf/l7_req/l7.go b/ebpf/l7_req/l7.go index 753b27d..42b83c7 100644 --- a/ebpf/l7_req/l7.go +++ b/ebpf/l7_req/l7.go @@ -23,6 +23,7 @@ const ( BPF_L7_PROTOCOL_POSTGRES BPF_L7_PROTOCOL_HTTP2 BPF_L7_PROTOCOL_REDIS + BPF_L7_PROTOCOL_KAFKA ) // for user space @@ -32,6 +33,7 @@ const ( L7_PROTOCOL_AMQP = "AMQP" L7_PROTOCOL_POSTGRES = "POSTGRES" L7_PROTOCOL_REDIS = "REDIS" + L7_PROTOCOL_KAFKA = "KAFKA" L7_PROTOCOL_UNKNOWN = "UNKNOWN" ) @@ -51,6 +53,8 @@ func (e L7ProtocolConversion) String() string { return L7_PROTOCOL_HTTP2 case BPF_L7_PROTOCOL_REDIS: return L7_PROTOCOL_REDIS + case BPF_L7_PROTOCOL_KAFKA: + return L7_PROTOCOL_KAFKA case BPF_L7_PROTOCOL_UNKNOWN: return L7_PROTOCOL_UNKNOWN default: @@ -116,6 +120,13 @@ const ( METHOD_REDIS_PING ) +// match with values in l7.c, order is important +const ( + BPF_KAFKA_METHOD_UNKNOWN = iota + METHOD_KAFKA_PRODUCE_REQUEST + METHOD_KAFKA_FETCH_RESPONSE +) + // for http, user space const ( GET = "GET" @@ -155,6 +166,12 @@ const ( REDIS_PING = "PING" ) +// for kafka, user space +const ( + KAFKA_PRODUCE_REQUEST = "PRODUCE_REQUEST" + KAFKA_FETCH_RESPONSE = "FETCH_RESPONSE" +) + // Custom type for the enumeration type HTTPMethodConversion uint32 @@ -248,6 +265,21 @@ func (e RedisMethodConversion) String() string { } } +// Custom type for the enumeration +type KafkaMethodConversion uint32 + +// String representation of the enumeration values +func (e KafkaMethodConversion) String() string { + switch e { + case METHOD_KAFKA_PRODUCE_REQUEST: + return KAFKA_PRODUCE_REQUEST + case METHOD_KAFKA_FETCH_RESPONSE: + return KAFKA_FETCH_RESPONSE + default: + return "Unknown" + } +} + var FirstKernelTime uint64 = 0 // nanoseconds since boot var FirstUserspaceTime uint64 = 0 @@ -282,7 +314,8 @@ type bpfL7Event struct { _ [1]byte Seq uint32 Tid uint32 - _ [4]byte + KafkaApiVersion int16 + _ [2]byte } type bpfTraceEvent struct { @@ -324,6 +357,11 @@ type L7Event struct { WriteTimeNs uint64 // start time of write syscall Tid uint32 Seq uint32 // tcp seq num + KafkaApiVersion int16 + + // This bool is actually related to aggregator logic. Means this events processing somehow failed and put back into channel for retry. + // Maybe we can wrap L7Event and add this field on top. 
+ PutBack bool } const L7_EVENT = "l7_event" @@ -574,6 +612,12 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { go func() { var record perf.Record droppedCount := 0 + go func() { + t := time.NewTicker(1 * time.Minute) + for range t.C { + log.Logger.Debug().Int("count", droppedCount).Msg("dropped l7 events") + } + }() read := func() { err := l7p.l7Events.ReadInto(&record) if err != nil { @@ -611,6 +655,9 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { method = Http2MethodConversion(l7Event.Method).String() case L7_PROTOCOL_REDIS: method = RedisMethodConversion(l7Event.Method).String() + case L7_PROTOCOL_KAFKA: + method = KafkaMethodConversion(l7Event.Method).String() + // no method set for kafka on kernel side default: method = "Unknown" } @@ -634,6 +681,7 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { WriteTimeNs: l7Event.WriteTimeNs, Tid: l7Event.Tid, Seq: l7Event.Seq, + KafkaApiVersion: l7Event.KafkaApiVersion, } go func(l7Event *L7Event) { @@ -641,14 +689,6 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { case ch <- l7Event: default: droppedCount++ - if droppedCount%100 == 0 { - log.Logger.Warn(). - Str("protocol", l7Event.Protocol). - Str("method", l7Event.Method). - Uint32("pid", l7Event.Pid). - Uint32("status", l7Event.Status). - Msg("channel full, dropping l7 event") - } } }(userspacel7Event) } @@ -664,10 +704,19 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { go func() { var record perf.Record + droppedCount := 0 + + go func() { + t := time.NewTicker(1 * time.Minute) + for range t.C { + log.Logger.Debug().Int("count", droppedCount).Msg("dropped trace events") + } + }() + read := func() { err := l7p.traffic.ReadInto(&record) if err != nil { - log.Logger.Warn().Err(err).Msg("error reading from dist trace calls") + log.Logger.Warn().Err(err).Msg("error reading from dist-trace calls") } if record.LostSamples != 0 { @@ -681,14 +730,21 @@ func (l7p *L7Prog) Consume(ctx context.Context, ch chan interface{}) { bpfTraceEvent := (*bpfTraceEvent)(unsafe.Pointer(&record.RawSample[0])) - traceEvent := TraceEvent{ + traceEvent := &TraceEvent{ Pid: bpfTraceEvent.Pid, Tid: bpfTraceEvent.Tid, Tx: time.Now().UnixMilli(), Type_: bpfTraceEvent.Type_, Seq: bpfTraceEvent.Seq, } - ch <- &traceEvent + + go func(traceEvent *TraceEvent) { + select { + case ch <- traceEvent: + default: + droppedCount++ + } + }(traceEvent) } for { diff --git a/go.mod b/go.mod index 15bc7c1..75e9600 100644 --- a/go.mod +++ b/go.mod @@ -7,14 +7,17 @@ require ( github.com/alecthomas/kingpin/v2 v2.4.0 github.com/cilium/ebpf v0.10.1-0.20230626090016-654491c8a500 github.com/cilium/fake v0.6.1 + github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 github.com/fsnotify/fsnotify v1.7.0 github.com/go-kit/log v0.2.1 github.com/golang/protobuf v1.5.3 github.com/hashicorp/go-retryablehttp v0.7.4 + github.com/klauspost/compress v1.16.5 + github.com/pierrec/lz4/v4 v4.1.18 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/common v0.48.0 github.com/prometheus/node_exporter v1.6.1 - github.com/rs/zerolog v1.29.1 + github.com/rs/zerolog v1.33.0 github.com/stretchr/testify v1.8.4 golang.org/x/arch v0.5.0 golang.org/x/mod v0.12.0 @@ -32,6 +35,7 @@ require ( github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/felixge/httpsnoop v1.0.3 // 
indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/golang/snappy v0.0.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect @@ -85,8 +89,8 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/lufia/iostat v1.2.1 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/mattn/go-colorable v0.1.12 // indirect - github.com/mattn/go-isatty v0.0.17 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect github.com/mattn/go-xmlrpc v0.0.3 // indirect github.com/mdlayher/ethtool v0.0.0-20221212131811-ba3b4bc2e02c // indirect github.com/mdlayher/genetlink v1.3.2 // indirect diff --git a/go.sum b/go.sum index f9c5845..234e5b9 100644 --- a/go.sum +++ b/go.sum @@ -37,6 +37,8 @@ github.com/dennwc/btrfs v0.0.0-20230312211831-a1f570bd01a1/go.mod h1:MYsOV9Dgsec github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/dvyukov/go-fuzz v0.0.0-20210103155950-6a8e9d1f2415/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= +github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6 h1:8yY/I9ndfrgrXUbOGObLHKBR4Fl3nZXwM2c7OYTT8hM= +github.com/eapache/go-xerial-snappy v0.0.0-20230111030713-bf00bc1b83b6/go.mod h1:YvSRo5mw33fLEx1+DlK6L2VV43tJt5Eyel9n9XBcR+0= github.com/ema/qdisc v0.0.0-20230120214811-5b708f463de3 h1:Jrl8sD8wO34+EE1dV2vhOXrqFAZa/FILDnZRaV28+cw= github.com/ema/qdisc v0.0.0-20230120214811-5b708f463de3/go.mod h1:FhIc0fLYi7f+lK5maMsesDqwYojIOh3VfRs8EVd5YJQ= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= @@ -81,6 +83,8 @@ github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -122,6 +126,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.16.5 h1:IFV2oUNUzZaz+XyusxpLzpzS8Pt5rh0Z16For/djlyI= +github.com/klauspost/compress v1.16.5/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/kr/pretty v0.2.1/go.mod 
h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -134,13 +140,13 @@ github.com/lufia/iostat v1.2.1/go.mod h1:rEPNA0xXgjHQjuI5Cy05sLlS2oRcSlWHRLrvh/A github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= -github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZbaA40= -github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= -github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= -github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= -github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-xmlrpc v0.0.3 h1:Y6WEMLEsqs3RviBrAa1/7qmbGB7DVD3brZIbqMbQdGY= github.com/mattn/go-xmlrpc v0.0.3/go.mod h1:mqc2dz7tP5x5BKlCahN/n+hs7OSZKJkS9JsHNBRlrxA= github.com/mdlayher/ethtool v0.0.0-20221212131811-ba3b4bc2e02c h1:Y7LoKqIgD7vmqJ7+6ZVnADuwUO+m3tGXbf2lK0OvjIw= @@ -168,6 +174,8 @@ github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaL github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= +github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= +github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -185,9 +193,9 @@ github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= -github.com/rs/zerolog v1.29.1 
-github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
-github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
-github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
+github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
+github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
+github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
 github.com/safchain/ethtool v0.3.0 h1:gimQJpsI6sc1yIqP/y8GYgiXn/NjgvpM0RNoWLVVmP0=
 github.com/safchain/ethtool v0.3.0/go.mod h1:SA9BwrgyAqNo7M+uaL6IYbxpm5wk3L7Mm6ocLW+CJUs=
 github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 h1:GfSdC6wKfTGcgCS7BtzF5694Amne1pGCSTY252WhlEY=
@@ -273,11 +281,10 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
 golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE=
diff --git a/log/logger.go b/log/logger.go
index 922d451..6aed763 100644
--- a/log/logger.go
+++ b/log/logger.go
@@ -18,6 +18,10 @@ var (
 	Logger zerolog.Logger
 )
 
+const (
+	LOG_CONTEXT = "log-context" // for hook
+)
+
 func init() {
 	// Get the desired log level from environment variables
 	levelStr := os.Getenv("LOG_LEVEL")
@@ -39,6 +43,33 @@ func init() {
 	if os.Getenv("DISABLE_LOGS") == "true" {
 		Logger = zerolog.New(NoopLogger{})
 	} else {
-		Logger = zerolog.New(os.Stdout).With().Timestamp().Logger()
+		hook := &ContextFilterHook{
+			ContextKey:   LOG_CONTEXT,
+			ContextValue: os.Getenv("LOG_CONTEXT_KEY"),
+		}
+
+		Logger = zerolog.New(os.Stdout).With().Timestamp().Logger().Hook(hook)
+	}
+}
+
+type ContextFilterHook struct {
+	ContextKey   string
+	ContextValue string
+}
+
+func (cfh *ContextFilterHook) Run(e *zerolog.Event, level zerolog.Level, message string) {
+	if os.Getenv("LOG_CONTEXT_KEY") == "" {
+		// if not specified, no filtering
+		return
+	}
+	val := e.GetCtx().Value(cfh.ContextKey)
+	if val != nil {
+		if val.(string) == cfh.ContextValue {
+			e.Str(cfh.ContextKey, cfh.ContextValue)
+		} else {
+			e.Discard()
+		}
+	} else {
+		e.Discard()
 	}
 }
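Note on the logger change: ContextValue is captured once in init(), so LOG_CONTEXT_KEY has to be set in the environment before the process starts for the filter to behave as intended. A minimal usage sketch follows; the pid value 1337 and the log messages are illustrative only and not part of the diff. Run it with LOG_CONTEXT_KEY=1337:

package main

import (
	"context"
	"fmt"

	"github.com/ddosify/alaz/log"
)

func main() {
	// Store the pid (illustrative value 1337) under the same key the hook
	// looks up. fmt.Sprint keeps the stored value a string, which is what
	// the hook's type assertion expects.
	ctx := context.WithValue(context.Background(), log.LOG_CONTEXT, fmt.Sprint(1337))

	// Ctx(ctx) is what makes e.GetCtx() inside ContextFilterHook return this
	// context; with LOG_CONTEXT_KEY=1337 the event is kept and tagged.
	log.Logger.Info().Ctx(ctx).Msg("kept: pid matches LOG_CONTEXT_KEY")

	// A different pid, or no Ctx(...) at all, is discarded by the hook.
	other := context.WithValue(context.Background(), log.LOG_CONTEXT, "42")
	log.Logger.Info().Ctx(other).Msg("discarded: pid does not match")
	log.Logger.Info().Msg("discarded: no context attached")
}

The trade-off is that once LOG_CONTEXT_KEY is set, every event emitted without .Ctx(...) disappears, so the filter is best reserved for debugging a single process.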
diff --git a/logstreamer/pool.go b/logstreamer/pool.go
index 46a96d2..4e61e7e 100644
--- a/logstreamer/pool.go
+++ b/logstreamer/pool.go
@@ -110,12 +110,12 @@ func (c *channelPool) Get() (*PoolConn, error) {
 	select {
 	case conn := <-conns:
 		if conn == nil {
-			return nil, ErrClosed
+			return nil, fmt.Errorf("connection is nil")
 		}
 		if conn.unusable {
 			log.Logger.Info().Msg("connection is unusable on Get, closing it")
 			conn.Close()
-			return nil, ErrClosed
+			return nil, fmt.Errorf("connection is unusable")
 		}
 
 		if conn.isAlive() {
@@ -123,7 +123,7 @@ func (c *channelPool) Get() (*PoolConn, error) {
 		} else {
 			conn.MarkUnusable()
 			conn.Close()
-			return nil, ErrClosed
+			return nil, fmt.Errorf("connection is dead")
 		}
 	default:
 		conn, err := factory()
diff --git a/main.go b/main.go
index 622ffd6..61358d1 100644
--- a/main.go
+++ b/main.go
@@ -88,6 +88,7 @@ func main() {
 		MetricsExportInterval: 10,
 		ReqBufferSize:         40000, // TODO: get from a conf file
 		ConnBufferSize:        1000,  // TODO: get from a conf file
+		KafkaEventBufferSize:  2000,
 	})
 
 	var ct *cri.CRITool
@@ -101,9 +102,11 @@ func main() {
 
 	if tracingEnabled {
 		ec = ebpf.NewEbpfCollector(ctx, ct)
-		a := aggregator.NewAggregator(ctx, kubeEvents, ec.EbpfEvents(), ec.EbpfProcEvents(), ec.EbpfTcpEvents(), ec.TlsAttachQueue(), dsBackend)
+		a := aggregator.NewAggregator(ctx, ct, kubeEvents, ec.EbpfEvents(), ec.EbpfProcEvents(), ec.EbpfTcpEvents(), ec.TlsAttachQueue(), dsBackend)
 		a.Run()
 
+		a.AdvertiseDebugData()
+
 		ec.Init()
 		go ec.ListenEvents()
 	}
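Note on the logstreamer/pool.go change: the descriptive fmt.Errorf values replace the ErrClosed sentinel, so callers can no longer branch with errors.Is(err, ErrClosed). If that distinction still matters somewhere, a wrapped sentinel would keep both the message and the comparability. A hypothetical sketch, not part of the diff (ErrBadConn and getErr are invented names):

package logstreamer

import (
	"errors"
	"fmt"
)

// ErrBadConn is a hypothetical sentinel; the actual change returns plain
// fmt.Errorf values instead.
var ErrBadConn = errors.New("bad connection from pool")

// getErr wraps the sentinel with %w so the reason ("nil", "unusable", "dead")
// stays in the message while errors.Is(err, ErrBadConn) still reports true.
func getErr(reason string) error {
	return fmt.Errorf("%w: connection is %s", ErrBadConn, reason)
}

A caller could then retry on errors.Is(err, ErrBadConn) instead of matching the message text.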