diff --git a/.circleci/config.yml b/.circleci/config.yml
index 19dd6e2612..7bf815bb1d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -65,7 +65,7 @@ commands:
 jobs:
   test:
     docker:
-      - image: cimg/go:1.20
+      - image: cimg/go:1.21
       - image: redis:6.2
     steps:
       - checkout
@@ -92,7 +92,7 @@ jobs:
 
   build_binaries:
     docker:
-      - image: cimg/go:1.20
+      - image: cimg/go:1.21
     steps:
       - checkout
      - go-build:
@@ -183,7 +183,7 @@ jobs:
 
   build_docker:
     docker:
-      - image: cimg/go:1.20
+      - image: cimg/go:1.21
     steps:
       - setup_googleko
       - checkout
@@ -194,7 +194,7 @@ jobs:
 
   publish_docker_to_ecr:
     docker:
-      - image: cimg/go:1.20
+      - image: cimg/go:1.21
     steps:
       - setup_googleko
       - checkout
@@ -217,7 +217,7 @@ jobs:
 
   publish_docker_to_dockerhub:
     docker:
-      - image: cimg/go:1.20
+      - image: cimg/go:1.21
     steps:
       - setup_googleko
       - checkout
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 242bc4f6b4..a4adbb7fb4 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -13,6 +13,11 @@ updates:
       - "type: dependencies"
     reviewers:
       - "honeycombio/collection-team"
+    groups:
+      minor-patch:
+        update-types:
+          - "minor"
+          - "patch"
     commit-message:
       prefix: "maint"
       include: "scope"
diff --git a/.github/release.yml b/.github/release.yml
index 3d9ee33826..041c00a66a 100644
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -18,6 +18,8 @@ changelog:
     - title: 🛠 Maintenance
       labels:
         - "type: maintenance"
+        - "type: dependencies"
+        - "type: documentation"
     - title: 🤷 Other Changes
       labels:
-        - "*"
\ No newline at end of file
+        - "*"
diff --git a/.gitignore b/.gitignore
index 273faa93c2..ae04781b0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,6 @@ dockerize*
 
 .idea/
 .DS_Store
+
+# redis dump file
+*.rdb
diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000000..9264d46300
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1 @@
+golang 1.21.8
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d2efdb0bb..4963368af0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,277 @@
 # Refinery Changelog
 
+## 2.8.1 2024-09-05
+
+This release includes a fix for a bug that prevented Refinery from starting up a new cluster from scratch.
+
+### Fixes
+- fix: load peer list in sharder once manually on startup (#1314) | [Yingrong Zhao](https://github.com/vinozzZ)
+
+
+## 2.8.0 2024-09-05
+
+This release has many features designed to help manage and operate Refinery at scale.
+It also includes some features to help in writing sampling rules (`in` and `not-in` operators, `root.`).
+See full details in [the Release Notes](./RELEASE_NOTES.md).
+ +### Features +- feat: add IN operator (#1302) | [Kent Quirk](https://github.com/kentquirk) +- feat: support layered (multiple) configuration files (#1301) | [Kent Quirk](https://github.com/kentquirk) +- feat: Add a cache to the cache (#1296) | [Kent Quirk](https://github.com/kentquirk) +- feat: support configure refinery to use redis in cluster mode (#1294) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: allow `root.` in field list for dynamic sampler (#1275) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: redistribute traces on peer membership changes (#1268) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: Add SpanLimit (includes some config changes) (#1266) | [Kent Quirk](https://github.com/kentquirk) +- feat: redistribute remaining traces during shutdown (#1261) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: Allow more complex key behavior (#1263) | [Kent Quirk](https://github.com/kentquirk) +- feat: unregister peer asap on shutdown (#1260) | [Yingrong Zhao](https://github.com/vinozzZ) + +### Fixes +- fix: periodically clean up recent_dropped_traces cache (#1312) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: revert the revert -- that wasn't the problem (#1311) | [Kent Quirk](https://github.com/kentquirk) +- fix: revert "Use HTTP/2 for all upstream and peer-to-peer connections… (#1310) | [Kent Quirk](https://github.com/kentquirk) +- fix: join peer list only after refinery is ready to accept traffic (#1309) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: use float histogram for otel metrics (#1303) | [Kent Quirk](https://github.com/kentquirk) +- fix: escape use input in debug route (#1299) | [Tyler Helmuth](https://github.com/TylerHelmuth) +- fix: use trace.DescendantCount for span limit (#1297) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: support TLS connections to Redis (#1285) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: only set send reason to span limit if it's configured (#1290) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: record previous value of sampler counter metrics so they report correctly (#1281) | [Kent Quirk](https://github.com/kentquirk) +- fix: set up tls for redis when it's enabled | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: don't read more than max bytes from a request (#1282) | [Kent Quirk](https://github.com/kentquirk) +- fix: allow draining traces even if only 1 peer left (#1278) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: record sample rate in decision cache during stress relief (#1273) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: SpanLimit shouldn't add SendDelay (#1272) | [Kent Quirk](https://github.com/kentquirk) +- fix: Use HTTP/2 for all upstream and peer-to-peer connections (#1269) | [Irving Popovetsky](https://github.com/irvingpop) + +### Maintenance +- maint: Add some extra logging to pubsub systems (#1308) | [Kent Quirk](https://github.com/kentquirk) +- maint: Add warning about cli flags (#1293) | [Tyler Helmuth](https://github.com/TylerHelmuth) +- maint: Delete unused Dockerfile (#1292) | [Tyler Helmuth](https://github.com/TylerHelmuth) +- maint: add a docker'd Redis TLS local setup (#1291) | [Robb Kidd](https://github.com/robbkidd) +- maint: change default for MaxSendMsgSize and MaxRcvMsgSize. 
(#1289) | [Kent Quirk](https://github.com/kentquirk) +- maint: use non-forked cuckoofilter again (#1287) | [Kent Quirk](https://github.com/kentquirk) +- maint(deps): bump the minor-patch group with 13 updates (#1304) | [dependabot[bot]](https://github.com/dependabot) +- maint(deps): bump the minor-patch group with 4 updates (#1262) | [dependabot[bot]](https://github.com/dependabot) +- refactor: Remove error returns from config functions, fix tests. (#1259) | [Kent Quirk](https://github.com/kentquirk) +- docs: fix CacheCapacity documentation (#1267) | [Kent Quirk](https://github.com/kentquirk) + +## 2.7.0 2024-07-29 + +This release incorporates a new publish/subscribe (pubsub) system for faster and cleaner communication between Refinery nodes. +In particular, the way Refinery uses Redis has changed. +See full details in [the Release Notes](./RELEASE_NOTES.md). + +### Features + +- feat: Add metrics to pubsub and peers (#1226) | [Kent Quirk](https://github.com/kentquirk) +- feat: add otel tracing support for Refinery internal operations (#1218) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: Add some useful generics (#1206) | [Kent Quirk](https://github.com/kentquirk) +- feat: gossip config reload information (#1241) | [Kent Quirk](https://github.com/kentquirk) +- feat: Health/Ready system imported from R3 (#1231) | [Kent Quirk](https://github.com/kentquirk) +- feat: peer management on pubsub via callbacks (#1220) | [Kent Quirk](https://github.com/kentquirk) +- feat: track config hash on config reload (#1212) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: use pub/sub for stress relief (#1221) | [Yingrong Zhao](https://github.com/vinozzZ) +- feat: Working, tested, but unused pubsub system (#1205) | [Kent Quirk](https://github.com/kentquirk) + +### Fixes + +- fix: add injection tags for configwatcher (#1246) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: add peer logging, add debug log of peers (#1239) | [Kent Quirk](https://github.com/kentquirk) +- fix: allow a single node to activate stress relief mode during significant load increase (#1256) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: allow sending otel tracing to non honeycomb backend (#1219) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: Change pubsub interface to use callbacks. 
(#1217) | [Kent Quirk](https://github.com/kentquirk) +- fix: clean up a print line (#1250) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: FilePeers implies no Redis (#1251) | [Kent Quirk](https://github.com/kentquirk) +- fix: make sure stress relief pub/sub topic is consistent (#1245) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: make sure to inject Health object as a pointer (#1237) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: Record hashes at startup in metrics (#1252) | [Kent Quirk](https://github.com/kentquirk) +- fix: reduce pub/sub messages from stress relief (#1248) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: remove otel-config-go as a dependency (#1240) | [Yingrong Zhao](https://github.com/vinozzZ) +- fix: remove personal api keys (#1253) | [Kent Quirk](https://github.com/kentquirk) +- fix: Root spans must have a non-empty parent ID field (#1236) | [Mike Goldsmith](https://github.com/MikeGoldsmith) +- fix: sharder should use peer identity from Peers package (#1249) | [Yingrong Zhao](https://github.com/vinozzZ) + +### Maintenance + +- docs: Tweak docs for reload (#1247) | [Kent Quirk](https://github.com/kentquirk) +- docs: update vulnerability reporting process (#1224) | [Robb Kidd](https://github.com/robbkidd) +- maint: add instrumentation for GoRedisPubSub (#1229) | [Yingrong Zhao](https://github.com/vinozzZ) +- maint: Add jitter to peer traffic, fix startup (#1227) | [Kent Quirk](https://github.com/kentquirk) +- maint: change targeted arch to arm for local development Dockerfile (#1228) | [Yingrong Zhao](https://github.com/vinozzZ) +- maint: last changes before the final release prep (#1254) | [Kent Quirk](https://github.com/kentquirk) +- maint: update doc based on config changes (#1243) | [Yingrong Zhao](https://github.com/vinozzZ) +- maint: Update licenses (#1244) | [Tyler Helmuth](https://github.com/TylerHelmuth) +- maint(deps): bump google.golang.org/grpc from 1.64.0 to 1.64.1 (#1223) | [dependabot[bot]](https://github.com/dependabot) +- maint(deps): bump the minor-patch group across 1 directory with 9 updates (#1232) | [dependabot[bot]](https://github.com/dependabot) + + +## 2.6.1 2024-06-17 + +### Fixes + +- fix: Don’t consider log events as root spans (#1208) | @MikeGoldsmith + +### Maintenance + +- maint(deps): bump the minor-patch group with 9 updates (#1198) | @dependabot + +## 2.6.0 2024-06-17 + +### Features + +- feat: Allow URL encoded dataset in libhoney endpoint paths (#1199) | @MikeGoldsmith +- feat: Add OTLP log endpoints (gRPC & HTTP) (#1187) | @MikeGoldsmith + +### Maintenance + +- maint: Bump libhoney-go to v1.23.1 (#1200) | @MikeGoldsmith +- maint: bump libhoney-go to v1.23.0 (#1192) | @MikeGoldsmith +- maint: bump Husky to v0.30.0 (#1190) | @TylerHelmuth + +## 2.5.2 2024-05-22 + +This release fixes a race condition in OTel Metrics that caused Refinery to crash. +This update is recommended for everyone who has OTelMetrics enabled. + +### Fixes + +- fix: correct race condition in OTel metrics (#1165) | [Kent Quirk](https://github.com/kentquirk) + +Thanks to [Joshua Jones](https://github.com/senojj) for the [bug report](https://github.com/honeycombio/refinery/issues/1156) and diagnosis. 
+
+## 2.5.1 2024-05-15
+
+### Fixes
+
+- fix: Clarify what has-root-span does (#1114) | [Phillip Carter](https://github.com/cartermp)
+- fix: Add validation for ingest keys (#1066) | [Kent Quirk](https://github.com/kentquirk)
+- fix: Deal with locking issues at startup (#1060) | [Kent Quirk](https://github.com/kentquirk)
+- fix: Update cache lookup to use read lock (#1145) | [Joshua Jones](https://github.com/senojj)
+
+### Maintenance
+
+- maint: Bump protobuf (#1058) | [Kent Quirk](https://github.com/kentquirk)
+- maint(deps): bump the minor-patch group with 4 updates (#1073) | [dependabot[bot]](https://github.com/dependabot)
+
+## 2.5.0 2024-03-12
+
+The main feature is support for Honeycomb Classic ingest keys; there is also a performance improvement for the new
+`root.` rule feature, and a new metric to track traces dropped by rules.
+
+### Features
+
+- feat: new metric for drops caused by rules (#1047) | [Kent Quirk](https://github.com/kentquirk)
+- feat: Shortcut evaluation of rules containing 'root.' (#1018) | [Kent Quirk](https://github.com/kentquirk)
+- feat: support Classic Ingest Keys (#1043) | [Jason Harley](https://github.com/jharley)
+
+### Fixes
+
+- fix: change validation type for PeerManagement.Peers to be url (#1046) | [Yingrong Zhao](https://github.com/vinozzZ)
+- fix: `defaulttrue` now shows up in docs as `bool` (#1045) | [Kent Quirk](https://github.com/kentquirk)
+- fix: Support 'none' as a logger type (#1034) | [Kent Quirk](https://github.com/kentquirk)
+
+### Maintenance
+
+- maint: add labels to release.yml for auto-generated grouping (#1042) | [Jamie Danielson](https://github.com/JamieDanielson)
+- maint(deps): bump the minor-patch group with 12 updates (#1030) | [dependabot[bot]](https://github.com/dependabot)
+- maint: group minor/patch dep updates (#1028) | [Alex Boten](https://github.com/Alex Boten)
+
+
+## 2.4.3 2024-03-01
+
+A bug fix release for a regression introduced in the 2.4.2 bug fix release.
+It was possible to trigger 500 errors in Refinery's OTLP error responses when sending traces in an unsupported content-type.
+
+### Fixes
+
+- fix: upgrade husky to handle and add tests for invalid content type errors (#1019) | [Mike Goldsmith](https://github.com/MikeGoldsmith) & [Robb Kidd](https://github.com/robbkidd)
+
+## 2.4.2 2024-02-28
+
+This is a bug fix release for returning improperly formatted OTLP error responses.
+OTLP clients receiving the improper response would show errors about parsing the response, masking the error message within the response, which complicated solving data send issues.
+This release is a recommended upgrade for anyone sending OTLP data to Refinery.
+
+### Fixes
+
+- fix: Bring OTLP HTTP error responses in line with spec. (#1010) | [Tyler Helmuth](https://github.com/TylerHelmuth)
+
+## 2.4.1 2024-02-26
+
+This is a bug fix release for matching fields in the root span context.
+
+### Fixes
+
+The implementation in v2.4.0 can crash if the trace's root span is not present at the time a sampling decision is being made.
+Root spans are often not present when the root span is taking longer to complete than the time configured for Refinery to wait for a trace's spans to arrive (`TraceTimeout`).
+This release contains a fix for this crash and is a recommended upgrade for anyone using this new feature.
+
+- fix: handle root prefix when no root span on trace (#1006) | [fchikwekwe](https://github.com/fchikwekwe)
+
+### Maintenance
+
+- refactor: add default true type (#998) | [fchikwekwe](https://github.com/fchikwekwe)
+
+## 2.4.0 2024-02-20
+
+### Features
+
+- Update refinery_rules.md | [fchikwekwe](https://github.com/fchikwekwe)
+- feat: allow user to sample on root span context (#981) | [fchikwekwe](https://github.com/fchikwekwe)
+
+### Fixes
+
+- fix: flaky TestOriginalSampleRateIsNotedInMetaField (#991) | [Robb Kidd](https://github.com/robbkidd)
+- chore: consolidate routine dependency updates (#994) | [Robb Kidd](https://github.com/robbkidd)
+- chore: Revert "chore: fix license tracking (#989)" (#990) | [Robb Kidd](https://github.com/robbkidd)
+- chore: fix license tracking (#989) | [Robb Kidd](https://github.com/robbkidd)
+- fix: allow config bools to default to true (#969) | [Robb Kidd](https://github.com/robbkidd)
+
+### Maintenance
+
+- docs: update configMeta to remove spaces | [fchikwekwe](https://github.com/fchikwekwe)
+- docs: update refinery docs | [fchikwekwe](https://github.com/fchikwekwe)
+- docs: Add sampler default intervals to docs (#995) | [Mike Goldsmith](https://github.com/MikeGoldsmith)
+- docs: include a warning about surprising not-exists behavior (#979) | [Robb Kidd](https://github.com/robbkidd)
+- maint: Refactor cuckoo cache for reusability (#975) | [Yingrong Zhao](https://github.com/vinozzZ)
+- maint: create generic set and use it (#976) | [Kent Quirk](https://github.com/KentQuirk)
+- maint: bump deps for 2.4 (#968) | [fchikwekwe](https://github.com/fchikwekwe)
+- maint: bump Husky (#966) | [Kent Quirk](https://github.com/KentQuirk)
+
+
+## 2.3.0 2023-12-20
+
+### Features
+
+- feat: Add `matches` operator to rules (#939) | [Kent Quirk](https://github.com/kentquirk)
+- feat: Add Fields option for rules (#949) | [Kent Quirk](https://github.com/kentquirk)
+- feat: use a computed field for current descendant count in rules (#950) | [Yingrong Zhao](https://github.com/vinozzZ)
+- feat: add sent reason for late arriving spans (#936) | [Yingrong Zhao](https://github.com/vinozzZ)
+- docs: Add rule conditions documentation (#951) | [Kent Quirk](https://github.com/kentquirk)
+- docs: document stress relief in readme (#955) | [Faith Chikwekwe](https://github.com/fchikwekwe)
+
+### Fixes
+
+- fix: Fix memory size parsing (#944) | [tvdfly](https://github.com/tvdfly)
+- fix: handle otlp request with /v1/traces/ path (#933) | [Yingrong Zhao](https://github.com/vinozzZ)
+
+### Maintenance
+
+- maint: Update `firstversion` for 2.2 (#957) | [Kent Quirk](https://github.com/kentquirk)
+- maint: update codeowners to pipeline (#937) | [Jamie Danielson](https://github.com/JamieDanielson)
+- maint: update codeowners to pipeline-team (#942) | [Jamie Danielson](https://github.com/JamieDanielson)
+- maint: update project workflow for pipeline (#938) | [Jamie Danielson](https://github.com/JamieDanielson)
+- maint: upload test result to circle ci (#940) | [Yingrong Zhao](https://github.com/vinozzZ)
+- maint: use command to check for other commands (#941) | [Robb Kidd](https://github.com/robbkidd)
+- docs: Add section on running tests to contributing guide (#953) | [Mike Goldsmith](https://github.com/MikeGoldsmith)
+- docs: update doc for
release process and config/rules doc generation process (#932) | [Yingrong Zhao](https://github.com/vinozzZ) +- test: Integration tests fail in parallel (#935) | [Kent Quirk](https://github.com/kentquirk) +- test: try to deflake several flaky tests (#934) | [Kent Quirk](https://github.com/kentquirk) +- test: attempt to fix flaky integration tests (#945) | [Yingrong Zhao](https://github.com/vinozzZ) +- test: add deterministic fallback test (#948) | [Faith Chikwekwe](https://github.com/fchikwekwe) +- test: use `t.Setenv` to set env vars in tests (#947) | [Eng Zer Jun](https://github.com/Juneezee) + ## 2.2.0 2023-12-04 This is a minor release with several new configuration options and bug fixes, and is recommended for all Refinery users. See [Release Notes](./RELEASE_NOTES.md) for a summary of changes. @@ -177,8 +449,8 @@ For more information, see [the release notes](https://github.com/honeycombio/ref - docs: Fix up docs, especially envvar and cmdline (#737) | [Kent Quirk](https://github.com/kentquirk) - docs: Fix convert help and docs (#744) | [Kent Quirk](https://github.com/kentquirk) - maint: README updates -- round 1 (#742) | [Phillip Carter](https://github.com/cartermp) -- maint(deps): Bump github.com/klauspost/compress from 1.16.4 to 1.16.5 (#675) | [dependabot[bot]](https://github.com/dependabot[bot]) -- maint(deps): Bump github.com/prometheus/client_golang from 1.14.0 to 1.15.1 (#676) | [dependabot[bot]](https://github.com/dependabot[bot]) +- maint(deps): Bump github.com/klauspost/compress from 1.16.4 to 1.16.5 (#675) | [dependabot[bot]](https://github.com/dependabot) +- maint(deps): Bump github.com/prometheus/client_golang from 1.14.0 to 1.15.1 (#676) | [dependabot[bot]](https://github.com/dependabot) - refactor: Rename fields for clarity in an E&S world (#680) | [Kent Quirk](https://github.com/kentquirk) - maint: Update dependencies (#699) | [Kent Quirk](https://github.com/kentquirk) - docs: Improve generated documentation (#711) | [Kent Quirk](https://github.com/kentquirk) @@ -220,14 +492,14 @@ were already set upstream before refinery sampling for debugging purposes. 
### Maintenance
 
 - chore: Update MetricsReportingInterval in config_complete.toml (#653) | [Davin](https://github.com/Davin)
 - maint: switch dependabot to collection (#660) | [Vera Reynolds](https://github.com/Vera Reynolds)
-- maint(deps): bump google.golang.org/protobuf from 1.28.1 to 1.30.0 (#663) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump github.com/honeycombio/husky from 0.21.0 to 0.22.2 (#662) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump github.com/klauspost/compress from 1.16.3 to 1.16.4 (#661) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump go.uber.org/automaxprocs from 1.5.1 to 1.5.2 (#650) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump github.com/honeycombio/dynsampler-go from 0.3.0 to 0.4.0 (#649) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump google.golang.org/grpc from 1.52.3 to 1.54.0 (#652) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump github.com/honeycombio/husky from 0.21.0 to 0.22.2 (#651) | [dependabot[bot]](https://github.com/dependabot[bot])
-- maint(deps): bump github.com/klauspost/compress from 1.16.0 to 1.16.3 (#648) | [dependabot[bot]](https://github.com/dependabot[bot])
+- maint(deps): bump google.golang.org/protobuf from 1.28.1 to 1.30.0 (#663) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump github.com/honeycombio/husky from 0.21.0 to 0.22.2 (#662) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump github.com/klauspost/compress from 1.16.3 to 1.16.4 (#661) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump go.uber.org/automaxprocs from 1.5.1 to 1.5.2 (#650) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump github.com/honeycombio/dynsampler-go from 0.3.0 to 0.4.0 (#649) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump google.golang.org/grpc from 1.52.3 to 1.54.0 (#652) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump github.com/honeycombio/husky from 0.21.0 to 0.22.2 (#651) | [dependabot[bot]](https://github.com/dependabot)
+- maint(deps): bump github.com/klauspost/compress from 1.16.0 to 1.16.3 (#648) | [dependabot[bot]](https://github.com/dependabot)
 - maint: Add labels to docker image (#640) | [Tyler Helmuth](https://github.com/TylerHelmuth)
 - maint: Add LICENSES dir (#638) | [Tyler Helmuth](https://github.com/TylerHelmuth)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 98a72a0ed9..bc35871878 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,6 +16,14 @@ at the root of the Refinery directory.
 
 `go run ./cmd/refinery/main.go -c config_complete.yaml -r rules_complete.yaml`
 
+# Running Refinery tests locally
+
+Tests require a local installation of Redis. See [the Redis installation docs](https://redis.io/docs/install/install-stack) for how to get it running.
+
+You can run Refinery tests by running the command below at the root of the Refinery directory.
+ +`make test` + # Making changes to configuration code With the new configuration format redesign in v2.0.0, the workflow for making a configuration requires the following steps: diff --git a/LICENSES/github.com/dgryski/go-rendezvous/LICENSE b/LICENSES/github.com/dgryski/go-rendezvous/LICENSE new file mode 100644 index 0000000000..22080f736a --- /dev/null +++ b/LICENSES/github.com/dgryski/go-rendezvous/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2017-2020 Damian Gryski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/LICENSES/github.com/gomodule/redigo/redis/LICENSE b/LICENSES/github.com/gomodule/redigo/redis/LICENSE deleted file mode 100644 index f433b1a53f..0000000000 --- a/LICENSES/github.com/gomodule/redigo/redis/LICENSE +++ /dev/null @@ -1,177 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS diff --git a/LICENSES/github.com/golang/protobuf/LICENSE b/LICENSES/github.com/google/uuid/LICENSE similarity index 83% rename from LICENSES/github.com/golang/protobuf/LICENSE rename to LICENSES/github.com/google/uuid/LICENSE index 0f646931a4..5dc68268d9 100644 --- a/LICENSES/github.com/golang/protobuf/LICENSE +++ b/LICENSES/github.com/google/uuid/LICENSE @@ -1,16 +1,16 @@ -Copyright 2010 The Go Authors. All rights reserved. +Copyright (c) 2009,2014 Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -25,4 +25,3 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - diff --git a/LICENSES/github.com/gorilla/mux/LICENSE b/LICENSES/github.com/gorilla/mux/LICENSE index 6903df6386..bb9d80bc9b 100644 --- a/LICENSES/github.com/gorilla/mux/LICENSE +++ b/LICENSES/github.com/gorilla/mux/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2012-2018 The Gorilla Authors. All rights reserved. +Copyright (c) 2023 The Gorilla Authors. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/LICENSES/github.com/grpc-ecosystem/grpc-gateway/v2/LICENSE.txt b/LICENSES/github.com/grpc-ecosystem/grpc-gateway/v2/LICENSE similarity index 100% rename from LICENSES/github.com/grpc-ecosystem/grpc-gateway/v2/LICENSE.txt rename to LICENSES/github.com/grpc-ecosystem/grpc-gateway/v2/LICENSE diff --git a/LICENSE b/LICENSES/github.com/honeycombio/husky/LICENSE similarity index 100% rename from LICENSE rename to LICENSES/github.com/honeycombio/husky/LICENSE diff --git a/LICENSES/github.com/honeycombio/husky/otlp/NOTICE b/LICENSES/github.com/honeycombio/husky/NOTICE similarity index 100% rename from LICENSES/github.com/honeycombio/husky/otlp/NOTICE rename to LICENSES/github.com/honeycombio/husky/NOTICE diff --git a/LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/LICENSE b/LICENSES/github.com/jonboulle/clockwork/LICENSE similarity index 99% rename from LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/LICENSE rename to LICENSES/github.com/jonboulle/clockwork/LICENSE index 8dada3edaf..5c304d1a4a 100644 --- a/LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/LICENSE +++ b/LICENSES/github.com/jonboulle/clockwork/LICENSE @@ -1,4 +1,4 @@ - Apache License +Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ diff --git a/LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/NOTICE b/LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/NOTICE deleted file mode 100644 index 5d8cb5b72e..0000000000 --- a/LICENSES/github.com/matttproud/golang_protobuf_extensions/pbutil/NOTICE +++ /dev/null @@ -1 +0,0 @@ -Copyright 2012 Matt T. Proud (matt.proud@gmail.com) diff --git a/LICENSES/github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg/README.txt b/LICENSES/github.com/munnerz/goautoneg/LICENSE similarity index 68% rename from LICENSES/github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg/README.txt rename to LICENSES/github.com/munnerz/goautoneg/LICENSE index 7723656d58..bbc7b897cb 100644 --- a/LICENSES/github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg/README.txt +++ b/LICENSES/github.com/munnerz/goautoneg/LICENSE @@ -1,13 +1,3 @@ -PACKAGE - -package goautoneg -import "bitbucket.org/ww/goautoneg" - -HTTP Content-Type Autonegotiation. - -The functions in this package implement the behaviour specified in -http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html - Copyright (c) 2011, Open Knowledge Foundation Ltd. All rights reserved. @@ -39,29 +29,3 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -FUNCTIONS - -func Negotiate(header string, alternatives []string) (content_type string) -Negotiate the most appropriate content_type given the accept header -and a list of alternatives. 
- -func ParseAccept(header string) (accept []Accept) -Parse an Accept Header string returning a sorted list -of clauses - - -TYPES - -type Accept struct { - Type, SubType string - Q float32 - Params map[string]string -} -Structure to represent a clause in an HTTP Accept Header - - -SUBDIRECTORIES - - .hg diff --git a/LICENSES/github.com/vmihailenco/tagparser/LICENSE b/LICENSES/github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil/LICENSE similarity index 83% rename from LICENSES/github.com/vmihailenco/tagparser/LICENSE rename to LICENSES/github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil/LICENSE index 3fc93fdff8..65d761bc9f 100644 --- a/LICENSES/github.com/vmihailenco/tagparser/LICENSE +++ b/LICENSES/github.com/prometheus/client_golang/internal/github.com/golang/gddo/httputil/LICENSE @@ -1,5 +1,4 @@ -Copyright (c) 2019 The github.com/vmihailenco/tagparser Authors. -All rights reserved. +Copyright (c) 2013 The Go Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -11,6 +10,9 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT diff --git a/LICENSES/github.com/prometheus/client_golang/prometheus/NOTICE b/LICENSES/github.com/prometheus/client_golang/prometheus/NOTICE index dd878a30ee..b9cc55abbb 100644 --- a/LICENSES/github.com/prometheus/client_golang/prometheus/NOTICE +++ b/LICENSES/github.com/prometheus/client_golang/prometheus/NOTICE @@ -16,8 +16,3 @@ Go support for Protocol Buffers - Google's data interchange format http://github.com/golang/protobuf/ Copyright 2010 The Go Authors See source code for license details. - -Support for streaming Protocol Buffer messages for the Go language (golang). -https://github.com/matttproud/golang_protobuf_extensions -Copyright 2013 Matt T. Proud -Licensed under the Apache License, Version 2.0 diff --git a/LICENSES/github.com/vmihailenco/msgpack/v4/LICENSE b/LICENSES/github.com/redis/go-redis/v9/LICENSE similarity index 95% rename from LICENSES/github.com/vmihailenco/msgpack/v4/LICENSE rename to LICENSES/github.com/redis/go-redis/v9/LICENSE index b749d07079..f4967dbc5c 100644 --- a/LICENSES/github.com/vmihailenco/msgpack/v4/LICENSE +++ b/LICENSES/github.com/redis/go-redis/v9/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2013 The github.com/vmihailenco/msgpack Authors. +Copyright (c) 2013 The github.com/redis/go-redis Authors. All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/LICENSES/github.com/honeycombio/husky/otlp/LICENSE b/LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlptrace/LICENSE similarity index 100% rename from LICENSES/github.com/honeycombio/husky/otlp/LICENSE rename to LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlptrace/LICENSE diff --git a/LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlpmetric/LICENSE b/LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp/LICENSE similarity index 100% rename from LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlpmetric/LICENSE rename to LICENSES/go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp/LICENSE diff --git a/LICENSES/golang.org/x/net/LICENSE b/LICENSES/golang.org/x/net/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/LICENSES/golang.org/x/net/LICENSE +++ b/LICENSES/golang.org/x/net/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/LICENSES/golang.org/x/sys/unix/LICENSE b/LICENSES/golang.org/x/sys/unix/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/LICENSES/golang.org/x/sys/unix/LICENSE +++ b/LICENSES/golang.org/x/sys/unix/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/LICENSES/golang.org/x/text/LICENSE b/LICENSES/golang.org/x/text/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/LICENSES/golang.org/x/text/LICENSE +++ b/LICENSES/golang.org/x/text/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
diff --git a/Makefile b/Makefile
index e41d26ecc1..37ef348f0e 100644
--- a/Makefile
+++ b/Makefile
@@ -27,6 +27,15 @@ test_all: test_results wait_for_redis
 test_results:
 	@mkdir -p test_results
 
+local_image: export KO_DOCKER_REPO=ko.local
+local_image: export CIRCLE_TAG=$(shell git describe --always --match "v[0-9]*" --tags)
+local_image: export CIRCLE_BRANCH=$(shell git rev-parse --abbrev-ref HEAD)
+local_image: export CIRCLE_SHA1=$(shell git rev-parse HEAD)
+local_image: export CIRCLE_BUILD_NUM=''
+#: build the release image locally, available as "ko.local/refinery:"
+local_image:
+	./build-docker.sh
+
 .PHONY: wait_for_redis
 # wait for Redis to become available for test suite
 wait_for_redis: dockerize
@@ -68,17 +77,23 @@ clean:
 
 .PHONY: install-tools
 install-tools:
-	go install github.com/google/go-licenses@latest
+	go install github.com/google/go-licenses/v2@v2.0.0-alpha.1
 
 .PHONY: update-licenses
 update-licenses: install-tools
 	rm -rf LICENSES; \
-	go-licenses save ./cmd/refinery --save_path LICENSES;
+	#: We ignore the standard library (go list std) as a workaround for \
+	"https://github.com/google/go-licenses/issues/244." The awk script converts the output \
+	of `go list std` (line separated modules) to the input that `--ignore` expects (comma separated modules).
+	go-licenses save --save_path LICENSES --ignore "github.com/honeycombio/refinery" \
+	--ignore $(shell go list std | awk 'NR > 1 { printf(",") } { printf("%s",$$0) } END { print "" }') ./cmd/refinery;
 
 .PHONY: verify-licenses
 verify-licenses: install-tools
-	go-licenses save ./cmd/refinery --save_path temp; \
-	if diff temp LICENSES > /dev/null; then \
+	go-licenses save --save_path temp --ignore "github.com/honeycombio/refinery" \
+	--ignore $(shell go list std | awk 'NR > 1 { printf(",") } { printf("%s",$$0) } END { print "" }') ./cmd/refinery; \
+	chmod +r temp; \
+	if diff temp LICENSES; then \
 	echo "Passed"; \
 	rm -rf temp; \
 	else \
diff --git a/README.md b/README.md
index 065b7ebd84..3a34b517a2 100644
--- a/README.md
+++ b/README.md
@@ -80,12 +80,17 @@ This communication can be managed in two ways: via an explicit list of peers in
 
 ## Configuration
 
-Configuration is controlled by Refinery's two configuration files, which is generally referred to as `config.yaml` for general configuration and `rules.yaml` for sampling configuration.
+Configuration is controlled by Refinery's two configuration files, which are generally referred to as `config.yaml` for general configuration and `rules.yaml` for sampling configuration. These files can be loaded from an accessible filesystem, or loaded with an unauthenticated GET request from a URL.
 
 Learn more about `config.yaml` and all the parameters that control Refinery's operation in our [Refinery configuration documentation](https://docs.honeycomb.io/manage-data-volume/refinery/configuration/).
 
 Learn more about `rules.yaml` and sampler configuration in our [Refinery sampling methods documentation](https://docs.honeycomb.io/manage-data-volume/refinery/sampling-methods/).
 
+It is valid to specify more than one configuration source.
+For example, it would be possible to have a common configuration file, plus a separate file containing only keys.
+On the command line, specify multiple files by repeating the command line switch.
+In environment variables, separate multiple config locations with commas.
+
 ## Running Refinery
 
 Refinery is a typical linux-style command line application, and supports several command line switches.
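+
+As a sketch of the layered-configuration approach described above, Refinery might be started with the `-c` switch repeated (for example, `refinery -c config.yaml -c keys.yaml`), with later files layered over earlier ones. The file contents below are hypothetical, and the `AccessKeys` group name is an assumption based on the v2 configuration format:
+
+```yaml
+# config.yaml -- shared, non-secret settings (hypothetical)
+General:
+  ConfigurationVersion: 2
+
+# keys.yaml -- key material kept in a separate, layered file (hypothetical)
+AccessKeys:
+  ReceiveKeys:
+    - hypothetical-key-1
+  AcceptOnlyListedKeys: true
+```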
@@ -109,12 +114,54 @@ Refinery supports the following key environment variables; please see the comman
 
 Note: `REFINERY_HONEYCOMB_METRICS_API_KEY` takes precedence over `REFINERY_HONEYCOMB_API_KEY` for the `LegacyMetrics.APIKey` configuration.
 
+## Managing Keys
+
+Sending data to Honeycomb requires attaching an API key to telemetry. In order to make managing telemetry easier, Refinery supports the `ReceiveKeys` and `SendKey` config options, along with `AcceptOnlyListedKeys` and `SendKeyMode`. In various combinations, they have a lot of expressive power. Please see the configuration documentation for details on how to set these parameters.
+
+A quick start for specific scenarios is below:
+
+### A small number of services
+* Set keys in your applications the way you normally would, and leave Refinery set to the defaults.
+
+### Large number of services, central key preferred
+* Do not set keys in your applications
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `all`
+
+### Applications must set a key, but control the actual key at Refinery
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `nonblank`
+
+### Replace most keys but permit exceptions
+* Set `ReceiveKeys` to the list of exceptions
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `unlisted`
+
+### Some applications have custom keys, but others should use central key
+* Set custom keys in your applications as needed, leave others blank
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `missingonly`
+
+### Only applications knowing a specific secret should be able to send telemetry, but a central key is preferred
+* Choose an internal secret key (any arbitrary string)
+* Add that secret to `ReceiveKeys`
+* Set `AcceptOnlyListedKeys` to `true`
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `listedonly`
+
+### Replace specific keys used by certain applications with the central key
+* Set `AcceptOnlyListedKeys` to `false`
+* Set `ReceiveKeys` to the keys that should be replaced
+* Set `SendKey` to a valid Honeycomb Key
+* Set `SendKeyMode` to `listedonly`
+
+
 ## Dry Run Mode
 
-When getting started with Refinery or when updating sampling rules, it may be helpful to verify that the rules are working as expected before you start dropping traffic. To do so, use Dry Run Mode in Refinery. 
+When getting started with Refinery or when updating sampling rules, it may be helpful to verify that the rules are working as expected before you start dropping traffic. To do so, use Dry Run Mode in Refinery.
 
 Enable [Dry Run Mode](https://docs.honeycomb.io/manage-data-volume/refinery/sampling-methods/#run-refinery-in-dry-run-mode) by adding `DryRun = true` in your configuration file (`config.yaml`).
 
-Then, use [Query Builder in the Honeycomb UI](https://docs.honeycomb.io/working-with-your-data/queries/) to run queries to check your results and verify that the rules are working as intended. 
+Then, use [Query Builder in the Honeycomb UI](https://docs.honeycomb.io/working-with-your-data/queries/) to run queries to check your results and verify that the rules are working as intended.
 
 When Dry Run Mode is enabled, the metric `trace_send_kept` will increment for each trace, and the metric for `trace_send_dropped` will remain `0`, reflecting that we are sending all traces to Honeycomb.
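+
+For example, a minimal sketch of what this looks like in YAML (assuming the v2 configuration format, where `DryRun` is a flag in the `Debugging` group; check the configuration documentation for your version):
+
+```yaml
+# Hypothetical config.yaml fragment
+Debugging:
+  # When true, Refinery marks each trace with its keep/drop decision
+  # instead of actually dropping any traffic.
+  DryRun: true
+```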
@@ -124,6 +171,23 @@ Refinery uses bounded queues and circular buffers to manage allocating traces, s
 Determining the number of machines necessary in the cluster is not an exact science, and is best influenced by watching for buffer overruns. But for a rough heuristic, count on a single machine using about 2GB of memory to handle 5,000 incoming events and tracking 500 sub-second traces per second (for each full trace lasting less than a second and an average size of 10 spans per trace).
 
+### Stress Relief
+
+Refinery offers a mechanism called `Stress Relief` that improves stability under heavy load.
+The `stress_level` metric is a synthetic metric on a scale from 0 to 100 that is constructed from several Refinery metrics relating to queue sizes and memory usage.
+Under normal operation, its value should usually be in the single digits. During bursts of high traffic, the stress levels might creep up and then drop again as the volume drops. As it approaches 100, it is more and more likely that Refinery will start to fail and possibly crash.
+
+`Stress Relief` is a system that can monitor the `stress_level` metric and shed load when stress becomes a danger to stability. Once the `ActivationLevel` is reached, `Stress Relief` mode will become active. In this state, Refinery will deterministically sample each span based on `TraceID` without having to store the rest of the trace or evaluate rule conditions. `Stress Relief` will remain active until stress falls below the `DeactivationLevel` specified in the config.
+
+The stress relief settings are:
+
+- `Mode` - Setting to indicate how `Stress Relief` is used. `never` indicates that `Stress Relief` will not activate. `monitor` means `Stress Relief` will activate when the `ActivationLevel` is reached and deactivate when the `DeactivationLevel` is reached. `always` means that `Stress Relief` mode will continuously be engaged. The `always` mode is intended for use in emergency situations.
+- `ActivationLevel` - When the stress level rises above this threshold, Refinery will activate `Stress Relief`.
+- `DeactivationLevel` - When the stress level falls below this threshold, Refinery will deactivate `Stress Relief`.
+- `SamplingRate` - The rate at which Refinery samples while `Stress Relief` is active.
+
+The `stress_level` metric is currently the best proxy for the overall load on Refinery. Even if `Stress Relief` is not active, if `stress_level` is frequently above 50, it is a good indicator that Refinery needs more resources -- more CPUs, more memory, or more nodes. On the other hand, if `stress_level` never goes into double digits, it is likely that Refinery is overprovisioned.
+
 ## Understanding Regular Operation
 
 Refinery emits a number of metrics to give some indication about the health of the process. These metrics should be sent to Honeycomb, typically with Open Telemetry, and can also be exposed to Prometheus.
The interesting ones to watch are:

@@ -166,7 +230,7 @@ To retrieve the rule set that Refinery uses for the specified dataset, which wil
 
 ```curl
 curl --include --get $REFINERY_HOST/query/rules/$FORMAT/$DATASET --header "x-honeycomb-refinery-query: my-local-token"
-``` 
+```
 
 To retrieve information about the configurations currently in use, including the timestamp when the configuration was last loaded:
 
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 5cce390f76..77603d9d7d 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,6 +1,206 @@
 # Release Notes
 
-While [CHANGELOG.md](./CHANGELOG.md) contains detailed documentation and links to all of the source code changes in a given release, this document is intended to be aimed at a more comprehensible version of the contents of the release from the point of view of users of Refinery.
+While [CHANGELOG.md](./CHANGELOG.md) contains detailed documentation and links to all the source code changes in a given release, this document aims to be a more comprehensible summary of each release from the point of view of users of Refinery.
+
+## Version 2.8.0
+
+This release has several significant changes that make Refinery easier to operate at scale.
+
+### Draining during Shutdown
+
+When one Refinery in a cluster shuts down, it now:
+
+* Immediately marks itself as "not ready" so that load balancers will stop sending it data
+* Immediately removes itself from the peer list so that other Refineries will stop sending it data
+* Recalculates the appropriate destination for all the traces it has been tracking, and forwards those spans to the correct instance.
+
+When other Refineries in a cluster see that the number of peers has changed, they:
+
+* Check all the traces they have been tracking; for any that belong to a different Refinery, all the spans are forwarded to the new destination. (On average, in an N-node cluster, 1/N of the existing traces will need to move.)
+
+### Redis Cluster Support
+
+Refinery now supports Redis instances deployed in Cluster Mode. There is a new configuration parameter, `ClusterHosts`, which can be used to enable this support.
+
+This should make it easier to configure AWS ElastiCache for use with Refinery, since ElastiCache now uses Redis Cluster Mode by default.
+
+In addition, Refinery now supports the use of TLS for communications with Redis.
+
+### SpanLimit
+
+Until this release, Refinery has marked a trace for a trace decision when either:
+
+* The root span arrives
+* The TraceTimeout expires
+
+Release 2.8 introduces a new feature, `SpanLimit`, which provides a third way to cause Refinery to make a trace decision. It sets the maximum number of descendants that a trace can have before it gets marked for a trace decision. This helps limit the memory consumed by very large traces.
+
+Suppose, for example, that a service generates a single trace with 10,000 spans. If SpanLimit is set to 1000, then once the first 1000 spans have arrived, Refinery will immediately make a decision to keep or drop the trace. Every additional span is dispatched (using the same decision) without storing it. This means that Refinery never has to keep all 10,000 spans in memory at one time.
+
+For installations that sometimes see very large traces, this feature can have a significant impact on memory usage within a cluster, and can effectively prevent one Refinery in a cluster from running out of memory due to a big trace.
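+
+As a hedged illustration (this assumes `SpanLimit` sits alongside the other trace-decision settings in the `Traces` group of the YAML config; the values are arbitrary):
+
+```yaml
+Traces:
+  TraceTimeout: 60s  # existing trigger: decide when the timeout expires
+  SpanLimit: 1000    # new trigger: decide once a trace reaches 1,000 descendants
+```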
+
+### `in` and `not-in` Operators in Rules
+
+This release introduces `in` and `not-in` operators for rules. These operators allow the `Value` field to contain a list of values, and efficiently test for the presence or absence of a particular span field within that list.
+A potential use for these operators is to keep or drop traces originating from a specific list of services.
+
+### More flexible API key management with `SendKey` and `SendKeyMode`
+
+Using the new `SendKey` configuration option, this release allows an API key to be deployed alongside Refinery rather than with the sources of telemetry.
+The `SendKeyMode` value allows `SendKey` to be used (along with the existing `ReceiveKeys` value) in a variety of modes depending on your security requirements.
+
+### Other Improvements
+
+* Refinery rules now allow specifying `root.` prefixes for fields in dynamic samplers.
+* The performance of the drop cache has been improved, which should help with stability for systems with a very high drop rate.
+* The default maximum message sizes for OTLP have been increased from 5MB to 15MB.
+* It is now possible to specify multiple config files, which allows a layered approach to configuration (separating keys from other configuration, for example).
+
+
+## Version 2.7.0
+
+This is a minor release focused on better cluster stability and data quality, with a new system for communicating peer information across nodes.
+As a result, clusters should generally behave more consistently.
+
+Refinery 2.7 lays the groundwork for substantial future changes to Refinery.
+
+### Publish/Subscribe on Redis
+In this release, Redis is no longer used as a database for storing a list of peers.
+Instead, it is used as a more general publish/subscribe framework for rapidly sharing information between nodes in the cluster.
+The things shared over this connection are:
+
+- Peer membership
+- Stress levels
+- News of configuration changes
+
+Because of this mechanism, Refinery will now react more quickly to changes in any of these factors.
+When one node detects a configuration change, all of its peers will be told about it immediately.
+
+In addition, Refinery now publishes individual stress levels between peers.
+Nodes calculate a cluster stress level as a weighted average (with nodes that are more stressed getting more weight).
+If an individual node is stressed, it can enter stress relief individually.
+This may happen, for example, when a single giant trace is concentrated on one node.
+If the cluster as a whole is being stressed by a general burst in traffic, the entire cluster should now enter or leave stress relief at approximately the same time.
+
+If your existing Redis instance is particularly small, you may find it necessary to increase its CPU or network allocations.
+
+### Health checks now include both liveness and readiness
+
+Refinery previously had only a liveness check on `/alive`, which simply always returned ok.
+
+Starting with this release, Refinery supports both `/alive` and `/ready`, which are based on internal status reporting.
+
+The liveness check is alive whenever Refinery is awake and its internal systems are functional.
+It will return a failure if any of the monitored systems fails to report in time.
+
+The readiness check returns ready whenever the monitored systems indicate readiness.
+It will return a failure if any internal system returns not ready.
+This is usually used to indicate to a load balancer that no new traffic should go to this node.
+In this release, this will only happen when a Refinery node is shutting down.
+
+### Metrics changes
+There have also been some minor changes to metrics in this release:
+
+We have two new metrics called `individual_stress_level` (the stress level as seen by a single node) and `cluster_stress_level` (the aggregated cluster level).
+The `stress_level` metric indicates the maximum of the two values; it is this value which is used to determine whether an individual node activates stress relief.
+
+There is also a new pair of metrics, `config_hash` and `rule_config_hash`.
+These are numeric Gauge metrics that are set to the numeric value of the last 4 hex digits of the hash of the current config files.
+These can be used to verify that all Refinery nodes are using the same configuration files.
+
+### Disabling Redis and using a static list of peers
+Specifying `PeerManagement.Type=file` will cause Refinery to use the fixed list of peers found in the configuration.
+This means that Refinery will operate without sharing changes to peers, stress, or configuration, as it has in previous releases.
+
+### Config Change notifications
+When deploying a cluster in Kubernetes, it is often the case that configurations are managed as a ConfigMap.
+In the default setup, ConfigMaps are eventually consistent.
+This may mean that one Refinery node will detect a configuration change and broadcast news of it, but a different node that receives the news will attempt to read the data and get the previous configuration.
+In this situation, the change will still be detected by all Refineries within the `ConfigReloadInterval`.
+
+## Version 2.6.1
+
+This is a bug fix release.
+In the log handling logic newly introduced in v2.6.0, Refinery would incorrectly consider log events to be root spans in a trace.
+After this fix, log events can never be root spans.
+This release is recommended for everyone who wants to use the new log handling capabilities.
+
+## Version 2.6.0
+
+With this release, Refinery begins the process of integrating multiple telemetry signal types by handling logs as well as traces.
+Refinery now handles the OpenTelemetry `/logs` endpoints over both gRPC and HTTP:
+- Log records that are associated with a trace by including a TraceID are sampled alongside the trace's spans.
+- Log records that do not have a TraceID are treated like regular events and forwarded directly to Honeycomb.
+
+It also includes support for URL encoded dataset names in the non-OpenTelemetry URL paths.
+
+## Version 2.5.2
+
+This release fixes a race condition in OTel Metrics that caused Refinery to crash.
+This update is recommended for everyone who has OTelMetrics enabled.
+
+## Version 2.5.1
+
+This is a bug fix release for a concurrent map read panic when loading items from the internal cache.
+It also includes improvements for validation of ingest keys and resolves a lock issue during startup.
+
+## Version 2.5
+
+This release's main new feature is support for Honeycomb Classic ingest keys.
+There is also a performance improvement for the new `root.` rule feature, and a new metric to track traces dropped by rules.
+This release is a recommended upgrade for anyone wishing to use ingest keys within a Honeycomb Classic environment.
+
+## Version 2.4.3
+
+A bug fix release for a regression introduced in the 2.4.2 bug fix release.
+It was possible to trigger 500 errors in Refinery's OTLP error responses when sending traces with an unsupported content type.
+This release is a recommended upgrade for anyone sending OTLP data to Refinery.
+
+## Version 2.4.2
+
+This is a bug fix release for returning improperly formatted OTLP error responses.
+OTLP clients receiving the improper response would show errors about parsing the response, masking the error message within the response, which complicated solving data send issues.
+This release is a recommended upgrade for anyone sending OTLP data to Refinery.
+
+## Version 2.4.1
+
+This is a bug fix release for matching fields in the root span context.
+
+The implementation in v2.4.0 can crash if the trace's root span is not present at the time a sampling decision is being made.
+Root spans are often not present when the root span is taking longer to complete than the time configured for Refinery to wait for a trace's spans to arrive (`TraceTimeout`).
+This release contains a fix for this crash and is a recommended upgrade for anyone using this new feature.
+
+## Version 2.4.0
+
+This release includes an update to allow users to specify root span context in their rules. It also includes some bug
+fixes, improvements, and dependency updates.
+
+### Root Span Context
+
+Users can now specify rules that match only the root span of a trace (e.g., `root.http.status`).
+
+### Notable Fixes
+* Previously, rules with a default of boolean `true` that were set to `false` by configuration would be overridden back to `true` when defaults were applied to the config. We have fixed this by using the `*bool` type for these values as well as adding helper functions to avoid strange behavior related to how booleans work in Go.
+
+## Version 2.3.0
+
+This release is mainly focused on some improvements to rules and bug fixes. It is recommended for all Refinery users.
+
+### Rules Improvements
+
+Users of Rules-based samplers have several new features with this release:
+
+* A new `matches` operator can match the contents of fields using a regular expression. The regular expression language supported is the one used by the Go programming language. This will enable certain advanced rules that were previously impossible to achieve. It should be noted that complex regular expressions can be significantly slower than normal comparisons, so use this feature with the appropriate level of caution.
+* A new `Fields` parameter may be used in conditions instead of `Field`. This parameter takes a list of field names, and evaluates the rule based on the first name that matches an existing field. This is intended to be used mainly when telemetry field names are changing, to avoid having to create duplicated rules.
+* Refinery now supports a "virtual" `Field` called `?.NUM_DESCENDANTS`. This field is evaluated as the current number of descendants in a trace, even if the root span has not arrived. This permits a rule that correctly evaluates the number of spans in a trace even if the trace is exceptionally long-lived. This sort of rule can be used to drop exceptionally large traces to avoid sending them to Honeycomb.
+* There is a [new documentation page](rules_conditions.md) in this repository containing detailed information on constructing rule conditions.
+
+### Other Notable Changes
+
+* Previously, spans that arrived after the trace decision had been made were simply marked with `meta.refinery.reason: late`. Now, Refinery will remember and attach the reason used when the trace decision was made.
+* MemorySize parameters in config can now accept a floating point value like `2.5Gb`, which is more compatible with values used in Kubernetes. This should help eliminate bugs in Helm charts.
+* OTLP requests to `/v1/traces/` will now be accepted along with `/v1/traces`, which eliminates a minor annoyance when configuring upstream senders.
+* There were many improvements to testing and documentation that should improve quality of life for contributors.
 
 ## Version 2.2.0
diff --git a/RELEASING.md b/RELEASING.md
index 0f89fa1127..c05c09e2f3 100644
--- a/RELEASING.md
+++ b/RELEASING.md
@@ -3,7 +3,9 @@
 
 1. Check that licenses are current with `make verify-licenses`
 2. Regenerate documentation with `make all` from within the `tools/convert` folder. If there have
been changes to `rules.md`, you may need to manually modify the `rules_complete.yaml` to reflect the same change.
-3. If either `refinery_config.md` or `refinery_rules.md` were modified in this release, you must also copy these files to [docs](https://github.com/honeycombio/docs) and do a docs PR. Address any feedback from the the docs team and apply that feedback back into this repo.
+3. If either `refinery_config.md` or `refinery_rules.md` were modified in this release, you must also open a [docs](https://github.com/honeycombio/docs) PR and update these files there under `layouts/shortcodes/subpages/refinery/`.
+   Replace the underscores (`_`) in the filenames with a dash (`-`) or the docs linter will be upset.
+   Address any feedback from the docs team and apply that feedback back into this repo.
 4. After addressing any docs change, add release entry to [changelog](./CHANGELOG.md)
    - Use below command to get a list of all commits since last release
    ```
@@ -14,13 +16,13 @@ been changes to `rules.md`, you may need to manually modify the `rules_complete.
    (the `git log` command can't do this automatically)
    - organize each commit based on their prefix into below three categories:
    ```
-   ## Features
+   ### Features
    -
-   ## Fixes
+   ### Fixes
    -
-   ## Maintenance
+   ### Maintenance
    -
    ```
 5. Add a summary of release changes to [release notes](./RELEASE_NOTES.md)
diff --git a/SECURITY.md b/SECURITY.md
index c0ce73b5ca..71d0fbb120 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,3 +1,26 @@
-# Reporting Security Issues
+# Security Policy
 
-If you discover a security vulnerability, please open an issue with label `type: security`.
+This security policy applies to public projects under the [honeycombio organization][gh-organization] on GitHub.
+For security reports involving the services provided at `(ui|ui-eu|api|api-eu).honeycomb.io`, refer to the [Honeycomb Bug Bounty Program][bugbounty] for scope, expectations, and reporting procedures.
+
+## Security/Bugfix Versions
+
+Security and bug fixes are generally provided only for the last minor version.
+Fixes are released either as part of the next minor version or as an on-demand patch version.
+
+Security fixes are given priority and might be enough to cause a new version to be released.
+
+## Reporting a Vulnerability
+
+We encourage responsible disclosure of security vulnerabilities.
+If you find something suspicious, we encourage and appreciate your report!
+
+### Ways to report
+
+So that vulnerability reports reach the maintainers as quickly as possible, the preferred way to report is the "Report a vulnerability" button under the "Security" tab of the associated GitHub project.
+This creates a private communication channel between the reporter and the maintainers.
+ +If you are absolutely unable to or have strong reasons not to use GitHub's vulnerability reporting workflow, please reach out to the Honeycomb security team at [security@honeycomb.io](mailto:security@honeycomb.io). + +[gh-organization]: https://github.com/honeycombio +[bugbounty]: https://www.honeycomb.io/bugbountyprogram diff --git a/app/app.go b/app/app.go index 9bacdda238..de1ee309a3 100644 --- a/app/app.go +++ b/app/app.go @@ -25,11 +25,33 @@ type App struct { // Start exits, Stop will be called on all dependencies then on App then the // program will exit. func (a *App) Start() error { - a.Logger.Debug().Logf("Starting up App...") + // little helper function to record the current config and rules hashes; we call it in + // the callback but also at startup + record_hashes := func(msg string) { + cfgHash, rulesHash := a.Config.GetHashes() + if a.Logger != nil { + a.Logger.Warn().WithFields(map[string]interface{}{ + "configHash": cfgHash, + "rulesHash": rulesHash, + }).Logf(msg) + } + cfgMetric := config.ConfigHashMetrics(cfgHash) + ruleMetric := config.ConfigHashMetrics(rulesHash) + a.Metrics.Gauge("config_hash", cfgMetric) + a.Metrics.Gauge("rule_config_hash", ruleMetric) + } + a.Logger.Debug().Logf("Starting up App...") + a.Metrics.Register("config_hash", "gauge") + a.Metrics.Register("rule_config_hash", "gauge") a.IncomingRouter.SetVersion(a.Version) a.PeerRouter.SetVersion(a.Version) + record_hashes("loaded configuration at startup") + a.Config.RegisterReloadCallback(func(configHash, rulesHash string) { + record_hashes("configuration change was detected and the configuration was reloaded.") + }) + // launch our main routers to listen for incoming event traffic from both peers // and external sources a.IncomingRouter.LnS("incoming") diff --git a/app/app_test.go b/app/app_test.go index 71fbf37354..3b3904cc4c 100644 --- a/app/app_test.go +++ b/app/app_test.go @@ -18,14 +18,18 @@ import ( "github.com/facebookgo/inject" "github.com/facebookgo/startstop" + "github.com/jonboulle/clockwork" "github.com/klauspost/compress/zstd" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" "gopkg.in/alexcesaro/statsd.v2" "github.com/honeycombio/libhoney-go" "github.com/honeycombio/libhoney-go/transmission" "github.com/honeycombio/refinery/collect" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" @@ -84,17 +88,6 @@ func (w *countingWriterSender) waitForCount(t testing.TB, target int) { } } -type testPeers struct { - peers []string -} - -func (p *testPeers) GetPeers() ([]string, error) { - return p.peers, nil -} - -func (p *testPeers) RegisterUpdatedPeersCallback(callback func()) { -} - func newStartedApp( t testing.TB, libhoneyT transmission.Sender, @@ -103,29 +96,33 @@ func newStartedApp( enableHostMetadata bool, ) (*App, inject.Graph) { c := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 10 * time.Millisecond, - GetMaxBatchSizeVal: 500, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(10 * time.Millisecond), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 
1}, - SendTickerVal: 2 * time.Millisecond, PeerManagementType: "file", GetUpstreamBufferSizeVal: 10000, GetPeerBufferSizeVal: 10000, GetListenAddrVal: "127.0.0.1:" + strconv.Itoa(basePort), GetPeerListenAddrVal: "127.0.0.1:" + strconv.Itoa(basePort+1), - IsAPIKeyValidFunc: func(k string) bool { return k == legacyAPIKey || k == nonLegacyAPIKey }, GetHoneycombAPIVal: "http://api.honeycomb.io", - GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10000}, + GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10000, ShutdownDelay: config.Duration(1 * time.Second)}, AddHostMetadataToTrace: enableHostMetadata, TraceIdFieldNames: []string{"trace.trace_id"}, ParentIdFieldNames: []string{"trace.parent_id"}, SampleCache: config.SampleCacheConfig{KeptSize: 10000, DroppedSize: 100000, SizeCheckInterval: config.Duration(10 * time.Second)}, + GetAccessKeyConfigVal: config.AccessKeyConfig{ + ReceiveKeys: []string{legacyAPIKey, nonLegacyAPIKey}, + AcceptOnlyListedKeys: true, + }, } var err error if peers == nil { - peers, err = peer.NewPeers(context.Background(), c, make(chan struct{})) - assert.NoError(t, err) + peers = &peer.FilePeers{Cfg: c, Metrics: &metrics.NullMetrics{}} } a := App{} @@ -164,7 +161,7 @@ func newStartedApp( sdPeer, _ := statsd.New(statsd.Prefix("refinery.peer")) peerClient, err := libhoney.NewClient(libhoney.ClientConfig{ Transmission: &transmission.Honeycomb{ - MaxBatchSize: c.GetMaxBatchSize(), + MaxBatchSize: c.GetTracesConfigVal.MaxBatchSize, BatchTimeout: libhoney.DefaultBatchTimeout, MaxConcurrentBatches: libhoney.DefaultMaxConcurrentBatches, PendingWorkCapacity: uint(c.GetPeerBufferSize()), @@ -191,6 +188,7 @@ func newStartedApp( &inject.Object{Value: transmit.NewDefaultTransmission(upstreamClient, metricsr, "upstream"), Name: "upstreamTransmission"}, &inject.Object{Value: transmit.NewDefaultTransmission(peerClient, metricsr, "peer"), Name: "peerTransmission"}, &inject.Object{Value: shrdr}, + &inject.Object{Value: noop.NewTracerProvider().Tracer("test"), Name: "tracer"}, &inject.Object{Value: collector}, &inject.Object{Value: metricsr, Name: "metrics"}, &inject.Object{Value: metricsr, Name: "genericMetrics"}, @@ -198,6 +196,8 @@ func newStartedApp( &inject.Object{Value: metricsr, Name: "peerMetrics"}, &inject.Object{Value: "test", Name: "version"}, &inject.Object{Value: samplerFactory}, + &inject.Object{Value: &health.Health{}}, + &inject.Object{Value: clockwork.NewRealClock()}, &inject.Object{Value: &collect.MockStressReliever{}, Name: "stressRelief"}, &inject.Object{Value: &a}, ) @@ -226,8 +226,8 @@ func TestAppIntegration(t *testing.T) { t.Parallel() port := 10500 - var out bytes.Buffer - _, graph := newStartedApp(t, &transmission.WriterSender{W: &out}, port, nil, false) + sender := &transmission.MockSender{} + app, graph := newStartedApp(t, sender, port, nil, false) // Send a root span, it should be sent in short order. 
req := httptest.NewRequest( @@ -243,13 +243,19 @@ func TestAppIntegration(t *testing.T) { assert.Equal(t, http.StatusOK, resp.StatusCode) resp.Body.Close() + time.Sleep(5 * app.Config.GetTracesConfig().GetSendTickerValue()) + + require.EventuallyWithT(t, func(collect *assert.CollectT) { + events := sender.Events() + require.Len(collect, events, 1) + assert.Equal(collect, "dataset", events[0].Dataset) + assert.Equal(collect, "bar", events[0].Data["foo"]) + assert.Equal(collect, "1", events[0].Data["trace.trace_id"]) + assert.Equal(collect, uint(1), events[0].Data["meta.refinery.original_sample_rate"]) + }, 2*time.Second, 10*time.Millisecond) + err = startstop.Stop(graph.Objects(), nil) assert.NoError(t, err) - - assert.Eventually(t, func() bool { - return out.Len() > 62 - }, 5*time.Second, 2*time.Millisecond) - assert.Equal(t, `{"data":{"foo":"bar","meta.refinery.original_sample_rate":1,"trace.trace_id":"1"},"dataset":"dataset"}`+"\n", out.String()) } func TestAppIntegrationWithNonLegacyKey(t *testing.T) { @@ -257,8 +263,8 @@ func TestAppIntegrationWithNonLegacyKey(t *testing.T) { t.Parallel() port := 10600 - var out bytes.Buffer - a, graph := newStartedApp(t, &transmission.WriterSender{W: &out}, port, nil, false) + sender := &transmission.MockSender{} + a, graph := newStartedApp(t, sender, port, nil, false) a.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) a.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) @@ -276,14 +282,20 @@ func TestAppIntegrationWithNonLegacyKey(t *testing.T) { assert.Equal(t, http.StatusOK, resp.StatusCode) resp.Body.Close() + // Wait for span to be sent. + var events []*transmission.Event + require.Eventually(t, func() bool { + events = sender.Events() + return len(events) == 1 + }, 2*time.Second, 2*time.Millisecond) + + assert.Equal(t, "dataset", events[0].Dataset) + assert.Equal(t, "bar", events[0].Data["foo"]) + assert.Equal(t, "1", events[0].Data["trace.trace_id"]) + assert.Equal(t, uint(1), events[0].Data["meta.refinery.original_sample_rate"]) + err = startstop.Stop(graph.Objects(), nil) assert.NoError(t, err) - - // Wait for span to be sent. - assert.Eventually(t, func() bool { - return out.Len() > 62 - }, 5*time.Second, 2*time.Millisecond) - assert.Equal(t, `{"data":{"foo":"bar","meta.refinery.original_sample_rate":1,"trace.trace_id":"1"},"dataset":"dataset"}`+"\n", out.String()) } func TestAppIntegrationWithUnauthorizedKey(t *testing.T) { @@ -291,8 +303,8 @@ func TestAppIntegrationWithUnauthorizedKey(t *testing.T) { t.Parallel() port := 10700 - var out bytes.Buffer - a, graph := newStartedApp(t, &transmission.WriterSender{W: &out}, port, nil, false) + sender := &transmission.MockSender{} + a, graph := newStartedApp(t, sender, port, nil, false) a.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) a.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) @@ -307,7 +319,7 @@ func TestAppIntegrationWithUnauthorizedKey(t *testing.T) { resp, err := http.DefaultTransport.RoundTrip(req) assert.NoError(t, err) - assert.Equal(t, 400, resp.StatusCode) + assert.Equal(t, 401, resp.StatusCode) data, err := io.ReadAll(resp.Body) resp.Body.Close() assert.NoError(t, err) @@ -321,24 +333,20 @@ func TestPeerRouting(t *testing.T) { // Parallel integration tests need different ports! 
t.Parallel() - peers := &testPeers{ - peers: []string{ - "http://localhost:11001", - "http://localhost:11003", - }, - } + peerList := []string{"http://localhost:11001", "http://localhost:11003"} var apps [2]*App - var addrs [2]string var senders [2]*transmission.MockSender for i := range apps { var graph inject.Graph basePort := 11000 + (i * 2) senders[i] = &transmission.MockSender{} + peers := &peer.MockPeers{ + Peers: peerList, + ID: peerList[i], + } apps[i], graph = newStartedApp(t, senders[i], basePort, peers, false) defer startstop.Stop(graph.Objects(), nil) - - addrs[i] = "localhost:" + strconv.Itoa(basePort) } // Deliver to host 1, it should be passed to host 0 and emitted there. @@ -395,8 +403,8 @@ func TestPeerRouting(t *testing.T) { } assert.Equal(t, expectedEvent, senders[0].Events()[0]) - // Repeat, but deliver to host 1 on the peer channel, it should not be - // passed to host 0. + // Repeat, but deliver to host 1 on the peer channel, it should be + // passed to host 0 since that's who the trace belongs to. req, err = http.NewRequest( "POST", "http://localhost:11003/1/batch/dataset", @@ -409,22 +417,22 @@ func TestPeerRouting(t *testing.T) { req.Body = io.NopCloser(strings.NewReader(blob)) post(t, req) assert.Eventually(t, func() bool { - return len(senders[1].Events()) == 1 + return len(senders[0].Events()) == 1 }, 2*time.Second, 2*time.Millisecond) assert.Equal(t, expectedEvent, senders[0].Events()[0]) } func TestHostMetadataSpanAdditions(t *testing.T) { t.Parallel() + port := 14000 - var out bytes.Buffer - _, graph := newStartedApp(t, &transmission.WriterSender{W: &out}, 14000, nil, true) - hostname, _ := os.Hostname() + sender := &transmission.MockSender{} + app, graph := newStartedApp(t, sender, port, nil, true) // Send a root span, it should be sent in short order. 
req := httptest.NewRequest( "POST", - "http://localhost:14000/1/batch/dataset", + fmt.Sprintf("http://localhost:%d/1/batch/dataset", port), strings.NewReader(`[{"data":{"foo":"bar","trace.trace_id":"1"}}]`), ) req.Header.Set("X-Honeycomb-Team", legacyAPIKey) @@ -435,38 +443,46 @@ func TestHostMetadataSpanAdditions(t *testing.T) { assert.Equal(t, http.StatusOK, resp.StatusCode) resp.Body.Close() - err = startstop.Stop(graph.Objects(), nil) - assert.NoError(t, err) + time.Sleep(5 * app.Config.GetTracesConfig().GetSendTickerValue()) - assert.Eventually(t, func() bool { - return out.Len() > 62 - }, 5*time.Second, 2*time.Millisecond) + var events []*transmission.Event + require.Eventually(t, func() bool { + events = sender.Events() + return len(events) == 1 + }, 2*time.Second, 10*time.Millisecond) - expectedSpan := `{"data":{"foo":"bar","meta.refinery.local_hostname":"%s","meta.refinery.original_sample_rate":1,"trace.trace_id":"1"},"dataset":"dataset"}` + "\n" - assert.Equal(t, fmt.Sprintf(expectedSpan, hostname), out.String()) + assert.Equal(t, "dataset", events[0].Dataset) + assert.Equal(t, "bar", events[0].Data["foo"]) + assert.Equal(t, "1", events[0].Data["trace.trace_id"]) + assert.Equal(t, uint(1), events[0].Data["meta.refinery.original_sample_rate"]) + hostname, _ := os.Hostname() + assert.Equal(t, hostname, events[0].Data["meta.refinery.local_hostname"]) + + err = startstop.Stop(graph.Objects(), nil) + assert.NoError(t, err) } func TestEventsEndpoint(t *testing.T) { t.Parallel() - peers := &testPeers{ - peers: []string{ - "http://localhost:13001", - "http://localhost:13003", - }, + peerList := []string{ + "http://localhost:13001", + "http://localhost:13003", } var apps [2]*App - var addrs [2]string var senders [2]*transmission.MockSender for i := range apps { var graph inject.Graph basePort := 13000 + (i * 2) senders[i] = &transmission.MockSender{} + peers := &peer.MockPeers{ + Peers: peerList, + ID: peerList[i], + } + apps[i], graph = newStartedApp(t, senders[i], basePort, peers, false) defer startstop.Stop(graph.Objects(), nil) - - addrs[i] = "localhost:" + strconv.Itoa(basePort) } // Deliver to host 1, it should be passed to host 0 and emitted there. @@ -512,8 +528,8 @@ func TestEventsEndpoint(t *testing.T) { senders[0].Events()[0], ) - // Repeat, but deliver to host 1 on the peer channel, it should not be - // passed to host 0. + // Repeat, but deliver to host 1 on the peer channel, it should be + // passed to host 0 since that's the host this trace belongs to. 
blob = blob[:0] buf := bytes.NewBuffer(blob) @@ -535,7 +551,7 @@ func TestEventsEndpoint(t *testing.T) { post(t, req) assert.Eventually(t, func() bool { - return len(senders[1].Events()) == 1 + return len(senders[0].Events()) == 1 }, 2*time.Second, 2*time.Millisecond) assert.Equal( @@ -555,36 +571,36 @@ func TestEventsEndpoint(t *testing.T) { "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "", - "enqueued_at": senders[1].Events()[0].Metadata.(map[string]any)["enqueued_at"], + "enqueued_at": senders[0].Events()[0].Metadata.(map[string]any)["enqueued_at"], }, }, - senders[1].Events()[0], + senders[0].Events()[0], ) } func TestEventsEndpointWithNonLegacyKey(t *testing.T) { t.Parallel() - peers := &testPeers{ - peers: []string{ - "http://localhost:15001", - "http://localhost:15003", - }, + peerList := []string{ + "http://localhost:15001", + "http://localhost:15003", } var apps [2]*App - var addrs [2]string var senders [2]*transmission.MockSender for i := range apps { basePort := 15000 + (i * 2) senders[i] = &transmission.MockSender{} + peers := &peer.MockPeers{ + Peers: peerList, + ID: peerList[i], + } + app, graph := newStartedApp(t, senders[i], basePort, peers, false) app.IncomingRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) app.PeerRouter.SetEnvironmentCache(time.Second, func(s string) (string, error) { return "test", nil }) apps[i] = app defer startstop.Stop(graph.Objects(), nil) - - addrs[i] = "localhost:" + strconv.Itoa(basePort) } // this traceID was chosen because it hashes to the appropriate shard for this @@ -634,7 +650,7 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { senders[0].Events()[0], ) - // Repeat, but deliver to host 1 on the peer channel, it should not be + // Repeat, but deliver to host 1 on the peer channel, it should be // passed to host 0. 
blob = blob[:0] @@ -657,7 +673,7 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { post(t, req) assert.Eventually(t, func() bool { - return len(senders[1].Events()) == 1 + return len(senders[0].Events()) == 1 }, 2*time.Second, 2*time.Millisecond) assert.Equal( @@ -677,10 +693,10 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "test", - "enqueued_at": senders[1].Events()[0].Metadata.(map[string]any)["enqueued_at"], + "enqueued_at": senders[0].Events()[0].Metadata.(map[string]any)["enqueued_at"], }, }, - senders[1].Events()[0], + senders[0].Events()[0], ) } @@ -828,14 +844,12 @@ func BenchmarkDistributedTraces(b *testing.B) { }, } - peers := &testPeers{ - peers: []string{ - "http://localhost:12001", - "http://localhost:12003", - "http://localhost:12005", - "http://localhost:12007", - "http://localhost:12009", - }, + peerList := []string{ + "http://localhost:12001", + "http://localhost:12003", + "http://localhost:12005", + "http://localhost:12007", + "http://localhost:12009", } var apps [5]*App @@ -843,6 +857,11 @@ func BenchmarkDistributedTraces(b *testing.B) { for i := range apps { var graph inject.Graph basePort := 12000 + (i * 2) + peers := &peer.MockPeers{ + Peers: peerList, + ID: peerList[i], + } + apps[i], graph = newStartedApp(b, sender, basePort, peers, false) defer startstop.Stop(graph.Objects(), nil) diff --git a/cmd/refinery/main.go b/cmd/refinery/main.go index 389e16fed9..efc176dab7 100644 --- a/cmd/refinery/main.go +++ b/cmd/refinery/main.go @@ -1,7 +1,6 @@ package main import ( - "context" "fmt" "net" "net/http" @@ -11,6 +10,8 @@ import ( "syscall" "time" + "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" _ "go.uber.org/automaxprocs" "golang.org/x/exp/slices" @@ -18,14 +19,19 @@ import ( "github.com/facebookgo/startstop" libhoney "github.com/honeycombio/libhoney-go" "github.com/honeycombio/libhoney-go/transmission" + "github.com/jonboulle/clockwork" "github.com/sirupsen/logrus" "github.com/honeycombio/refinery/app" "github.com/honeycombio/refinery/collect" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/configwatcher" + "github.com/honeycombio/refinery/internal/health" + "github.com/honeycombio/refinery/internal/otelutil" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" "github.com/honeycombio/refinery/sample" "github.com/honeycombio/refinery/service/debug" "github.com/honeycombio/refinery/sharder" @@ -96,12 +102,6 @@ func main() { fmt.Println("Config and Rules validated successfully.") os.Exit(0) } - c.RegisterReloadCallback(func() { - if a.Logger != nil { - a.Logger.Info().Logf("configuration change was detected and the configuration was reloaded") - } - }) - // get desired implementation for each dependency to inject lgr := logger.GetLoggerImplementation(c) collector := collect.GetCollectorImplementation(c) @@ -116,14 +116,35 @@ func main() { os.Exit(1) } - ctx, cancel := context.WithTimeout(context.Background(), c.GetPeerTimeout()) - defer cancel() + // when refinery receives a shutdown signal, we need to + // immediately let its peers know so they can stop sending + // data to it. 
done := make(chan struct{}) - peers, err := peer.NewPeers(ctx, c, done) - - if err != nil { - fmt.Printf("unable to load peers: %+v\n", err) - os.Exit(1) + // set up the peer management and pubsub implementations + var peers peer.Peers + var pubsubber pubsub.PubSub + ptype := c.GetPeerManagementType() + switch ptype { + case "file": + // In the case of file peers, we do not use Redis for anything, including pubsub, so + // we use the local pubsub implementation. Even if we have multiple peers, these + // peers cannot communicate using pubsub. + peers = &peer.FilePeers{Done: done} + pubsubber = &pubsub.LocalPubSub{} + case "redis": + // if we're using redis, we need to set it up for both peers and pubsub + peers = &peer.RedisPubsubPeers{Done: done} + pubsubber = &pubsub.GoRedisPubSub{} + case "fly-dns": + flydnsPeer, err := peer.NewDnsPeers(c, done) + if err != nil { + panic(fmt.Sprintf("error loading fly-dns: %s", err)) + } + peers = flydnsPeer + pubsubber = &pubsub.LocalPubSub{} + default: + // this should have been caught by validation + panic("invalid config option 'PeerManagement.Type'") } // upstreamTransport is the http transport used to send things on to Honeycomb @@ -133,6 +154,7 @@ func main() { Timeout: 10 * time.Second, }).Dial, TLSHandshakeTimeout: 15 * time.Second, + ForceAttemptHTTP2: true, } // peerTransport is the http transport used to send things to a local peer @@ -142,6 +164,7 @@ func main() { Timeout: 3 * time.Second, }).Dial, TLSHandshakeTimeout: 1200 * time.Millisecond, + ForceAttemptHTTP2: true, } genericMetricsRecorder := metrics.NewMetricsPrefixer("") @@ -151,8 +174,8 @@ func main() { userAgentAddition := "refinery/" + version upstreamClient, err := libhoney.NewClient(libhoney.ClientConfig{ Transmission: &transmission.Honeycomb{ - MaxBatchSize: c.GetMaxBatchSize(), - BatchTimeout: c.GetBatchTimeout(), + MaxBatchSize: c.GetTracesConfig().GetMaxBatchSize(), + BatchTimeout: time.Duration(c.GetTracesConfig().GetBatchTimeout()), MaxConcurrentBatches: libhoney.DefaultMaxConcurrentBatches, PendingWorkCapacity: uint(c.GetUpstreamBufferSize()), UserAgentAddition: userAgentAddition, @@ -169,8 +192,8 @@ func main() { peerClient, err := libhoney.NewClient(libhoney.ClientConfig{ Transmission: &transmission.Honeycomb{ - MaxBatchSize: c.GetMaxBatchSize(), - BatchTimeout: c.GetBatchTimeout(), + MaxBatchSize: c.GetTracesConfig().GetMaxBatchSize(), + BatchTimeout: time.Duration(c.GetTracesConfig().GetBatchTimeout()), MaxConcurrentBatches: libhoney.DefaultMaxConcurrentBatches, PendingWorkCapacity: uint(c.GetPeerBufferSize()), UserAgentAddition: userAgentAddition, @@ -204,6 +227,18 @@ func main() { oTelMetrics = &metrics.OTelMetrics{} } + resourceLib := "refinery" + resourceVer := version + tracer := trace.Tracer(noop.Tracer{}) + shutdown := func() {} + + if c.GetOTelTracingConfig().Enabled { + // let's set up some OTel tracing + tracer, shutdown = otelutil.SetupTracing(c.GetOTelTracingConfig(), resourceLib, resourceVer) + } + + defer shutdown() + // we need to include all the metrics types so we can inject them in case they're needed var g inject.Graph if opts.Debug { @@ -212,6 +247,7 @@ func main() { objects := []*inject.Object{ {Value: c}, {Value: peers}, + {Value: pubsubber}, {Value: lgr}, {Value: upstreamTransport, Name: "upstreamTransport"}, {Value: peerTransport, Name: "peerTransport"}, @@ -222,6 +258,8 @@ func main() { {Value: legacyMetrics, Name: "legacyMetrics"}, {Value: promMetrics, Name: "promMetrics"}, {Value: oTelMetrics, Name: "otelMetrics"}, + {Value: tracer, Name: 
"tracer"}, // we need to use a named injection here because trace.Tracer's struct fields are all private + {Value: clockwork.NewRealClock()}, {Value: metricsSingleton, Name: "metrics"}, {Value: genericMetricsRecorder, Name: "genericMetrics"}, {Value: upstreamMetricsRecorder, Name: "upstreamMetrics"}, @@ -229,6 +267,8 @@ func main() { {Value: version, Name: "version"}, {Value: samplerFactory}, {Value: stressRelief, Name: "stressRelief"}, + {Value: &health.Health{}}, + {Value: &configwatcher.ConfigWatcher{}}, {Value: &a}, } err = g.Provide(objects...) @@ -265,6 +305,14 @@ func main() { os.Exit(1) } + // Now that all components are started, we can notify our peers that we are ready + // to receive data. + err = peers.Ready() + if err != nil { + fmt.Printf("failed to start peer management: %v\n", err) + os.Exit(1) + } + // these have to be done after the injection (of metrics) // these are the metrics that libhoney will emit; we preregister them so that they always appear libhoneyMetricsName := map[string]string{ diff --git a/collect/cache/cache.go b/collect/cache/cache.go index 6fc6d19c91..1c7c3ca12d 100644 --- a/collect/cache/cache.go +++ b/collect/cache/cache.go @@ -3,6 +3,7 @@ package cache import ( "time" + "github.com/honeycombio/refinery/generics" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/types" @@ -17,11 +18,19 @@ type Cache interface { // GetAll is used during shutdown to get all in-flight traces to flush them GetAll() []*types.Trace + // GetCacheCapacity returns the number of traces that can be stored in the cache + GetCacheCapacity() int + // Retrieve and remove all traces which are past their SendBy date. // Does not check whether they've been sent. TakeExpiredTraces(now time.Time) []*types.Trace + + // RemoveTraces accepts a set of trace IDs and removes any matching ones from + RemoveTraces(toDelete generics.Set[string]) } +var _ Cache = (*DefaultInMemCache)(nil) + // DefaultInMemCache keeps a bounded number of entries to avoid growing memory // forever. Traces are expunged from the cache in insertion order (not access // order) so it is important to have a cache larger than trace throughput * @@ -67,7 +76,7 @@ func NewInMemCache( } -func (d *DefaultInMemCache) GetCacheSize() int { +func (d *DefaultInMemCache) GetCacheCapacity() int { return len(d.traceBuffer) } @@ -165,13 +174,13 @@ func (d *DefaultInMemCache) TakeExpiredTraces(now time.Time) []*types.Trace { // RemoveTraces accepts a set of trace IDs and removes any matching ones from // the insertion list. This is used in the case of a cache overrun. 
-func (d *DefaultInMemCache) RemoveTraces(toDelete map[string]struct{}) { +func (d *DefaultInMemCache) RemoveTraces(toDelete generics.Set[string]) { d.Metrics.Gauge("collect_cache_capacity", float64(len(d.traceBuffer))) d.Metrics.Histogram("collect_cache_entries", float64(len(d.cache))) for i, t := range d.traceBuffer { if t != nil { - if _, ok := toDelete[t.TraceID]; ok { + if toDelete.Contains(t.TraceID) { d.traceBuffer[i] = nil delete(d.cache, t.TraceID) } diff --git a/collect/cache/cache_test.go b/collect/cache/cache_test.go index 0c267d0f45..4ca35e8ecc 100644 --- a/collect/cache/cache_test.go +++ b/collect/cache/cache_test.go @@ -1,9 +1,11 @@ package cache import ( + "fmt" "testing" "time" + "github.com/honeycombio/refinery/generics" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/types" @@ -91,13 +93,7 @@ func TestRemoveSentTraces(t *testing.T) { c.Set(t) } - deletes := map[string]struct{}{ - "1": {}, - "3": {}, - "4": {}, - "5": {}, // not present - } - + deletes := generics.NewSet("1", "3", "4", "5") c.RemoveTraces(deletes) all := c.GetAll() @@ -145,3 +141,88 @@ func TestSkipOldUnsentTraces(t *testing.T) { // make sure we kicked out #4 assert.Equal(t, traces[3], prev) } + +// Benchamark the cache's Set method +func BenchmarkCache_Set(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + c := NewInMemCache(100000, s, &logger.NullLogger{}) + now := time.Now() + traces := make([]*types.Trace, 0, b.N) + for i := 0; i < b.N; i++ { + traces = append(traces, &types.Trace{ + TraceID: "trace" + fmt.Sprint(i), + SendBy: now.Add(time.Duration(i) * time.Second), + }) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + c.Set(traces[i]) + } +} + +// Benchmark the cache's Get method +func BenchmarkCache_Get(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + c := NewInMemCache(100000, s, &logger.NullLogger{}) + now := time.Now() + traces := make([]*types.Trace, 0, b.N) + for i := 0; i < b.N; i++ { + traces = append(traces, &types.Trace{ + TraceID: "trace" + fmt.Sprint(i), + SendBy: now.Add(time.Duration(i) * time.Second), + }) + c.Set(traces[i]) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + c.Get(traces[i].TraceID) + } +} + +// Benchmark the cache's TakeExpiredTraces method +func BenchmarkCache_TakeExpiredTraces(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + c := NewInMemCache(100000, s, &logger.NullLogger{}) + now := time.Now() + traces := make([]*types.Trace, 0, b.N) + for i := 0; i < b.N; i++ { + traces = append(traces, &types.Trace{ + TraceID: "trace" + fmt.Sprint(i), + SendBy: now.Add(time.Duration(i) * time.Second), + }) + c.Set(traces[i]) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + c.TakeExpiredTraces(now.Add(time.Duration(i) * time.Second)) + } +} + +// Benchmark the cache's RemoveTraces method +func BenchmarkCache_RemoveTraces(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + c := NewInMemCache(100000, s, &logger.NullLogger{}) + now := time.Now() + traces := make([]*types.Trace, 0, b.N) + for i := 0; i < b.N; i++ { + traces = append(traces, &types.Trace{ + TraceID: "trace" + fmt.Sprint(i), + SendBy: now.Add(time.Duration(i) * time.Second), + }) + c.Set(traces[i]) + } + + deletes := generics.NewSetWithCapacity[string](b.N / 2) + for i := 0; i < b.N/2; i++ { + deletes.Add("trace" + fmt.Sprint(i)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + c.RemoveTraces(deletes) + } +} diff --git a/collect/cache/cuckoo.go b/collect/cache/cuckoo.go 
index 4cf228127d..e6c4e4774d 100644 --- a/collect/cache/cuckoo.go +++ b/collect/cache/cuckoo.go @@ -56,47 +56,52 @@ func NewCuckooTraceChecker(capacity uint, m metrics.Metrics) *CuckooTraceChecker // To try to avoid blocking on Add, we have a goroutine that pulls from a // channel and adds to the filter. go func() { - for { - n := len(c.addch) - if n == 0 { - // if the channel is empty, wait for a bit - time.Sleep(AddQueueSleepTime) - continue + ticker := time.NewTicker(AddQueueSleepTime) + for range ticker.C { + // as long as there's anything still in the channel, keep trying to drain it + for len(c.addch) > 0 { + c.drain() } - c.drain() } }() return c } -// This function records all the traces that were in the channel at the -// start of the call. The idea is to add them all under a single lock. We -// tested limiting it so as to not hold the lock for too long, but it didn't -// seem to matter and it made the code more complicated. -// We track a histogram metric about lock time, though, so we can watch it. +// This function records all the traces that were in the channel at the start of +// the call. The idea is to add as many as possible under a single lock. We do +// limit our lock hold time to 1ms, so if we can't add them all in that time, we +// stop and let the next call pick up the rest. We track a histogram metric +// about lock time. func (c *CuckooTraceChecker) drain() { n := len(c.addch) if n == 0 { return } - lockStart := time.Now() c.mut.Lock() + // we don't start the timer until we have the lock, because we don't want to be counting + // the time we're waiting for the lock. + lockStart := time.Now() + timeout := time.NewTimer(1 * time.Millisecond) outer: for i := 0; i < n; i++ { select { case t := <-c.addch: - c.current.Insert([]byte(t)) + s := []byte(t) + c.current.Insert(s) // don't add anything to future if it doesn't exist yet if c.future != nil { - c.future.Insert([]byte(t)) + c.future.Insert(s) } + case <-timeout.C: + break outer default: // if the channel is empty, stop break outer } } c.mut.Unlock() + timeout.Stop() qlt := time.Since(lockStart) c.met.Histogram(AddQueueLockTime, qlt.Microseconds()) } diff --git a/collect/cache/cuckooSentCache.go b/collect/cache/cuckooSentCache.go index 838b4f0709..281a3178ba 100644 --- a/collect/cache/cuckooSentCache.go +++ b/collect/cache/cuckooSentCache.go @@ -6,6 +6,7 @@ import ( lru "github.com/hashicorp/golang-lru/v2" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/generics" "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/types" ) @@ -25,19 +26,34 @@ type keptTraceCacheEntry struct { spanEventCount uint32 // number of span events in the trace spanLinkCount uint32 // number of span links in the trace spanCount uint32 // number of spans in the trace + reason uint32 // which rule was used to decide to keep the trace } -func NewKeptTraceCacheEntry(trace *types.Trace) *keptTraceCacheEntry { - if trace == nil { +// KeptTrace is an interface for a trace that was kept. +// It contains all the information we need to remember about the trace. 
+type KeptTrace interface { + ID() string + SampleRate() uint + DescendantCount() uint32 + SpanEventCount() uint32 + SpanLinkCount() uint32 + SpanCount() uint32 + SetSentReason(uint) + SentReason() uint +} + +func NewKeptTraceCacheEntry(t KeptTrace) *keptTraceCacheEntry { + if t == nil { return &keptTraceCacheEntry{} } return &keptTraceCacheEntry{ - rate: uint32(trace.SampleRate), - eventCount: trace.DescendantCount(), - spanEventCount: trace.SpanEventCount(), - spanLinkCount: trace.SpanLinkCount(), - spanCount: trace.SpanCount(), + rate: uint32(t.SampleRate()), + eventCount: t.DescendantCount(), + spanEventCount: t.SpanEventCount(), + spanLinkCount: t.SpanLinkCount(), + spanCount: t.SpanCount(), + reason: uint32(t.SentReason()), } } @@ -116,13 +132,19 @@ func (t *cuckooDroppedRecord) SpanCount() uint { func (t *cuckooDroppedRecord) Count(*types.Span) { } +func (t *cuckooDroppedRecord) Reason() uint { + return 0 +} + // Make sure it implements TraceSentRecord var _ TraceSentRecord = (*cuckooDroppedRecord)(nil) type cuckooSentCache struct { - kept *lru.Cache[string, *keptTraceCacheEntry] - dropped *CuckooTraceChecker - cfg config.SampleCacheConfig + met metrics.Metrics + kept *lru.Cache[string, *keptTraceCacheEntry] + dropped *CuckooTraceChecker + recentDroppedIDs *generics.SetWithTTL[string] + cfg config.SampleCacheConfig // The done channel is used to decide when to terminate the monitor // goroutine. When resizing the cache, we write to the channel, but @@ -132,7 +154,8 @@ type cuckooSentCache struct { done chan struct{} // This mutex is for managing kept traces - keptMut sync.Mutex + keptMut sync.Mutex + sentReasons *SentReasonsCache } // Make sure it implements TraceSentCache @@ -144,12 +167,29 @@ func NewCuckooSentCache(cfg config.SampleCacheConfig, met metrics.Metrics) (Trac return nil, err } dropped := NewCuckooTraceChecker(cfg.DroppedSize, met) + // we want to keep track of the most recent dropped traces so we can avoid + // checking them in the dropped filter, which can have contention issues + // under high load. So we use a cache with TTL to keep track of the most + // recent dropped trace IDs, which lets us avoid checking the dropped filter + // for them for a short period of time. This means that when a whole batch + // of spans from the same trace arrives late, we don't have to check the + // dropped filter for each one. Benchmarks indicate that the Set cache is + // maybe 2-4x faster than the cuckoo filter and it also avoids lock + // contention issues in the cuckoo filter, so in practical use saves more + // than that. The TTL in this cache is short, because it's refreshed on each + // request. + recentDroppedIDs := generics.NewSetWithTTL[string](3 * time.Second) + + met.Register("cache_recent_dropped_traces", "gauge") cache := &cuckooSentCache{ - kept: stc, - dropped: dropped, - cfg: cfg, - done: make(chan struct{}), + met: met, + kept: stc, + dropped: dropped, + recentDroppedIDs: recentDroppedIDs, + cfg: cfg, + sentReasons: NewSentReasonsCache(met), + done: make(chan struct{}), } go cache.monitor() return cache, nil @@ -162,6 +202,10 @@ func (c *cuckooSentCache) monitor() { select { case <-ticker.C: c.dropped.Maintain() + // Length() returns the number of items in the cache and it will + // clean up any expired items. 
+ numOfDroppedIDs := c.recentDroppedIDs.Length() + c.met.Gauge("cache_recent_dropped_traces", numOfDroppedIDs) case <-c.done: return } @@ -173,25 +217,35 @@ func (c *cuckooSentCache) Stop() { close(c.done) } -func (c *cuckooSentCache) Record(trace *types.Trace, keep bool) { +func (c *cuckooSentCache) Record(trace KeptTrace, keep bool, reason string) { if keep { // record this decision in the sent record LRU for future spans + trace.SetSentReason(c.sentReasons.Set(reason)) sentRecord := NewKeptTraceCacheEntry(trace) c.keptMut.Lock() defer c.keptMut.Unlock() - c.kept.Add(trace.TraceID, sentRecord) + c.kept.Add(trace.ID(), sentRecord) + return } - // if we're not keeping it, save it in the dropped trace filter - c.dropped.Add(trace.TraceID) + // if we're not keeping it, save it in the recentDroppedIDs cache + c.recentDroppedIDs.Add(trace.ID()) + // and also save it in the dropped trace filter + c.dropped.Add(trace.ID()) } -func (c *cuckooSentCache) Check(span *types.Span) (TraceSentRecord, bool) { - // was it dropped? +func (c *cuckooSentCache) CheckSpan(span *types.Span) (TraceSentRecord, string, bool) { + // was it recently dropped? + if c.recentDroppedIDs.Contains(span.TraceID) { + c.recentDroppedIDs.Add(span.TraceID) // refresh the TTL on this key + return &cuckooDroppedRecord{}, "", true + } + // was it in the drop cache? if c.dropped.Check(span.TraceID) { + c.recentDroppedIDs.Add(span.TraceID) // we recognize it as dropped, so just say so; there's nothing else to do - return &cuckooDroppedRecord{}, false + return &cuckooDroppedRecord{}, "", true } // was it kept? c.keptMut.Lock() @@ -199,10 +253,11 @@ func (c *cuckooSentCache) Check(span *types.Span) (TraceSentRecord, bool) { if sentRecord, found := c.kept.Get(span.TraceID); found { // if we kept it, then this span being checked needs counting too sentRecord.Count(span) - return sentRecord, true + reason, _ := c.sentReasons.Get(uint(sentRecord.reason)) + return sentRecord, reason, true } // we have no memory of this place - return nil, false + return nil, "", false } func (c *cuckooSentCache) Resize(cfg config.SampleCacheConfig) error { @@ -237,3 +292,23 @@ func (c *cuckooSentCache) Resize(cfg config.SampleCacheConfig) error { go c.monitor() return nil } + +// CheckTrace checks if a trace was kept or dropped, and returns the reason if it was kept. +// The bool return value is true if the trace was found in the cache. +// It does not modify the count information. +func (c *cuckooSentCache) CheckTrace(traceID string) (TraceSentRecord, string, bool) { + // was it dropped? + if c.dropped.Check(traceID) { + // we recognize it as dropped, so just say so; there's nothing else to do + return &cuckooDroppedRecord{}, "", true + } + // was it kept? 
+ c.keptMut.Lock() + defer c.keptMut.Unlock() + if sentRecord, found := c.kept.Get(traceID); found { + reason, _ := c.sentReasons.Get(uint(sentRecord.reason)) + return sentRecord, reason, true + } + // we have no memory of this place + return nil, "", false +} diff --git a/collect/cache/cuckoo_test.go b/collect/cache/cuckoo_test.go index da3ecc20b5..881a715448 100644 --- a/collect/cache/cuckoo_test.go +++ b/collect/cache/cuckoo_test.go @@ -11,14 +11,15 @@ import ( ) // genID returns a random hex string of length numChars -func genID(numChars int) string { - seed := 3565269841805 +var seed = 3565269841805 +var rng = wyhash.Rng(seed) - const charset = "abcdef0123456789" +const charset = "abcdef0123456789" +func genID(numChars int) string { id := make([]byte, numChars) for i := 0; i < numChars; i++ { - id[i] = charset[int(wyhash.Rng(seed))%len(charset)] + id[i] = charset[int(rng.Next()%uint64(len(charset)))] } return string(id) } diff --git a/collect/cache/sent_reason_cache_test.go b/collect/cache/sent_reason_cache_test.go new file mode 100644 index 0000000000..f46cc731f3 --- /dev/null +++ b/collect/cache/sent_reason_cache_test.go @@ -0,0 +1,147 @@ +package cache_test + +import ( + "fmt" + "math/rand" + "strconv" + "sync" + "testing" + "time" + + "github.com/honeycombio/refinery/collect/cache" + "github.com/honeycombio/refinery/metrics" + "github.com/stretchr/testify/assert" +) + +func TestSentReasonCache(t *testing.T) { + s := &metrics.MockMetrics{} + s.Start() + c := cache.NewSentReasonsCache(s) + keys := make([]uint, 0) + entries := []string{"foo", "bar", "baz"} + for _, item := range entries { + keys = append(keys, c.Set(item)) + } + for i, key := range keys { + item, ok := c.Get(key) + assert.True(t, ok, "key %d should exist", key) + assert.Equal(t, entries[i], item) + } +} + +func BenchmarkSentReasonCache_Set(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + for _, numItems := range []int{10, 100, 1000, 10000, 100000} { + entries := make([]string, numItems) + for i := 0; i < numItems; i++ { + entries[i] = randomString(50) + } + b.Run(strconv.Itoa(numItems), func(b *testing.B) { + cache := cache.NewSentReasonsCache(s) + for i := 0; i < b.N; i++ { + cache.Set(entries[seededRand.Intn(numItems)]) + } + }) + } +} +func BenchmarkSentReasonCache_Get(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + for _, numItems := range []int{10, 100, 1000, 10000, 100000} { + cache := cache.NewSentReasonsCache(s) + for i := 0; i < numItems; i++ { + cache.Set(randomString(50)) + } + b.Run(strconv.Itoa(numItems), func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = cache.Get(uint(seededRand.Intn(numItems))) + } + }) + } +} + +func BenchmarkSentReasonsCache_Get_Parallel(b *testing.B) { + for _, numGoroutines := range []int{1, 50, 300} { + for _, numUniqueEntries := range []int{50, 500, 2000} { + b.Run(fmt.Sprintf("entries%d-g%d", numUniqueEntries, numGoroutines), func(b *testing.B) { + s := &metrics.MockMetrics{} + s.Start() + cache := cache.NewSentReasonsCache(s) + + entries := make([]string, numUniqueEntries) + for i := 0; i < numUniqueEntries; i++ { + entries[i] = randomString(50) + cache.Set(entries[i]) + } + + wg := sync.WaitGroup{} + count := b.N / numGoroutines + if count == 0 { + count = 1 + } + b.ResetTimer() + for g := 0; g < numGoroutines; g++ { + wg.Add(1) + go func() { + for n := 0; n < count; n++ { + _, _ = cache.Get(uint(count % numUniqueEntries)) + } + wg.Done() + }() + } + wg.Wait() + }) + } + } +} + +func 
+
+func BenchmarkSentReasonsCache_Set_Parallel(b *testing.B) {
+	for _, numGoroutines := range []int{1, 50, 300} {
+		for _, numUniqueEntries := range []int{50, 500, 2000} {
+			b.Run(fmt.Sprintf("entries%d-g%d", numUniqueEntries, numGoroutines), func(b *testing.B) {
+				s := &metrics.MockMetrics{}
+				s.Start()
+				entries := make([]string, numUniqueEntries)
+				for i := 0; i < numUniqueEntries; i++ {
+					entries[i] = randomString(50)
+				}
+				cache := cache.NewSentReasonsCache(s)
+				wg := sync.WaitGroup{}
+				count := b.N / numGoroutines
+				if count == 0 {
+					count = 1
+				}
+				b.ResetTimer()
+				for g := 0; g < numGoroutines; g++ {
+					wg.Add(1)
+					go func() {
+						for n := 0; n < count; n++ {
+							_ = cache.Set(entries[n%numUniqueEntries])
+						}
+						wg.Done()
+					}()
+				}
+				wg.Wait()
+			})
+		}
+	}
+}
+
+const charset = "abcdefghijklmnopqrstuvwxyz" +
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+
+var seededRand *rand.Rand = rand.New(
+	rand.NewSource(time.Now().UnixNano()))
+
+func stringWithCharset(length int, charset string) string {
+	b := make([]byte, length)
+	for i := range b {
+		b[i] = charset[seededRand.Intn(len(charset))]
+	}
+	return string(b)
+}
+
+func randomString(length int) string {
+	return stringWithCharset(length, charset)
+}
diff --git a/collect/cache/sent_reasons_cache.go b/collect/cache/sent_reasons_cache.go
new file mode 100644
index 0000000000..5b9e35bf6e
--- /dev/null
+++ b/collect/cache/sent_reasons_cache.go
@@ -0,0 +1,58 @@
+package cache
+
+import (
+	"math/rand"
+
+	"github.com/dgryski/go-wyhash"
+	"github.com/honeycombio/refinery/metrics"
+)
+
+// SentReasonsCache is a cache of reasons a trace was sent.
+// It acts as a mapping between the string representation of a send reason
+// and a uint.
+// This is used to reduce the memory footprint of the trace cache.
+// It is not concurrency-safe.
+
+type SentReasonsCache struct {
+	Metrics metrics.Metrics
+
+	data []string
+	keys map[uint64]uint32
+
+	hashSeed uint64
+}
+
+// NewSentReasonsCache returns a new SentReasonsCache.
+func NewSentReasonsCache(metrics metrics.Metrics) *SentReasonsCache {
+	metrics.Register("collect_sent_reasons_cache_entries", "histogram")
+
+	return &SentReasonsCache{
+		Metrics:  metrics,
+		keys:     make(map[uint64]uint32),
+		hashSeed: rand.Uint64(),
+	}
+}
+
+// Set adds a new reason to the cache, returning the key.
+// The key is generated by incrementing a counter.
+func (c *SentReasonsCache) Set(key string) uint {
+	// generate a hash
+	hash := wyhash.Hash([]byte(key), c.hashSeed)
+
+	val, ok := c.keys[hash]
+	if !ok {
+		c.data = append(c.data, key)
+		val = uint32(len(c.data))
+		c.keys[hash] = val
+		c.Metrics.Increment("collect_sent_reasons_cache_entries")
+	}
+	return uint(val)
+}
+
+// Get returns a reason from the cache, if it exists.
+func (c *SentReasonsCache) Get(key uint) (string, bool) {
+	if key == 0 || int(key) > len(c.data) {
+		return "", false
+	}
+	return c.data[key-1], true
+}
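The design point of the interning above: N traces that share one sampling reason cost a single stored string plus N small integers, instead of N string copies. A usage sketch against the API added in this file; metrics.NullMetrics is this repo's no-op metrics implementation, and the reason strings are made up:

package main

import (
	"fmt"

	"github.com/honeycombio/refinery/collect/cache"
	"github.com/honeycombio/refinery/metrics"
)

func main() {
	src := cache.NewSentReasonsCache(&metrics.NullMetrics{})

	// Identical strings hash to the same key, so the string is stored once.
	k1 := src.Set("rules/keep: status_code >= 500")
	k2 := src.Set("rules/keep: status_code >= 500")
	fmt.Println(k1 == k2) // true

	reason, ok := src.Get(k1)
	fmt.Println(reason, ok) // rules/keep: status_code >= 500 true

	_, ok = src.Get(0) // key 0 is reserved to mean "no reason recorded"
	fmt.Println(ok)    // false
}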
diff --git a/collect/cache/traceSentCache.go b/collect/cache/traceSentCache.go
index 59d5f7cf05..f8bf8c3531 100644
--- a/collect/cache/traceSentCache.go
+++ b/collect/cache/traceSentCache.go
@@ -24,10 +24,13 @@ type TraceSentRecord interface {
 
 type TraceSentCache interface {
 	// Record preserves the record of a trace being sent or not.
-	Record(trace *types.Trace, keep bool)
-	// Check tests if a trace corresponding to the span is in the cache; if found, it returns the appropriate TraceSentRecord and true,
+	Record(trace KeptTrace, keep bool, reason string)
+	// CheckTrace tests if a trace is in the cache; if found, it returns the appropriate TraceSentRecord and true, else nil and false.
+	// It does not modify the count information.
+	CheckTrace(traceID string) (TraceSentRecord, string, bool)
+	// CheckSpan tests if a trace corresponding to the span is in the cache; if found, it returns the appropriate TraceSentRecord and true,
 	// else nil and false.
-	Check(span *types.Span) (TraceSentRecord, bool)
+	CheckSpan(span *types.Span) (TraceSentRecord, string, bool)
 	// Stop halts the cache in preparation for shutdown
 	Stop()
 	// Resize adjusts the size of the cache according to the Config passed in
diff --git a/collect/collect.go b/collect/collect.go
index 87e939d59e..fcd6430015 100644
--- a/collect/collect.go
+++ b/collect/collect.go
@@ -1,24 +1,37 @@
 package collect
 
 import (
+	"context"
 	"errors"
+	"fmt"
+	"math"
+	"math/rand"
 	"os"
 	"runtime"
 	"sort"
 	"sync"
 	"time"
 
+	"go.opentelemetry.io/otel/trace"
+
 	"github.com/honeycombio/refinery/collect/cache"
 	"github.com/honeycombio/refinery/config"
+	"github.com/honeycombio/refinery/generics"
+	"github.com/honeycombio/refinery/internal/health"
+	"github.com/honeycombio/refinery/internal/otelutil"
+	"github.com/honeycombio/refinery/internal/peer"
 	"github.com/honeycombio/refinery/logger"
 	"github.com/honeycombio/refinery/metrics"
 	"github.com/honeycombio/refinery/sample"
+	"github.com/honeycombio/refinery/sharder"
 	"github.com/honeycombio/refinery/transmit"
 	"github.com/honeycombio/refinery/types"
+	"github.com/jonboulle/clockwork"
 	"github.com/sirupsen/logrus"
 )
 
 var ErrWouldBlock = errors.New("not adding span, channel buffer is full")
+var CollectorHealthKey = "collector"
 
 type Collector interface {
 	// AddSpan adds a span to be collected, buffered, and merged into a trace.
@@ -28,7 +41,7 @@
 	AddSpanFromPeer(*types.Span) error
 	Stressed() bool
 	GetStressedSampleRate(traceID string) (rate uint, keep bool, reason string)
-	ProcessSpanImmediately(sp *types.Span, keep bool, sampleRate uint, reason string)
+	ProcessSpanImmediately(sp *types.Span) (processed bool, keep bool)
 }
 
 func GetCollectorImplementation(c config.Config) Collector {
@@ -39,18 +52,26 @@
 const (
 	TraceSendGotRoot        = "trace_send_got_root"
 	TraceSendExpired        = "trace_send_expired"
+	TraceSendSpanLimit      = "trace_send_span_limit"
 	TraceSendEjectedFull    = "trace_send_ejected_full"
 	TraceSendEjectedMemsize = "trace_send_ejected_memsize"
+	TraceSendLateSpan       = "trace_send_late_span"
 )
 
 // InMemCollector is a single threaded collector.
type InMemCollector struct { - Config config.Config `inject:""` - Logger logger.Logger `inject:""` + Config config.Config `inject:""` + Logger logger.Logger `inject:""` + Clock clockwork.Clock `inject:""` + Tracer trace.Tracer `inject:"tracer"` + Health health.Recorder `inject:""` + Sharder sharder.Sharder `inject:""` + Transmission transmit.Transmission `inject:"upstreamTransmission"` Metrics metrics.Metrics `inject:"genericMetrics"` SamplerFactory *sample.SamplerFactory `inject:""` StressRelief StressReliever `inject:"stressRelief"` + Peers peer.Peers `inject:""` // For test use only BlockOnAddSpan bool @@ -58,15 +79,17 @@ type InMemCollector struct { // mutex must be held whenever non-channel internal fields are accessed. // This exists to avoid data races in tests and startup/shutdown. mutex sync.RWMutex + cache cache.Cache - cache cache.Cache datasetSamplers map[string]sample.Sampler sampleTraceCache cache.TraceSentCache - incoming chan *types.Span - fromPeer chan *types.Span - reload chan struct{} + incoming chan *types.Span + fromPeer chan *types.Span + reload chan struct{} + done chan struct{} + redistributeTimer *redistributeNotifier hostname string } @@ -74,24 +97,21 @@ type InMemCollector struct { func (i *InMemCollector) Start() error { i.Logger.Debug().Logf("Starting InMemCollector") defer func() { i.Logger.Debug().Logf("Finished starting InMemCollector") }() - imcConfig, err := i.Config.GetCollectionConfig() - if err != nil { - return err - } + imcConfig := i.Config.GetCollectionConfig() i.cache = cache.NewInMemCache(imcConfig.CacheCapacity, i.Metrics, i.Logger) i.StressRelief.UpdateFromConfig(i.Config.GetStressReliefConfig()) // listen for config reloads i.Config.RegisterReloadCallback(i.sendReloadSignal) + i.Health.Register(CollectorHealthKey, 3*time.Second) + i.Metrics.Register("trace_duration_ms", "histogram") i.Metrics.Register("trace_span_count", "histogram") i.Metrics.Register("collector_incoming_queue", "histogram") i.Metrics.Register("collector_peer_queue_length", "gauge") i.Metrics.Register("collector_incoming_queue_length", "gauge") i.Metrics.Register("collector_peer_queue", "histogram") - i.Metrics.Register("stress_level", "gauge") - i.Metrics.Register("stress_relief_activated", "gauge") i.Metrics.Register("collector_cache_size", "gauge") i.Metrics.Register("memory_heap_allocation", "gauge") i.Metrics.Register("span_received", "counter") @@ -103,15 +123,23 @@ func (i *InMemCollector) Start() error { i.Metrics.Register("trace_send_dropped", "counter") i.Metrics.Register("trace_send_has_root", "counter") i.Metrics.Register("trace_send_no_root", "counter") + i.Metrics.Register("trace_forwarded_on_peer_change", "gauge") + i.Metrics.Register("trace_redistribution_count", "gauge") + i.Metrics.Register("trace_send_on_shutdown", "counter") + i.Metrics.Register("trace_forwarded_on_shutdown", "counter") + i.Metrics.Register(TraceSendGotRoot, "counter") i.Metrics.Register(TraceSendExpired, "counter") + i.Metrics.Register(TraceSendSpanLimit, "counter") i.Metrics.Register(TraceSendEjectedFull, "counter") i.Metrics.Register(TraceSendEjectedMemsize, "counter") + i.Metrics.Register(TraceSendLateSpan, "counter") sampleCacheConfig := i.Config.GetSampleCacheConfig() i.Metrics.Register(cache.CurrentCapacity, "gauge") i.Metrics.Register(cache.FutureLoadFactor, "gauge") i.Metrics.Register(cache.CurrentLoadFactor, "gauge") + var err error i.sampleTraceCache, err = cache.NewCuckooSentCache(sampleCacheConfig, i.Metrics) if err != nil { return err @@ -122,7 +150,10 @@ func (i 
*InMemCollector) Start() error {
 	i.Metrics.Store("INCOMING_CAP", float64(cap(i.incoming)))
 	i.Metrics.Store("PEER_CAP", float64(cap(i.fromPeer)))
 	i.reload = make(chan struct{}, 1)
+	i.done = make(chan struct{})
 	i.datasetSamplers = make(map[string]sample.Sampler)
+	i.redistributeTimer = newRedistributeNotifier(i.Logger, i.Metrics, i.Clock)
 
 	if i.Config.GetAddHostMetadataToTrace() {
 		if hostname, err := os.Hostname(); err == nil && hostname != "" {
@@ -130,6 +161,10 @@ func (i *InMemCollector) Start() error {
 		}
 	}
 
+	if !i.Config.GetCollectionConfig().DisableRedistribution {
+		i.Peers.RegisterUpdatedPeersCallback(i.redistributeTimer.Reset)
+	}
+
 	// spin up one collector because this is a single threaded collector
 	go i.collect()
 
@@ -137,7 +172,7 @@
 }
 
 // sendReloadSignal will trigger the collector reloading its config, eventually.
-func (i *InMemCollector) sendReloadSignal() {
+func (i *InMemCollector) sendReloadSignal(cfgHash, ruleHash string) {
 	// non-blocking insert of the signal here so we don't leak goroutines
 	select {
 	case i.reload <- struct{}{}:
@@ -149,33 +184,26 @@
 
 func (i *InMemCollector) reloadConfigs() {
 	i.Logger.Debug().Logf("reloading in-mem collect config")
 
-	imcConfig, err := i.Config.GetCollectionConfig()
-	if err != nil {
-		i.Logger.Error().WithField("error", err).Logf("Failed to reload InMemCollector section when reloading configs")
-	}
-
-	if existingCache, ok := i.cache.(*cache.DefaultInMemCache); ok {
-		if imcConfig.CacheCapacity != existingCache.GetCacheSize() {
-			i.Logger.Debug().WithField("cache_size.previous", existingCache.GetCacheSize()).WithField("cache_size.new", imcConfig.CacheCapacity).Logf("refreshing the cache because it changed size")
-			c := cache.NewInMemCache(imcConfig.CacheCapacity, i.Metrics, i.Logger)
-			// pull the old cache contents into the new cache
-			for j, trace := range existingCache.GetAll() {
-				if j >= imcConfig.CacheCapacity {
-					i.send(trace, TraceSendEjectedFull)
-					continue
-				}
-				c.Set(trace)
+	imcConfig := i.Config.GetCollectionConfig()
+
+	if imcConfig.CacheCapacity != i.cache.GetCacheCapacity() {
+		i.Logger.Debug().WithField("cache_size.previous", i.cache.GetCacheCapacity()).WithField("cache_size.new", imcConfig.CacheCapacity).Logf("refreshing the cache because it changed size")
+		c := cache.NewInMemCache(imcConfig.CacheCapacity, i.Metrics, i.Logger)
+		// pull the old cache contents into the new cache
+		for j, trace := range i.cache.GetAll() {
+			if j >= imcConfig.CacheCapacity {
+				i.send(trace, TraceSendEjectedFull)
+				continue
 			}
-			i.cache = c
-		} else {
-			i.Logger.Debug().Logf("skipping reloading the in-memory cache on config reload because it hasn't changed capacity")
+			c.Set(trace)
 		}
-
-		i.sampleTraceCache.Resize(i.Config.GetSampleCacheConfig())
+		i.cache = c
 	} else {
-		i.Logger.Error().WithField("cache", i.cache.(*cache.DefaultInMemCache)).Logf("skipping reloading the cache on config reload because it's not an in-memory cache")
+		i.Logger.Debug().Logf("skipping reloading the in-memory cache on config reload because it hasn't changed capacity")
 	}
+
+	i.sampleTraceCache.Resize(i.Config.GetSampleCacheConfig())
+
 	i.StressRelief.UpdateFromConfig(i.Config.GetStressReliefConfig())
 
 	// clear out any samplers that we have previously created
@@ -185,14 +213,14 @@
 }
 
 func (i *InMemCollector) checkAlloc() {
-	inMemConfig, err := i.Config.GetCollectionConfig()
+	inMemConfig := 
i.Config.GetCollectionConfig() maxAlloc := inMemConfig.GetMaxAlloc() i.Metrics.Store("MEMORY_MAX_ALLOC", float64(maxAlloc)) var mem runtime.MemStats runtime.ReadMemStats(&mem) i.Metrics.Gauge("memory_heap_allocation", int64(mem.Alloc)) - if err != nil || maxAlloc == 0 || mem.Alloc < uint64(maxAlloc) { + if maxAlloc == 0 || mem.Alloc < uint64(maxAlloc) { return } @@ -206,16 +234,9 @@ func (i *InMemCollector) checkAlloc() { // remove the traces from the cache that have had the most impact on allocation. // To do this, we sort the traces by their CacheImpact value and then remove traces // until the total size is less than the amount to which we want to shrink. - existingCache, ok := i.cache.(*cache.DefaultInMemCache) - if !ok { - i.Logger.Error().WithField("alloc", mem.Alloc).Logf( - "total allocation exceeds limit, but unable to control cache", - ) - return - } - allTraces := existingCache.GetAll() - timeout, err := i.Config.GetTraceTimeout() - if err != nil { + allTraces := i.cache.GetAll() + timeout := i.Config.GetTracesConfig().GetTraceTimeout() + if timeout == 0 { timeout = 60 * time.Second } // Sort traces by CacheImpact, heaviest first sort.Slice(allTraces, func(i, j int) bool { @@ -226,21 +247,21 @@ func (i *InMemCollector) checkAlloc() { // successive traces until we've crossed the totalToRemove threshold // or just run out of traces to delete. - cap := existingCache.GetCacheSize() + cap := i.cache.GetCacheCapacity() i.Metrics.Gauge("collector_cache_size", cap) totalDataSizeSent := 0 - tracesSent := make(map[string]struct{}) + tracesSent := generics.NewSet[string]() // Send the traces we can't keep. for _, trace := range allTraces { - tracesSent[trace.TraceID] = struct{}{} + tracesSent.Add(trace.TraceID) totalDataSizeSent += trace.DataSize i.send(trace, TraceSendEjectedMemsize) if totalDataSizeSent > int(totalToRemove) { break } } - existingCache.RemoveTraces(tracesSent) + i.cache.RemoveTraces(tracesSent) // Treat any MaxAlloc overage as an error so we know it's happening i.Logger.Error(). @@ -248,7 +269,7 @@ func (i *InMemCollector) checkAlloc() { WithField("alloc", mem.Alloc). WithField("num_traces_sent", len(tracesSent)). WithField("datasize_sent", totalDataSizeSent). - WithField("new_trace_count", existingCache.GetCacheSize()). + WithField("new_trace_count", i.cache.GetCacheCapacity()). Logf("evicting large traces early due to memory overage") // Manually GC here - without this we can easily end up evicting more than we @@ -299,7 +320,7 @@ func (i *InMemCollector) add(sp *types.Span, ch chan<- *types.Span) error { // block is the only place we are allowed to modify any running data // structures. 
func (i *InMemCollector) collect() { - tickerDuration := i.Config.GetSendTickerValue() + tickerDuration := i.Config.GetTracesConfig().GetSendTickerValue() ticker := time.NewTicker(tickerDuration) defer ticker.Stop() @@ -309,23 +330,22 @@ func (i *InMemCollector) collect() { defer i.mutex.Unlock() for { + i.Health.Ready(CollectorHealthKey, true) // record channel lengths as histogram but also as gauges i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming))) i.Metrics.Histogram("collector_peer_queue", float64(len(i.fromPeer))) i.Metrics.Gauge("collector_incoming_queue_length", float64(len(i.incoming))) i.Metrics.Gauge("collector_peer_queue_length", float64(len(i.fromPeer))) - i.Metrics.Gauge("stress_level", float64(i.StressRelief.StressLevel())) - if i.StressRelief.Stressed() { - i.Metrics.Gauge("stress_relief_activated", 1) - } else { - i.Metrics.Gauge("stress_relief_activated", 0) - } // Always drain peer channel before doing anything else. By processing peer // traffic preferentially we avoid the situation where the cluster essentially // deadlocks because peers are waiting to get their events handed off to each // other. select { + case <-i.done: + return + case <-i.redistributeTimer.Notify(): + i.redistributeTraces() case sp, ok := <-i.fromPeer: if !ok { // channel's been closed; we should shut down. @@ -334,14 +354,22 @@ func (i *InMemCollector) collect() { i.processSpan(sp) default: select { + case <-i.done: + return case <-ticker.C: - i.sendTracesInCache(time.Now()) - i.checkAlloc() - - // Briefly unlock the cache, to allow test access. - i.mutex.Unlock() - runtime.Gosched() - i.mutex.Lock() + select { + case <-i.done: + default: + i.sendExpiredTracesInCache(i.Clock.Now()) + i.checkAlloc() + + // Briefly unlock the cache, to allow test access. + i.mutex.Unlock() + runtime.Gosched() + i.mutex.Lock() + } + case <-i.redistributeTimer.Notify(): + i.redistributeTraces() case sp, ok := <-i.incoming: if !ok { // channel's been closed; we should shut down. 
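The nested select in collect() above is what gives peer traffic strict priority, per the deadlock-avoidance comment: the outer select tries the peer channel with a default, and only an idle peer channel falls through to the general case. Distilled into a self-contained sketch (channel names and payloads are illustrative):

package main

import (
	"fmt"
	"time"
)

// drainWithPriority always tries the peer channel first; only when it has
// nothing ready does it fall through and accept work from either source.
func drainWithPriority(peer, incoming <-chan int, done <-chan struct{}) {
	for {
		select {
		case <-done:
			return
		case v := <-peer:
			fmt.Println("peer:", v)
		default:
			select {
			case <-done:
				return
			case v := <-peer:
				fmt.Println("peer:", v)
			case v := <-incoming:
				fmt.Println("incoming:", v)
			}
		}
	}
}

func main() {
	peer := make(chan int, 1)
	incoming := make(chan int, 1)
	done := make(chan struct{})
	peer <- 1
	incoming <- 2
	go func() {
		time.Sleep(50 * time.Millisecond)
		close(done)
	}()
	drainWithPriority(peer, incoming, done) // prints peer: 1, then incoming: 2
}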
@@ -363,13 +391,76 @@ func (i *InMemCollector) collect() {
 	}
 }
 
-func (i *InMemCollector) sendTracesInCache(now time.Time) {
+func (i *InMemCollector) redistributeTraces() {
+	_, span := otelutil.StartSpan(context.Background(), i.Tracer, "redistributeTraces")
+	defer span.End()
+	// loop through everything in the cache of live traces;
+	// if it doesn't belong to this peer, we should forward it to the correct peer
+	peers, err := i.Peers.GetPeers()
+	if err != nil {
+		i.Logger.Error().Logf("unable to get peer list with error %s", err.Error())
+		return
+	}
+	numOfPeers := len(peers)
+	if numOfPeers == 0 {
+		return
+	}
+
+	traces := i.cache.GetAll()
+	forwardedTraces := generics.NewSetWithCapacity[string](len(traces) / numOfPeers)
+	for _, trace := range traces {
+		if trace == nil {
+			continue
+		}
+
+		newTarget := i.Sharder.WhichShard(trace.TraceID)
+
+		if newTarget.Equals(i.Sharder.MyShard()) {
+			continue
+		}
+
+		for _, sp := range trace.GetSpans() {
+			sp.APIHost = newTarget.GetAddress()
+
+			if sp.Data == nil {
+				sp.Data = make(map[string]interface{})
+			}
+			if v, ok := sp.Data["meta.refinery.forwarded"]; ok {
+				sp.Data["meta.refinery.forwarded"] = fmt.Sprintf("%s,%s", v, i.hostname)
+			} else {
+				sp.Data["meta.refinery.forwarded"] = i.hostname
+			}
+
+			i.Transmission.EnqueueSpan(sp)
+		}
+
+		forwardedTraces.Add(trace.TraceID)
+	}
+
+	otelutil.AddSpanFields(span, map[string]interface{}{
+		"forwarded_trace_count": len(forwardedTraces.Members()),
+		"total_trace_count":     len(traces),
+		"hostname":              i.hostname,
+	})
+
+	i.Metrics.Gauge("trace_forwarded_on_peer_change", len(forwardedTraces))
+	if len(forwardedTraces) > 0 {
+		i.cache.RemoveTraces(forwardedTraces)
+	}
+}
+
+func (i *InMemCollector) sendExpiredTracesInCache(now time.Time) {
 	traces := i.cache.TakeExpiredTraces(now)
+	spanLimit := uint32(i.Config.GetTracesConfig().SpanLimit)
 	for _, t := range traces {
 		if t.RootSpan != nil {
 			i.send(t, TraceSendGotRoot)
 		} else {
-			i.send(t, TraceSendExpired)
+			if spanLimit > 0 && t.DescendantCount() > spanLimit {
+				i.send(t, TraceSendSpanLimit)
+			} else {
+				i.send(t, TraceSendExpired)
+			}
 		}
 	}
 }
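sendExpiredTracesInCache above now chooses between three send reasons instead of two. The branch order, reduced to a runnable sketch; the constant values match the ones defined earlier in collect.go, but the helper itself is illustrative:

package main

import "fmt"

const (
	sendGotRoot   = "trace_send_got_root"
	sendExpired   = "trace_send_expired"
	sendSpanLimit = "trace_send_span_limit"
)

// sendReason mirrors the branch order above: an arrived root wins, then the
// SpanLimit check (only when configured), with expiry as the fallback.
func sendReason(hasRoot bool, descendants, spanLimit uint32) string {
	if hasRoot {
		return sendGotRoot
	}
	if spanLimit > 0 && descendants > spanLimit {
		return sendSpanLimit
	}
	return sendExpired
}

func main() {
	fmt.Println(sendReason(true, 10, 0))     // trace_send_got_root
	fmt.Println(sendReason(false, 900, 0))   // trace_send_expired
	fmt.Println(sendReason(false, 900, 500)) // trace_send_span_limit
}

@@ -377,32 +468,35 @@
 // processSpan does all the stuff necessary to take an incoming span and add it
 // to (or create a new placeholder for) a trace.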
func (i *InMemCollector) processSpan(sp *types.Span) { + ctx := context.Background() defer func() { i.Metrics.Increment("span_processed") i.Metrics.Down("spans_waiting") }() + tcfg := i.Config.GetTracesConfig() + trace := i.cache.Get(sp.TraceID) if trace == nil { // if the trace has already been sent, just pass along the span - if sr, found := i.sampleTraceCache.Check(sp); found { + if sr, sentReason, found := i.sampleTraceCache.CheckSpan(sp); found { i.Metrics.Increment("trace_sent_cache_hit") // bump the count of records on this trace -- if the root span isn't // the last late span, then it won't be perfect, but it will be better than // having none at all - i.dealWithSentTrace(sr, sp) + i.dealWithSentTrace(ctx, sr, sentReason, sp) return } // trace hasn't already been sent (or this span is really old); let's // create a new trace to hold it i.Metrics.Increment("trace_accepted") - timeout, err := i.Config.GetTraceTimeout() - if err != nil { + timeout := tcfg.GetTraceTimeout() + if timeout == 0 { timeout = 60 * time.Second } - now := time.Now() + now := i.Clock.Now() trace = &types.Trace{ APIHost: sp.APIHost, APIKey: sp.APIKey, @@ -410,8 +504,8 @@ func (i *InMemCollector) processSpan(sp *types.Span) { TraceID: sp.TraceID, ArrivalTime: now, SendBy: now.Add(timeout), - SampleRate: sp.SampleRate, // if it had a sample rate, we want to keep it } + trace.SetSampleRate(sp.SampleRate) // if it had a sample rate, we want to keep it // push this into the cache and if we eject an unsent trace, send it ASAP ejectedTrace := i.cache.Set(trace) if ejectedTrace != nil { @@ -421,22 +515,41 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // if the trace we got back from the cache has already been sent, deal with the // span. if trace.Sent { - i.dealWithSentTrace(cache.NewKeptTraceCacheEntry(trace), sp) + if sr, reason, found := i.sampleTraceCache.CheckSpan(sp); found { + i.Metrics.Increment("trace_sent_cache_hit") + i.dealWithSentTrace(ctx, sr, reason, sp) + return + } + // trace has already been sent, but this is not in the sent cache. + // we will just use the default late span reason as the sent reason which is + // set inside the dealWithSentTrace function + i.dealWithSentTrace(ctx, cache.NewKeptTraceCacheEntry(trace), "", sp) } // great! trace is live. add the span. trace.AddSpan(sp) - // if this is a root span, send the trace + // we may override these values in conditions below + var markTraceForSending bool + timeout := tcfg.GetSendDelay() + if timeout == 0 { + timeout = 2 * time.Second // a sensible default + } + + // if this is a root span, say so and send the trace if i.isRootSpan(sp) { - timeout, err := i.Config.GetSendDelay() + markTraceForSending = true + trace.RootSpan = sp + } - if err != nil { - timeout = 2 * time.Second - } + // if the span count has exceeded our SpanLimit, send the trace immediately + if tcfg.SpanLimit > 0 && uint(trace.DescendantCount()) > tcfg.SpanLimit { + markTraceForSending = true + timeout = 0 // don't use a timeout in this case; this is an "act fast" situation + } - trace.SendBy = time.Now().Add(timeout) - trace.RootSpan = sp + if markTraceForSending { + trace.SendBy = i.Clock.Now().Add(timeout) } } @@ -450,54 +563,97 @@ func (i *InMemCollector) processSpan(sp *types.Span) { // cache as "kept". // It doesn't do any logging and barely touches metrics; this is about as // minimal as we can make it. 
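The doc comment above describes the reworked ProcessSpanImmediately whose diff follows. Its control flow, reduced to a runnable sketch; every name here is a stand-in, not the collector's real types:

package main

import "fmt"

type decision struct {
	keep bool
	rate uint
}

// processImmediately mirrors the flow below: not deterministically ours means
// (false, false); otherwise reuse the cached decision, or make and record one.
func processImmediately(traceID string, ours func(string) bool,
	cached map[string]decision, sample func(string) decision) (processed, keep bool) {
	if !ours(traceID) {
		return false, false
	}
	d, ok := cached[traceID]
	if !ok {
		d = sample(traceID)
		cached[traceID] = d // later spans of this trace get the same answer
	}
	return true, d.keep
}

func main() {
	cached := map[string]decision{}
	ours := func(id string) bool { return id == "t1" }
	sample := func(id string) decision { return decision{keep: true, rate: 10} }
	fmt.Println(processImmediately("t1", ours, cached, sample)) // true true
	fmt.Println(processImmediately("t2", ours, cached, sample)) // false false
}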
-func (i *InMemCollector) ProcessSpanImmediately(sp *types.Span, keep bool, sampleRate uint, reason string) { - now := time.Now() - trace := &types.Trace{ - APIHost: sp.APIHost, - APIKey: sp.APIKey, - Dataset: sp.Dataset, - TraceID: sp.TraceID, - ArrivalTime: now, - SendBy: now, - } - // we do want a record of how we disposed of traces in case more come in after we've - // turned off stress relief (if stress relief is on we'll keep making the same decisions) - i.sampleTraceCache.Record(trace, keep) +func (i *InMemCollector) ProcessSpanImmediately(sp *types.Span) (processed bool, keep bool) { + _, span := otelutil.StartSpanWith(context.Background(), i.Tracer, "collector.ProcessSpanImmediately", "trace_id", sp.TraceID) + defer span.End() + + if !i.StressRelief.ShouldSampleDeterministically(sp.TraceID) { + otelutil.AddSpanField(span, "nondeterministic", 1) + return false, false + } + + var rate uint + record, reason, found := i.sampleTraceCache.CheckSpan(sp) + if !found { + rate, keep, reason = i.StressRelief.GetSampleRate(sp.TraceID) + now := i.Clock.Now() + trace := &types.Trace{ + APIHost: sp.APIHost, + APIKey: sp.APIKey, + Dataset: sp.Dataset, + TraceID: sp.TraceID, + ArrivalTime: now, + SendBy: now, + } + trace.SetSampleRate(rate) + // we do want a record of how we disposed of traces in case more come in after we've + // turned off stress relief (if stress relief is on we'll keep making the same decisions) + i.sampleTraceCache.Record(trace, keep, reason) + } else { + rate = record.Rate() + keep = record.Kept() + } + if !keep { i.Metrics.Increment("dropped_from_stress") - return + return true, false } + + i.Metrics.Increment("kept_from_stress") // ok, we're sending it, so decorate it first - sp.Event.Data["meta.stressed"] = true + sp.Data["meta.stressed"] = true if i.Config.GetAddRuleReasonToTrace() { - sp.Event.Data["meta.refinery.reason"] = reason + sp.Data["meta.refinery.reason"] = reason } if i.hostname != "" { sp.Data["meta.refinery.local_hostname"] = i.hostname } + i.addAdditionalAttributes(sp) - mergeTraceAndSpanSampleRates(sp, sampleRate, i.Config.GetIsDryRun()) + mergeTraceAndSpanSampleRates(sp, rate, i.Config.GetIsDryRun()) i.Transmission.EnqueueSpan(sp) + + return true, true } // dealWithSentTrace handles a span that has arrived after the sampling decision // on the trace has already been made, and it obeys that decision by either // sending the span immediately or dropping it. 
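dealWithSentTrace, whose diff follows, decorates a late span with the reason recorded when the trace decision was made. Just that decoration step, isolated; the field names and strings are the ones used in this diff, while the helper is illustrative:

package main

import "fmt"

// lateSpanReason mirrors how the recorded reason is combined with the
// late-arrival marker before the span is enqueued.
func lateSpanReason(sentReason string) string {
	if len(sentReason) > 0 {
		return fmt.Sprintf("%s - late arriving span", sentReason)
	}
	return "late arriving span"
}

func main() {
	data := map[string]interface{}{
		"meta.refinery.reason":      lateSpanReason("rules/keep: error = true"),
		"meta.refinery.send_reason": "trace_send_late_span",
	}
	fmt.Println(data)
}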
-func (i *InMemCollector) dealWithSentTrace(tr cache.TraceSentRecord, sp *types.Span) { +func (i *InMemCollector) dealWithSentTrace(ctx context.Context, tr cache.TraceSentRecord, sentReason string, sp *types.Span) { + _, span := otelutil.StartSpanMulti(ctx, i.Tracer, "dealWithSentTrace", map[string]interface{}{ + "trace_id": sp.TraceID, + "sent_reason": sentReason, + "hostname": i.hostname, + }) + defer span.End() + if i.Config.GetAddRuleReasonToTrace() { - sp.Data["meta.refinery.reason"] = "late" + var metaReason string + if len(sentReason) > 0 { + metaReason = fmt.Sprintf("%s - late arriving span", sentReason) + } else { + metaReason = "late arriving span" + } + sp.Data["meta.refinery.reason"] = metaReason + sp.Data["meta.refinery.send_reason"] = TraceSendLateSpan + } if i.hostname != "" { sp.Data["meta.refinery.local_hostname"] = i.hostname } isDryRun := i.Config.GetIsDryRun() keep := tr.Kept() + otelutil.AddSpanFields(span, map[string]interface{}{ + "keep": keep, + "is_dryrun": isDryRun, + }) if isDryRun { // if dry run mode is enabled, we keep all traces and mark the spans with the sampling decision sp.Data[config.DryRunFieldName] = keep if !keep { i.Logger.Debug().WithField("trace_id", sp.TraceID).Logf("Sending span that would have been dropped, but dry run mode is enabled") + i.Metrics.Increment(TraceSendLateSpan) i.addAdditionalAttributes(sp) i.Transmission.EnqueueSpan(sp) return @@ -507,7 +663,8 @@ func (i *InMemCollector) dealWithSentTrace(tr cache.TraceSentRecord, sp *types.S i.Logger.Debug().WithField("trace_id", sp.TraceID).Logf("Sending span because of previous decision to send trace") mergeTraceAndSpanSampleRates(sp, tr.Rate(), isDryRun) // if this span is a late root span, possibly update it with our current span count - if i.isRootSpan(sp) { + isRootSpan := i.isRootSpan(sp) + if isRootSpan { if i.Config.GetAddCountsToRoot() { sp.Data["meta.span_event_count"] = int64(tr.SpanEventCount()) sp.Data["meta.span_link_count"] = int64(tr.SpanLinkCount()) @@ -516,8 +673,9 @@ func (i *InMemCollector) dealWithSentTrace(tr cache.TraceSentRecord, sp *types.S } else if i.Config.GetAddSpanCountToRoot() { sp.Data["meta.span_count"] = int64(tr.DescendantCount()) } - } + otelutil.AddSpanField(span, "is_root_span", isRootSpan) + i.Metrics.Increment(TraceSendLateSpan) i.addAdditionalAttributes(sp) i.Transmission.EnqueueSpan(sp) return @@ -553,9 +711,14 @@ func mergeTraceAndSpanSampleRates(sp *types.Span, traceSampleRate uint, dryRunMo } func (i *InMemCollector) isRootSpan(sp *types.Span) bool { + // log event should never be considered a root span, check for that first + if signalType := sp.Data["meta.signal_type"]; signalType == "log" { + return false + } + // check if the event has a parent id using the configured parent id field names for _, parentIdFieldName := range i.Config.GetParentIdFieldNames() { parentId := sp.Data[parentIdFieldName] - if _, ok := parentId.(string); ok { + if _, ok := parentId.(string); ok && parentId != "" { return false } } @@ -573,7 +736,7 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { } trace.Sent = true - traceDur := time.Since(trace.ArrivalTime) + traceDur := i.Clock.Since(trace.ArrivalTime) i.Metrics.Histogram("trace_duration_ms", float64(traceDur.Milliseconds())) i.Metrics.Histogram("trace_span_count", float64(trace.DescendantCount())) if trace.RootSpan != nil { @@ -619,7 +782,7 @@ func (i *InMemCollector) send(trace *types.Trace, sendReason string) { // make sampling decision and update the trace rate, shouldSend, reason, key := 
sampler.GetSampleRate(trace)
-	trace.SampleRate = rate
+	trace.SetSampleRate(rate)
 	trace.KeepSample = shouldSend
 	logFields["reason"] = reason
 	if key != "" {
@@ -628,7 +791,7 @@
 	// This will observe sample rate attempts even if the trace is dropped
 	i.Metrics.Histogram("trace_aggregate_sample_rate", float64(rate))
 
-	i.sampleTraceCache.Record(trace, shouldSend)
+	i.sampleTraceCache.Record(trace, shouldSend, reason)
 
 	// if we're supposed to drop this trace, and dry run mode is not enabled, then we're done.
 	if !shouldSend && !i.Config.GetIsDryRun() {
@@ -674,42 +837,218 @@
 		if i.hostname != "" {
 			sp.Data["meta.refinery.local_hostname"] = i.hostname
 		}
-		mergeTraceAndSpanSampleRates(sp, trace.SampleRate, isDryRun)
+		mergeTraceAndSpanSampleRates(sp, trace.SampleRate(), isDryRun)
 		i.addAdditionalAttributes(sp)
 		i.Transmission.EnqueueSpan(sp)
 	}
 }
 
 func (i *InMemCollector) Stop() error {
-	// close the incoming channel and (TODO) wait for all collectors to finish
-	close(i.incoming)
+	i.redistributeTimer.Stop()
+	close(i.done)
+	// signal the health system to not be ready
+	// so that no new traces are accepted
+	i.Health.Ready(CollectorHealthKey, false)
 
 	i.mutex.Lock()
-	defer i.mutex.Unlock()
 
-	// purge the collector of any in-flight traces
-	if i.cache != nil {
-		traces := i.cache.GetAll()
-		for _, trace := range traces {
-			if trace != nil {
-				i.send(trace, TraceSendEjectedFull)
-			}
+	if !i.Config.GetCollectionConfig().DisableRedistribution {
+		peers, err := i.Peers.GetPeers()
+		if err != nil {
+			i.Logger.Error().Logf("unable to get peer list with error %s", err.Error())
+		}
+		if len(peers) > 0 {
+			i.sendTracesOnShutdown()
 		}
 	}
+
 	if i.Transmission != nil {
 		i.Transmission.Flush()
 	}
 
 	i.sampleTraceCache.Stop()
+	i.mutex.Unlock()
+
+	close(i.incoming)
+	close(i.fromPeer)
 
 	return nil
 }
+
+// sentRecord is a struct that holds a span and the record of the trace decision made.
+type sentRecord struct {
+	span   *types.Span
+	record cache.TraceSentRecord
+	reason string
+}
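The shutdown path below fans spans in from the live cache and both queues, then splits them by whether a sampling decision already exists. A self-contained sketch of that shape, with strings standing in for spans, a map for the decision cache, and a deadline standing in for ShutdownDelay (all names illustrative):

package main

import (
	"context"
	"fmt"
	"time"
)

// route mirrors distributeSpansOnShutdown: anything with a recorded decision
// is sent normally; everything else is forwarded to the trace's new owner.
func route(decisions map[string]bool, sentCh, forwardCh chan<- string, spans ...string) {
	for _, sp := range spans {
		if _, ok := decisions[sp]; ok {
			sentCh <- sp
			continue
		}
		forwardCh <- sp
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	sentCh := make(chan string, 8)
	forwardCh := make(chan string, 8)
	route(map[string]bool{"trace-a": true}, sentCh, forwardCh, "trace-a", "trace-b")
	close(sentCh)
	close(forwardCh)

	// Consumer side, bounded by the deadline as in sendSpansOnShutdown below.
	for sentCh != nil || forwardCh != nil {
		select {
		case <-ctx.Done():
			fmt.Println("timed out waiting for traces to send")
			return
		case sp, ok := <-sentCh:
			if !ok {
				sentCh = nil // a nil channel is never chosen by select
				break
			}
			fmt.Println("send with recorded decision:", sp)
		case sp, ok := <-forwardCh:
			if !ok {
				forwardCh = nil
				break
			}
			fmt.Println("forward to new owner:", sp)
		}
	}
}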
+
+// sendTracesOnShutdown sends all traces in the cache to their final destination.
+// This is done on shutdown to ensure that all traces are sent before the collector
+// is stopped.
+// It does this by pulling spans out of both the incoming queue and the peer queue so that
+// any spans that are still in the queues when the collector is stopped are also sent.
+// It also pulls traces out of the cache and sends them to their final destination.
+func (i *InMemCollector) sendTracesOnShutdown() {
+	wg := &sync.WaitGroup{}
+	sentChan := make(chan sentRecord, len(i.incoming))
+	forwardChan := make(chan *types.Span, i.Config.GetCollectionConfig().CacheCapacity)
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(i.Config.GetCollectionConfig().ShutdownDelay))
+	defer cancel()
+
+	// start a goroutine that will pull spans off of the channels passed in
+	// and send them to their final destination
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		i.sendSpansOnShutdown(ctx, sentChan, forwardChan)
+	}()
+
+	// start a goroutine that will pull spans off of the incoming queue
+	// and place them on the sentChan or forwardChan
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case sp, ok := <-i.incoming:
+				if !ok {
+					return
+				}
+
+				i.distributeSpansOnShutdown(sentChan, forwardChan, sp)
+			}
+		}
+	}()
+
+	// start a goroutine that will pull spans off of the peer queue
+	// and place them on the sentChan or forwardChan
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case sp, ok := <-i.fromPeer:
+				if !ok {
+					return
+				}
+
+				i.distributeSpansOnShutdown(sentChan, forwardChan, sp)
+			}
+		}
+	}()
+
+	// pull traces from the trace cache and place them on the sentChan or forwardChan
+	if i.cache != nil {
+		traces := i.cache.GetAll()
+		for _, trace := range traces {
+			i.distributeSpansOnShutdown(sentChan, forwardChan, trace.GetSpans()...)
+		}
+	}
+
+	wg.Wait()
+
+	close(sentChan)
+	close(forwardChan)
+}
+
+// distributeSpansOnShutdown takes a list of spans and sends them to the appropriate channel based on the state of the trace.
+func (i *InMemCollector) distributeSpansOnShutdown(sentSpanChan chan sentRecord, forwardSpanChan chan *types.Span, spans ...*types.Span) {
+	for _, sp := range spans {
+		if sp != nil {
+
+			// first check if there's a trace decision
+			record, reason, found := i.sampleTraceCache.CheckSpan(sp)
+			if found {
+				sentSpanChan <- sentRecord{sp, record, reason}
+				continue
+			}
+
+			// if there's no trace decision, then we need to forward the trace to its new home
+			forwardSpanChan <- sp
+		}
+	}
+}
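Both this shutdown path and redistributeTraces stamp a comma-separated chain of forwarding hosts onto spans they hand off, so downstream peers can see every hop. Just that decoration step, isolated into a sketch; the field name is the one used in this diff, the helper is illustrative:

package main

import "fmt"

// markForwarded appends hostname to the span's forwarding breadcrumb,
// creating the data map if the span arrived without one.
func markForwarded(data map[string]interface{}, hostname string) map[string]interface{} {
	if data == nil {
		data = make(map[string]interface{})
	}
	if v, ok := data["meta.refinery.forwarded"]; ok {
		data["meta.refinery.forwarded"] = fmt.Sprintf("%s,%s", v, hostname)
	} else {
		data["meta.refinery.forwarded"] = hostname
	}
	return data
}

func main() {
	data := markForwarded(nil, "refinery-a")
	data = markForwarded(data, "refinery-b")
	fmt.Println(data["meta.refinery.forwarded"]) // refinery-a,refinery-b
}

+
+// sendSpansOnShutdown is a helper function that sends spans to their final destination
+// on shutdown.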
+func (i *InMemCollector) sendSpansOnShutdown(ctx context.Context, sentSpanChan <-chan sentRecord, forwardSpanChan <-chan *types.Span) { + sentTraces := make(map[string]struct{}) + forwardedTraces := make(map[string]struct{}) + + for { + select { + case <-ctx.Done(): + i.Logger.Info().Logf("Timed out waiting for traces to send") + return + + case r, ok := <-sentSpanChan: + if !ok { + return + } + + ctx, span := otelutil.StartSpanMulti(ctx, i.Tracer, "shutdown_sent_span", map[string]interface{}{"trace_id": r.span.TraceID, "hostname": i.hostname}) + r.span.Data["meta.refinery.shutdown.send"] = true + + i.dealWithSentTrace(ctx, r.record, r.reason, r.span) + _, exist := sentTraces[r.span.TraceID] + if !exist { + sentTraces[r.span.TraceID] = struct{}{} + i.Metrics.Count("trace_send_on_shutdown", 1) + + } + + span.End() + + case sp, ok := <-forwardSpanChan: + if !ok { + return + } + + _, span := otelutil.StartSpanMulti(ctx, i.Tracer, "shutdown_forwarded_span", map[string]interface{}{"trace_id": sp.TraceID, "hostname": i.hostname}) + + targetShard := i.Sharder.WhichShard(sp.TraceID) + url := targetShard.GetAddress() + + otelutil.AddSpanField(span, "target_shard", url) + + // TODO: we need to decorate the expired traces before forwarding them so that + // the downstream consumers can make decisions based on the metadata without having + // to restart the TraceTimeout or SendDelay + sp.APIHost = url + + if sp.Data == nil { + sp.Data = make(map[string]interface{}) + } + if v, ok := sp.Data["meta.refinery.forwarded"]; ok { + sp.Data["meta.refinery.forwarded"] = fmt.Sprintf("%s,%s", v, i.hostname) + } else { + sp.Data["meta.refinery.forwarded"] = i.hostname + } + + i.Transmission.EnqueueSpan(sp) + _, exist := forwardedTraces[sp.TraceID] + if !exist { + forwardedTraces[sp.TraceID] = struct{}{} + i.Metrics.Count("trace_forwarded_on_shutdown", 1) + + } + + span.End() + } + + } +} + // Convenience method for tests. func (i *InMemCollector) getFromCache(traceID string) *types.Trace { - i.mutex.RLock() - defer i.mutex.RUnlock() - + i.mutex.Lock() + defer i.mutex.Unlock() return i.cache.Get(traceID) } @@ -718,3 +1057,105 @@ func (i *InMemCollector) addAdditionalAttributes(sp *types.Span) { sp.Data[k] = v } } + +func newRedistributeNotifier(logger logger.Logger, metrics metrics.Metrics, clock clockwork.Clock) *redistributeNotifier { + r := &redistributeNotifier{ + initialDelay: 3 * time.Second, + maxDelay: 30 * time.Second, + maxAttempts: 5, + done: make(chan struct{}), + clock: clock, + logger: logger, + metrics: metrics, + triggered: make(chan struct{}), + reset: make(chan struct{}), + } + r.metrics.Register("trace_redistribution_count", "gauge") + + return r +} + +type redistributeNotifier struct { + clock clockwork.Clock + logger logger.Logger + initialDelay time.Duration + maxAttempts int + maxDelay time.Duration + metrics metrics.Metrics + + reset chan struct{} + done chan struct{} + triggered chan struct{} + once sync.Once +} + +func (r *redistributeNotifier) Notify() <-chan struct{} { + return r.triggered +} + +func (r *redistributeNotifier) Reset() { + var started bool + r.once.Do(func() { + go r.run() + started = true + }) + + if started { + return + } + + select { + case r.reset <- struct{}{}: + case <-r.done: + return + default: + r.logger.Debug().Logf("A trace redistribution is ongoing. 
Ignoring reset.") + } +} + +func (r *redistributeNotifier) Stop() { + close(r.done) +} + +func (r *redistributeNotifier) run() { + var attempts int + lastBackoff := r.initialDelay + for { + // if we've reached the max attempts, reset the backoff and attempts + // only when the reset signal is received. + if attempts >= r.maxAttempts { + r.metrics.Gauge("trace_redistribution_count", 0) + <-r.reset + lastBackoff = r.initialDelay + attempts = 0 + } + select { + case <-r.done: + return + case r.triggered <- struct{}{}: + } + + attempts++ + r.metrics.Gauge("trace_redistribution_count", attempts) + + // Calculate the backoff interval using exponential backoff with a base time. + backoff := time.Duration(math.Min(float64(lastBackoff)*2, float64(r.maxDelay))) + // Add jitter to the backoff to avoid retry collisions. + jitter := time.Duration(rand.Float64() * float64(backoff) * 0.5) + nextBackoff := backoff + jitter + lastBackoff = nextBackoff + + timer := r.clock.NewTimer(nextBackoff) + select { + case <-timer.Chan(): + timer.Stop() + case <-r.reset: + lastBackoff = r.initialDelay + attempts = 0 + timer.Stop() + case <-r.done: + timer.Stop() + return + } + } +} diff --git a/collect/collect_test.go b/collect/collect_test.go index 31a1c36029..fac99fa6c0 100644 --- a/collect/collect_test.go +++ b/collect/collect_test.go @@ -1,6 +1,7 @@ package collect import ( + "context" "fmt" "math/rand" "runtime" @@ -10,14 +11,19 @@ import ( "time" "github.com/facebookgo/inject" + "github.com/jonboulle/clockwork" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" "github.com/honeycombio/refinery/collect/cache" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/sample" + "github.com/honeycombio/refinery/sharder" "github.com/honeycombio/refinery/transmit" "github.com/honeycombio/refinery/types" ) @@ -34,29 +40,61 @@ func newCache() (cache.TraceSentCache, error) { return cache.NewCuckooSentCache(cfg, &metrics.NullMetrics{}) } -// TestAddRootSpan tests that adding a root span winds up with a trace object in -// the cache and that that trace gets sent -func TestAddRootSpan(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() - conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, - GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, - ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, +func newTestCollector(conf config.Config, transmission transmit.Transmission) *InMemCollector { + s := &metrics.MockMetrics{} + s.Start() + clock := clockwork.NewRealClock() + healthReporter := &health.Health{ + Clock: clock, } - coll := &InMemCollector{ + healthReporter.Start() + + return &InMemCollector{ Config: conf, + Clock: clock, Logger: &logger.NullLogger{}, + Tracer: noop.NewTracerProvider().Tracer("test"), + Health: healthReporter, Transmission: transmission, Metrics: &metrics.NullMetrics{}, StressRelief: &MockStressReliever{}, SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + Config: conf, + Metrics: s, + Logger: &logger.NullLogger{}, + }, + done: make(chan struct{}), + Peers: 
&peer.MockPeers{
+			Peers: []string{"api1", "api2"},
+		},
+		Sharder: &sharder.MockSharder{
+			Self: &sharder.TestShard{
+				Addr: "api1",
+			},
+		},
+		redistributeTimer: newRedistributeNotifier(&logger.NullLogger{}, &metrics.NullMetrics{}, clock),
+	}
+}
+
+// TestAddRootSpan tests that adding a root span winds up with a trace object in
+// the cache and that that trace gets sent
+func TestAddRootSpan(t *testing.T) {
+	conf := &config.MockConfig{
+		GetTracesConfigVal: config.TracesConfig{
+			SendTicker:   config.Duration(2 * time.Millisecond),
+			SendDelay:    config.Duration(1 * time.Millisecond),
+			TraceTimeout: config.Duration(60 * time.Second),
+			MaxBatchSize: 500,
+		},
+		GetSamplerTypeVal:  &config.DeterministicSamplerConfig{SampleRate: 1},
+		ParentIdFieldNames: []string{"trace.parent_id", "parentId"},
+		GetCollectionConfigVal: config.CollectionConfig{
+			ShutdownDelay: config.Duration(1 * time.Millisecond),
+		},
+	}
+	transmission := &transmit.MockTransmission{}
+	transmission.Start()
+	coll := newTestCollector(conf, transmission)
 	c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{})
 	coll.cache = c
@@ -81,12 +119,12 @@
 		},
 	}
 	coll.AddSpan(span)
+	time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2)
 
-	time.Sleep(conf.SendTickerVal * 2)
 	// adding one span with no parent ID should:
 	// * create the trace in the cache
 	// * send the trace
 	// * remove the trace from the cache
 	assert.Nil(t, coll.getFromCache(traceID1), "after sending the span, it should be removed from the cache")
 	transmission.Mux.RLock()
 	assert.Equal(t, 1, len(transmission.Events), "adding a root span should send the span")
@@ -101,7 +140,7 @@
 		},
 	}
 	coll.AddSpanFromPeer(span)
-	time.Sleep(conf.SendTickerVal * 2)
+	time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2)
 	// adding one span with no parent ID should:
 	// * create the trace in the cache
 	// * send the trace
@@ -117,26 +156,27 @@
 // happening upstream of refinery. Writing down what got sent to refinery
 // will help people figure out what is going on.
 func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) {
-	transmission := &transmit.MockTransmission{}
-	transmission.Start()
+	// The sample rate applied by Refinery in this test's config.
+	const expectedDeterministicSampleRate = int(2)
+	// The sample rate happening upstream of Refinery. 
+ const originalSampleRate = uint(50) + conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, - GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 2}, - SendTickerVal: 2 * time.Millisecond, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: expectedDeterministicSampleRate}, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -150,7 +190,7 @@ func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) { go coll.collect() defer coll.Stop() - // Spin until a sample gets triggered + // Generate events until one is sampled and appears on the transmission queue for sending. sendAttemptCount := 0 for getEventsLength(transmission) < 1 { sendAttemptCount++ @@ -159,65 +199,72 @@ func TestOriginalSampleRateIsNotedInMetaField(t *testing.T) { Event: types.Event{ Dataset: "aoeu", APIKey: legacyAPIKey, - SampleRate: 50, + SampleRate: originalSampleRate, Data: make(map[string]interface{}), }, } - coll.AddSpan(span) - time.Sleep(conf.SendTickerVal * 5) + err := coll.AddSpan(span) + require.NoError(t, err, "must be able to add the span") + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 5) } transmission.Mux.RLock() - assert.Greater(t, len(transmission.Events), 0, "should be at least one event transmitted") - assert.Equal(t, uint(50), transmission.Events[0].Data["meta.refinery.original_sample_rate"], - "metadata should be populated with original sample rate") + require.Greater(t, len(transmission.Events), 0, + "At least one event should have been sampled and transmitted by now for us to make assertions upon.") + upstreamSampledEvent := transmission.Events[0] transmission.Mux.RUnlock() - span := &types.Span{ + assert.Equal(t, originalSampleRate, upstreamSampledEvent.Data["meta.refinery.original_sample_rate"], + "metadata should be populated with original sample rate") + assert.Equal(t, originalSampleRate*uint(expectedDeterministicSampleRate), upstreamSampledEvent.SampleRate, + "sample rate for the event should be the original sample rate multiplied by the deterministic sample rate") + + // Generate one more event with no upstream sampling applied. + err = coll.AddSpan(&types.Span{ TraceID: fmt.Sprintf("trace-%v", 1000), Event: types.Event{ - Dataset: "aoeu", + Dataset: "no-upstream-sampling", APIKey: legacyAPIKey, - SampleRate: 0, + SampleRate: 0, // no upstream sampling Data: make(map[string]interface{}), }, - } - - coll.AddSpan(span) + }) + require.NoError(t, err, "must be able to add the span") - time.Sleep(conf.SendTickerVal * 2) + // Find the Refinery-sampled-and-sent event that had no upstream sampling which + // should be the last event on the transmission queue. 
+ var noUpstreamSampleRateEvent *types.Event + require.Eventually(t, func() bool { + transmission.Mux.RLock() + defer transmission.Mux.RUnlock() + noUpstreamSampleRateEvent = transmission.Events[len(transmission.Events)-1] + return noUpstreamSampleRateEvent.Dataset == "no-upstream-sampling" + }, 5*time.Second, conf.GetTracesConfig().GetSendTickerValue()*2, "the event with no upstream sampling should have appeared in the transmission queue by now") - transmission.Mux.RLock() - assert.Equal(t, 2, len(transmission.Events), "should be some events transmitted") - assert.Nil(t, transmission.Events[1].Data["meta.refinery.original_sample_rate"], - "metadata should not be populated when zero") - transmission.Mux.RUnlock() + assert.Nil(t, noUpstreamSampleRateEvent.Data["meta.refinery.original_sample_rate"], + "original sample rate should not be set in metadata when original sample rate is zero") } // HoneyComb treats a missing or 0 SampleRate the same as 1, but // behaves better/more consistently if the SampleRate is explicitly // set instead of inferred func TestTransmittedSpansShouldHaveASampleRateOfAtLeastOne(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c @@ -243,13 +290,13 @@ func TestTransmittedSpansShouldHaveASampleRateOfAtLeastOne(t *testing.T) { coll.AddSpan(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) assert.Eventually(t, func() bool { transmission.Mux.RLock() defer transmission.Mux.RUnlock() return len(transmission.Events) > 0 - }, 2*time.Second, conf.SendTickerVal*2) + }, 2*time.Second, conf.GetTracesConfig().GetSendTickerValue()*2) transmission.Mux.RLock() assert.Equal(t, uint(1), transmission.Events[0].SampleRate, @@ -267,26 +314,23 @@ func getEventsLength(transmission *transmit.MockTransmission) int { // TestAddSpan tests that adding a span winds up with a trace object in the // cache func TestAddSpan(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: 
[]string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -312,8 +356,10 @@ func TestAddSpan(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) - assert.Equal(t, traceID, coll.getFromCache(traceID).TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + trace := coll.getFromCache(traceID) + require.NotNil(t, trace) + assert.Equal(t, traceID, trace.TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") assert.Equal(t, 0, len(transmission.Events), "adding a non-root span should not yet send the span") // ok now let's add the root span and verify that both got sent rootSpan := &types.Span{ @@ -325,7 +371,7 @@ func TestAddSpan(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 5) assert.Nil(t, coll.getFromCache(traceID), "after adding a leaf and root span, it should be removed from the cache") transmission.Mux.RLock() assert.Equal(t, 2, len(transmission.Events), "adding a root span should send all spans in the trace") @@ -335,31 +381,32 @@ func TestAddSpan(t *testing.T) { // TestDryRunMode tests that all traces are sent, regardless of sampling decision, and that the // sampling decision is marked on each span in the trace func TestDryRunMode(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(20 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{ SampleRate: 10, }, - SendTickerVal: 20 * time.Millisecond, DryRun: true, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + samplerFactory := &sample.SamplerFactory{ Config: conf, Logger: &logger.NullLogger{}, } sampler := samplerFactory.GetSamplerImplementationForKey("test", true) - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: samplerFactory, - } + coll.SamplerFactory = samplerFactory c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -396,7 +443,8 @@ func TestDryRunMode(t *testing.T) { }, } coll.AddSpan(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) 
+ // adding one span with no parent ID should: // * create the trace in the cache // * send the trace @@ -419,7 +467,8 @@ func TestDryRunMode(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Equal(t, traceID2, coll.getFromCache(traceID2).TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") span = &types.Span{ @@ -430,7 +479,8 @@ func TestDryRunMode(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + // adding root span to send the trace transmission.Mux.RLock() assert.Equal(t, 3, len(transmission.Events), "adding another root span should send the span") @@ -452,7 +502,8 @@ func TestDryRunMode(t *testing.T) { }, } coll.AddSpan(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + // adding one span with no parent ID should: // * create the trace in the cache // * send the trace @@ -465,16 +516,17 @@ func TestDryRunMode(t *testing.T) { } func TestCacheSizeReload(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() - conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 10 * time.Minute, - GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(10 * time.Minute), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, GetCollectionConfigVal: config.CollectionConfig{ CacheCapacity: 1, + ShutdownDelay: config.Duration(1 * time.Millisecond), }, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, SampleCache: config.SampleCacheConfig{ @@ -484,17 +536,10 @@ func TestCacheSizeReload(t *testing.T) { }, } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, - }, - } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + coll.Peers = &peer.MockPeers{} err := coll.Start() assert.NoError(t, err) @@ -526,41 +571,39 @@ func TestCacheSizeReload(t *testing.T) { conf.Mux.Lock() conf.GetCollectionConfigVal.CacheCapacity = 2 conf.Mux.Unlock() - conf.ReloadConfig() + conf.Reload() assert.Eventually(t, func() bool { coll.mutex.RLock() defer coll.mutex.RUnlock() - - return coll.cache.(*cache.DefaultInMemCache).GetCacheSize() == 2 + return coll.cache.GetCacheCapacity() == 2 }, 60*wait, wait, "cache size to change") err = coll.AddSpan(&types.Span{TraceID: "3", Event: event}) assert.NoError(t, err) - time.Sleep(5 * conf.SendTickerVal) + time.Sleep(5 * conf.GetTracesConfig().GetSendTickerValue()) assert.True(t, check(), "expected no more traces evicted and sent") conf.Mux.Lock() conf.GetCollectionConfigVal.CacheCapacity = 1 conf.Mux.Unlock() - conf.ReloadConfig() + conf.Reload() expectedEvents = 2 assert.Eventually(t, check, 60*wait, wait, "expected another trace evicted and sent") } func TestSampleConfigReload(t *testing.T) { - transmission := &transmit.MockTransmission{} - - transmission.Start() - conf := 
&config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10}, + GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10, ShutdownDelay: config.Duration(1 * time.Millisecond)}, SampleCache: config.SampleCacheConfig{ KeptSize: 100, DroppedSize: 100, @@ -568,17 +611,9 @@ func TestSampleConfigReload(t *testing.T) { }, } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, - }, - } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) err := coll.Start() assert.NoError(t, err) @@ -598,21 +633,20 @@ func TestSampleConfigReload(t *testing.T) { assert.Eventually(t, func() bool { coll.mutex.Lock() - defer coll.mutex.Unlock() - _, ok := coll.datasetSamplers[dataset] + coll.mutex.Unlock() + return ok - }, conf.GetTraceTimeoutVal*2, conf.SendTickerVal) + }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) - conf.ReloadConfig() + conf.Reload() assert.Eventually(t, func() bool { coll.mutex.Lock() - defer coll.mutex.Unlock() - _, ok := coll.datasetSamplers[dataset] + coll.mutex.Unlock() return !ok - }, conf.GetTraceTimeoutVal*2, conf.SendTickerVal) + }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) span = &types.Span{ TraceID: "2", @@ -626,34 +660,32 @@ func TestSampleConfigReload(t *testing.T) { assert.Eventually(t, func() bool { coll.mutex.Lock() - defer coll.mutex.Unlock() - _, ok := coll.datasetSamplers[dataset] + coll.mutex.Unlock() return ok - }, conf.GetTraceTimeoutVal*2, conf.SendTickerVal) + }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) } func TestStableMaxAlloc(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 10 * time.Minute, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(10 * time.Minute), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + CacheCapacity: 1000, }, } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + spandata := make([]map[string]interface{}, 500) for i := 0; i < 500; 
i++ { spandata[i] = map[string]interface{}{ @@ -689,7 +721,7 @@ func TestStableMaxAlloc(t *testing.T) { } for len(coll.incoming) > 0 { - time.Sleep(conf.SendTickerVal) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue()) } // Now there should be 500 traces in the cache. @@ -703,9 +735,7 @@ func TestStableMaxAlloc(t *testing.T) { runtime.ReadMemStats(&mem) // Set MaxAlloc, which should cause cache evictions. conf.GetCollectionConfigVal.MaxAlloc = config.MemorySize(mem.Alloc * 99 / 100) - coll.mutex.Unlock() - // wait for the cache to take some action var traces []*types.Trace for { @@ -716,10 +746,10 @@ func TestStableMaxAlloc(t *testing.T) { } coll.mutex.Unlock() - time.Sleep(conf.SendTickerVal) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue()) } - assert.Equal(t, 1000, coll.cache.(*cache.DefaultInMemCache).GetCacheSize(), "cache size shouldn't change") + assert.Equal(t, 1000, coll.cache.GetCacheCapacity(), "cache size shouldn't change") tracesLeft := len(traces) assert.Less(t, tracesLeft, 480, "should have sent some traces") @@ -734,26 +764,25 @@ func TestStableMaxAlloc(t *testing.T) { } func TestAddSpanNoBlock(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 10 * time.Minute, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(10 * time.Minute), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + CacheCapacity: 10, }, } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(10, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -792,6 +821,10 @@ func TestDependencyInjection(t *testing.T) { &inject.Object{Value: &InMemCollector{}}, &inject.Object{Value: &config.MockConfig{}}, &inject.Object{Value: &logger.NullLogger{}}, + &inject.Object{Value: noop.NewTracerProvider().Tracer("test"), Name: "tracer"}, + &inject.Object{Value: clockwork.NewRealClock()}, + &inject.Object{Value: &health.Health{}}, + &inject.Object{Value: &sharder.SingleServerSharder{}}, &inject.Object{Value: &transmit.MockTransmission{}, Name: "upstreamTransmission"}, &inject.Object{Value: &metrics.NullMetrics{}, Name: "genericMetrics"}, &inject.Object{Value: &sample.SamplerFactory{}}, @@ -810,28 +843,27 @@ func TestDependencyInjection(t *testing.T) { // the cache and that that trace gets span count, span event count, span link count, and event count added to it // This test also makes sure that AddCountsToRoot overrides the AddSpanCountToRoot config. 
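+// (With only AddSpanCountToRoot set, the root would get just meta.span_count;
+// AddCountsToRoot expands that to span, span event, span link, and event counts.)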
func TestAddCountsToRoot(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, AddSpanCountToRoot: true, AddCountsToRoot: true, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + CacheCapacity: 3, }, } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -864,7 +896,8 @@ func TestAddCountsToRoot(t *testing.T) { } coll.AddSpanFromPeer(span) } - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Equal(t, traceID, coll.getFromCache(traceID).TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") assert.Equal(t, 0, len(transmission.Events), "adding a non-root span should not yet send the span") // ok now let's add the root span and verify that both got sent @@ -877,7 +910,8 @@ func TestAddCountsToRoot(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Nil(t, coll.getFromCache(traceID), "after adding a leaf and root span, it should be removed from the cache") transmission.Mux.RLock() assert.Equal(t, 5, len(transmission.Events), "adding a root span should send all spans in the trace") @@ -896,29 +930,27 @@ func TestAddCountsToRoot(t *testing.T) { // TestLateRootGetsCounts tests that the root span gets decorated with the right counts // even if the trace had already been sent func TestLateRootGetsCounts(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 5 * time.Millisecond, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Millisecond), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, AddSpanCountToRoot: true, AddCountsToRoot: true, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, AddRuleReasonToTrace: true, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + + transmission := &transmit.MockTransmission{} + transmission.Start() 
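+	// the 5ms TraceTimeout above means the trace times out and is sent before
+	// the root span arrives, exercising the late-root decoration path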
+ coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -952,7 +984,7 @@ func TestLateRootGetsCounts(t *testing.T) { } coll.AddSpanFromPeer(span) } - time.Sleep(conf.SendTickerVal * 10) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 10) trace := coll.getFromCache(traceID) assert.Nil(t, trace, "trace should have been sent although the root span hasn't arrived") @@ -967,7 +999,8 @@ func TestLateRootGetsCounts(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Nil(t, coll.getFromCache(traceID), "after adding a leaf and root span, it should be removed from the cache") transmission.Mux.RLock() assert.Equal(t, 5, len(transmission.Events), "adding a root span should send all spans in the trace") @@ -980,34 +1013,31 @@ func TestLateRootGetsCounts(t *testing.T) { assert.Equal(t, int64(2), transmission.Events[4].Data["meta.span_event_count"], "root span metadata should be populated with span event count") assert.Equal(t, int64(1), transmission.Events[4].Data["meta.span_link_count"], "root span metadata should be populated with span link count") assert.Equal(t, int64(5), transmission.Events[4].Data["meta.event_count"], "root span metadata should be populated with event count") - assert.Equal(t, "late", transmission.Events[4].Data["meta.refinery.reason"], "late spans should have meta.refinery.reason set to late.") + assert.Equal(t, "deterministic/always - late arriving span", transmission.Events[4].Data["meta.refinery.reason"], "late spans should have meta.refinery.reason set to rules + late arriving span.") transmission.Mux.RUnlock() } // TestAddSpanCount tests that adding a root span winds up with a trace object in // the cache and that that trace gets span count added to it func TestAddSpanCount(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, AddSpanCountToRoot: true, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -1033,7 +1063,8 @@ func TestAddSpanCount(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Equal(t, traceID, coll.getFromCache(traceID).TraceID, "after adding the span, we should have a trace in the cache with the right trace ID") assert.Equal(t, 0, len(transmission.Events), "adding a non-root 
span should not yet send the span") // ok now let's add the root span and verify that both got sent @@ -1046,7 +1077,8 @@ func TestAddSpanCount(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Nil(t, coll.getFromCache(traceID), "after adding a leaf and root span, it should be removed from the cache") transmission.Mux.RLock() assert.Equal(t, 2, len(transmission.Events), "adding a root span should send all spans in the trace") @@ -1058,28 +1090,25 @@ func TestAddSpanCount(t *testing.T) { // TestLateRootGetsSpanCount tests that the root span gets decorated with the right span count // even if the trace had already been sent func TestLateRootGetsSpanCount(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 5 * time.Millisecond, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Millisecond), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, AddSpanCountToRoot: true, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, AddRuleReasonToTrace: true, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -1105,11 +1134,12 @@ func TestLateRootGetsSpanCount(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 10) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 10) trace := coll.getFromCache(traceID) assert.Nil(t, trace, "trace should have been sent although the root span hasn't arrived") assert.Equal(t, 1, len(transmission.Events), "adding a non-root span and waiting should send the span") + // now we add the root span and verify that both got sent and that the root span had the span count rootSpan := &types.Span{ TraceID: traceID, @@ -1120,39 +1150,40 @@ func TestLateRootGetsSpanCount(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + assert.Nil(t, coll.getFromCache(traceID), "after adding a leaf and root span, it should be removed from the cache") - transmission.Mux.RLock() - assert.Equal(t, 2, len(transmission.Events), "adding a root span should send all spans in the trace") - assert.Equal(t, nil, transmission.Events[0].Data["meta.span_count"], "child span metadata should NOT be populated with span count") - assert.Equal(t, int64(2), transmission.Events[1].Data["meta.span_count"], "root span metadata should be populated with span count") - assert.Equal(t, "late", transmission.Events[1].Data["meta.refinery.reason"], "late spans should have meta.refinery.reason set to late.") - transmission.Mux.RUnlock() + assert.EventuallyWithT(t, func(collect *assert.CollectT) { + 
transmission.Mux.RLock() + assert.Equal(collect, 2, len(transmission.Events), "adding a root span should send all spans in the trace") + assert.Equal(collect, nil, transmission.Events[0].Data["meta.span_count"], "child span metadata should NOT be populated with span count") + assert.Equal(collect, int64(2), transmission.Events[1].Data["meta.span_count"], "root span metadata should be populated with span count") + assert.Equal(collect, "deterministic/always - late arriving span", transmission.Events[1].Data["meta.refinery.reason"], "late spans should have meta.refinery.reason set to rules + late arriving span.") + transmission.Mux.RUnlock() + }, 2*conf.GetTracesConfig().GetSendTickerValue(), 1*time.Millisecond) } // TestLateSpanNotDecorated tests that spans do not get decorated with 'meta.refinery.reason' meta field // if the AddRuleReasonToTrace attribute is not set in config func TestLateSpanNotDecorated(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 5 * time.Minute, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Minute), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -1178,7 +1209,6 @@ func TestLateSpanNotDecorated(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) rootSpan := &types.Span{ TraceID: traceID, @@ -1189,37 +1219,38 @@ func TestLateSpanNotDecorated(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) - transmission.Mux.RLock() - assert.Equal(t, 2, len(transmission.Events), "adding a root span should send all spans in the trace") - assert.Equal(t, nil, transmission.Events[1].Data["meta.refinery.reason"], "late span should not have meta.refinery.reason set to late") - transmission.Mux.RUnlock() + + assert.EventuallyWithT(t, func(c *assert.CollectT) { + transmission.Mux.RLock() + assert.Equal(c, 2, len(transmission.Events), "adding a root span should send all spans in the trace") + if len(transmission.Events) == 2 { + assert.Equal(c, nil, transmission.Events[1].Data["meta.refinery.reason"], "late span should not have meta.refinery.reason set to late") + } + transmission.Mux.RUnlock() + }, 5*time.Second, conf.GetTracesConfig().GetSendTickerValue()) } func TestAddAdditionalAttributes(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, - GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * 
time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, AdditionalAttributes: map[string]string{ "name": "foo", "other": "bar", }, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -1245,7 +1276,7 @@ func TestAddAdditionalAttributes(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) rootSpan := &types.Span{ TraceID: traceID, @@ -1256,7 +1287,7 @@ func TestAddAdditionalAttributes(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 5) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 5) transmission.Mux.RLock() assert.Equal(t, 2, len(transmission.Events), "should be some events transmitted") assert.Equal(t, "foo", transmission.Events[0].Data["name"], "new attribute should appear in data") @@ -1265,16 +1296,96 @@ func TestAddAdditionalAttributes(t *testing.T) { } +func TestStressReliefSampleRate(t *testing.T) { + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Minute), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + }, + } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + + stc, err := newCache() + assert.NoError(t, err, "lru cache should start") + coll.sampleTraceCache = stc + + var traceID = "traceABC" + + span := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "trace.parent_id": "unused", + }, + APIKey: legacyAPIKey, + }, + } + coll.StressRelief = &MockStressReliever{ + IsStressed: true, + SampleDeterministically: true, + ShouldKeep: true, + SampleRate: 100, + } + processed, kept := coll.ProcessSpanImmediately(span) + require.True(t, processed) + require.True(t, kept) + + tr, _, found := coll.sampleTraceCache.CheckTrace(traceID) + require.True(t, found) + require.NotNil(t, tr) + assert.Equal(t, uint(100), tr.Rate()) + + transmission.Mux.RLock() + assert.Equal(t, 1, len(transmission.Events), "span should immediately be sent during stress relief") + assert.Equal(t, uint(100), transmission.Events[0].SampleRate) + transmission.Mux.RUnlock() + + rootSpan := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{}, + APIKey: legacyAPIKey, + SampleRate: 10, + }, + } + + processed2, kept2 := coll.ProcessSpanImmediately(rootSpan) + require.True(t, processed2) + require.True(t, kept2) + + 
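+	// the root span arrived with an upstream SampleRate of 10 and stress relief
+	// sampled at a rate of 100, so the transmitted event below is expected to
+	// carry the product of the two: 10 * 100 = 1000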
tr2, _, found2 := coll.sampleTraceCache.CheckTrace(traceID) + require.True(t, found2) + require.NotNil(t, tr2) + assert.Equal(t, uint(100), tr2.Rate()) + transmission.Mux.RLock() + assert.Equal(t, 2, len(transmission.Events), "span should immediately be sent during stress relief") + assert.Equal(t, uint(1000), transmission.Events[1].SampleRate) + transmission.Mux.RUnlock() +} + // TestStressReliefDecorateHostname tests that the span gets decorated with hostname if // StressReliefMode is active func TestStressReliefDecorateHostname(t *testing.T) { - transmission := &transmit.MockTransmission{} - transmission.Start() conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 5 * time.Minute, + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Minute), + MaxBatchSize: 500, + }, GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, StressRelief: config.StressReliefConfig{ Mode: "monitor", @@ -1282,19 +1393,16 @@ func TestStressReliefDecorateHostname(t *testing.T) { DeactivationLevel: 25, SamplingRate: 100, }, - } - coll := &InMemCollector{ - Config: conf, - Logger: &logger.NullLogger{}, - Transmission: transmission, - Metrics: &metrics.NullMetrics{}, - StressRelief: &MockStressReliever{}, - SamplerFactory: &sample.SamplerFactory{ - Config: conf, - Logger: &logger.NullLogger{}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), }, - hostname: "host123", } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + + coll.hostname = "host123" c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) coll.cache = c stc, err := newCache() @@ -1320,7 +1428,7 @@ func TestStressReliefDecorateHostname(t *testing.T) { }, } coll.AddSpanFromPeer(span) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) rootSpan := &types.Span{ TraceID: traceID, @@ -1331,10 +1439,483 @@ func TestStressReliefDecorateHostname(t *testing.T) { }, } coll.AddSpan(rootSpan) - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + transmission.Mux.RLock() assert.Equal(t, 2, len(transmission.Events), "adding a root span should send all spans in the trace") assert.Equal(t, "host123", transmission.Events[1].Data["meta.refinery.local_hostname"]) transmission.Mux.RUnlock() } + +func TestSpanWithRuleReasons(t *testing.T) { + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(5 * time.Millisecond), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "rule 1", + Scope: "trace", + SampleRate: 1, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "test", + Operator: config.EQ, + Value: int64(1), + }, + }, + Sampler: &config.RulesBasedDownstreamSampler{ + DynamicSampler: &config.DynamicSamplerConfig{ + SampleRate: 1, + FieldList: []string{"http.status_code"}, + }, + }, + }, + { + Name: "rule 2", + Scope: "span", + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "test", + Operator: config.EQ, + 
Value: int64(2), + }, + }, + Sampler: &config.RulesBasedDownstreamSampler{ + EMADynamicSampler: &config.EMADynamicSamplerConfig{ + GoalSampleRate: 1, + FieldList: []string{"http.status_code"}, + }, + }, + }, + }}, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + AddRuleReasonToTrace: true, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + }, + } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) + coll.cache = c + stc, err := newCache() + assert.NoError(t, err, "lru cache should start") + coll.sampleTraceCache = stc + + coll.incoming = make(chan *types.Span, 5) + coll.fromPeer = make(chan *types.Span, 5) + coll.datasetSamplers = make(map[string]sample.Sampler) + go coll.collect() + defer coll.Stop() + + traceIDs := []string{"trace1", "trace2"} + + for i := 0; i < 4; i++ { + span := &types.Span{ + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "trace.parent_id": "unused", + "http.status_code": 200, + }, + APIKey: legacyAPIKey, + }, + } + switch i { + case 0, 1: + span.TraceID = traceIDs[0] + span.Data["test"] = int64(1) + case 2, 3: + span.TraceID = traceIDs[1] + span.Data["test"] = int64(2) + } + coll.AddSpanFromPeer(span) + } + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 10) + + for i, traceID := range traceIDs { + assert.Nil(t, coll.getFromCache(traceID), "trace should have been sent although the root span hasn't arrived") + rootSpan := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "http.status_code": 200, + }, + APIKey: legacyAPIKey, + }, + } + if i == 0 { + rootSpan.Data["test"] = int64(1) + } else { + rootSpan.Data["test"] = int64(2) + } + + coll.AddSpan(rootSpan) + } + // now that the root spans have been added, verify that all spans were sent with the right reasons + time.Sleep(conf.GetTracesConfig().GetSendTickerValue() * 2) + + transmission.Mux.RLock() + assert.Equal(t, 6, len(transmission.Events), "adding a root span should send all spans in the trace") + for _, event := range transmission.Events { + reason := event.Data["meta.refinery.reason"] + if event.Data["test"] == int64(1) { + if _, ok := event.Data["trace.parent_id"]; ok { + assert.Equal(t, "rules/trace/rule 1:dynamic", reason, event.Data) + } else { + assert.Equal(t, "rules/trace/rule 1:dynamic - late arriving span", reason, event.Data) + } + } else { + if _, ok := event.Data["trace.parent_id"]; ok { + assert.Equal(t, "rules/span/rule 2:emadynamic", reason, event.Data) + } else { + assert.Equal(t, "rules/span/rule 2:emadynamic - late arriving span", reason, event.Data) + } + } + } + transmission.Mux.RUnlock() +} + +func TestIsRootSpan(t *testing.T) { + testCases := []struct { + name string + span *types.Span + expected bool + }{ + { + name: "root span - no parent id", + span: &types.Span{ + Event: types.Event{ + Data: map[string]interface{}{}, + }, + }, + expected: true, + }, + { + name: "root span - empty parent id", + span: &types.Span{ + Event: types.Event{ + Data: map[string]interface{}{ + "trace.parent_id": "", + }, + }, + }, + expected: true, + }, + { + name: "non-root span - parent id", + span: &types.Span{ + Event: types.Event{ + Data: map[string]interface{}{ + "trace.parent_id": "some-id", + }, + }, + }, + expected: false, + }, + { + name: "non-root span - no parent id but has signal_type of log", + span: &types.Span{ + Event: types.Event{ + Data: map[string]interface{}{ + "meta.signal_type": "log", + }, + }, + }, + expected: false, + }, + } + + collector := &InMemCollector{ + Config: &config.MockConfig{ + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(1 * time.Millisecond), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, collector.isRootSpan(tc.span)) + }) + } +} + +func TestRedistributeTraces(t *testing.T) { + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(1 * time.Second), + SendTicker: config.Duration(2 * time.Millisecond), + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{CacheCapacity: 10}, + SampleCache: config.SampleCacheConfig{ + KeptSize: 100, + DroppedSize: 100, + SizeCheckInterval: config.Duration(1 * time.Second), + }, + } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + s := &sharder.MockSharder{ + Self: &sharder.TestShard{Addr: "api1"}, + } + + coll.Sharder = s + + err := coll.Start() + assert.NoError(t, err) + defer coll.Stop() + + dataset := "aoeu" + + span := &types.Span{ + TraceID: "1", + Event: types.Event{ + Dataset: dataset, + APIKey: legacyAPIKey, + APIHost: "api1", + Data: make(map[string]interface{}), + }, + } + + coll.AddSpan(span) + + assert.Eventually(t, func() bool { + transmission.Mux.Lock() + defer transmission.Mux.Unlock() + + return len(transmission.Events) == 1 && transmission.Events[0].APIHost == "api1" + }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) + transmission.Flush() + + s.Other = &sharder.TestShard{Addr: "api2"} + span = &types.Span{ + TraceID: "11", + Event: types.Event{ + Dataset: dataset, + APIKey: legacyAPIKey, + Data: make(map[string]interface{}), + }, + } + trace := &types.Trace{ + TraceID: span.TraceID, + Dataset: dataset, + SendBy: coll.Clock.Now().Add(5 * time.Second), + } + trace.AddSpan(span) + + coll.mutex.Lock() + coll.cache.Set(trace) + coll.mutex.Unlock() + coll.Peers.RegisterUpdatedPeersCallback(coll.redistributeTimer.Reset) + + assert.Eventually(t, func() bool { + transmission.Mux.Lock() + defer transmission.Mux.Unlock() + if len(transmission.Events) == 0 { + return false + } + + return len(transmission.Events) == 1 && transmission.Events[0].APIHost == "api2" + }, conf.GetTracesConfig().GetTraceTimeout()*2, conf.GetTracesConfig().GetSendTickerValue()) +} + +func TestDrainTracesOnShutdown(t *testing.T) { + // set up the trace cache + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + GetCollectionConfigVal: config.CollectionConfig{ + ShutdownDelay: config.Duration(100 * time.Millisecond), + CacheCapacity: 3, + }, + } + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + coll.hostname = "host123" + 
coll.Sharder = &sharder.MockSharder{ + Self: &sharder.TestShard{Addr: "api1"}, + Other: &sharder.TestShard{Addr: "api2"}, + } + + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) + coll.cache = c + stc, err := newCache() + assert.NoError(t, err, "lru cache should start") + coll.sampleTraceCache = stc + + coll.incoming = make(chan *types.Span, 5) + coll.fromPeer = make(chan *types.Span, 5) + coll.datasetSamplers = make(map[string]sample.Sampler) + + sentTraceChan := make(chan sentRecord, 1) + forwardTraceChan := make(chan *types.Span, 1) + + // test 1 + // the trace in cache already has decision made + trace1 := &types.Trace{ + TraceID: "traceID1", + } + span1 := &types.Span{ + TraceID: "traceID1", + Event: types.Event{ + Dataset: "aoeu", + Data: make(map[string]interface{}), + }, + } + + stc.Record(trace1, true, "test") + + coll.distributeSpansOnShutdown(sentTraceChan, forwardTraceChan, span1) + require.Len(t, sentTraceChan, 1) + require.Len(t, forwardTraceChan, 0) + + ctx1, cancel1 := context.WithCancel(context.Background()) + go coll.sendSpansOnShutdown(ctx1, sentTraceChan, forwardTraceChan) + require.EventuallyWithT(t, func(collect *assert.CollectT) { + transmission.Mux.Lock() + events := transmission.Events + require.Len(collect, events, 1) + require.Equal(collect, span1.Dataset, events[0].Dataset) + transmission.Mux.Unlock() + }, 2*time.Second, 100*time.Millisecond) + + cancel1() + transmission.Flush() + + // test 2 + // we can't make a decision for the trace yet, let's + // forward it to its new home + span2 := &types.Span{ + TraceID: "traceID2", + Event: types.Event{ + Dataset: "test2", + Data: make(map[string]interface{}), + }, + } + + coll.distributeSpansOnShutdown(sentTraceChan, forwardTraceChan, span2) + require.Len(t, sentTraceChan, 0) + require.Len(t, forwardTraceChan, 1) + + ctx2, cancel2 := context.WithCancel(context.Background()) + go coll.sendSpansOnShutdown(ctx2, sentTraceChan, forwardTraceChan) + require.EventuallyWithT(t, func(collect *assert.CollectT) { + transmission.Mux.Lock() + require.Len(collect, transmission.Events, 1) + require.Equal(collect, span2.Dataset, transmission.Events[0].Dataset) + require.Equal(collect, "api2", transmission.Events[0].APIHost) + transmission.Mux.Unlock() + }, 2*time.Second, 100*time.Millisecond) + cancel2() +} + +func TestBigTracesGoEarly(t *testing.T) { + spanlimit := 200 + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(10 * time.Millisecond), + TraceTimeout: config.Duration(500 * time.Millisecond), + SpanLimit: uint(spanlimit - 1), + MaxBatchSize: 1500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 2}, + AddSpanCountToRoot: true, + AddCountsToRoot: true, + ParentIdFieldNames: []string{"trace.parent_id", "parentId"}, + AddRuleReasonToTrace: true, + } + + transmission := &transmit.MockTransmission{} + transmission.Start() + coll := newTestCollector(conf, transmission) + + c := cache.NewInMemCache(3, &metrics.NullMetrics{}, &logger.NullLogger{}) + coll.cache = c + stc, err := newCache() + assert.NoError(t, err, "lru cache should start") + coll.sampleTraceCache = stc + + coll.incoming = make(chan *types.Span, 500) + coll.fromPeer = make(chan *types.Span, 500) + coll.datasetSamplers = make(map[string]sample.Sampler) + go coll.collect() + defer coll.Stop() + + // this name was chosen to be Kept with the deterministic/2 sampler + var traceID = "myTrace" + + for i := 0; i < spanlimit; 
i++ { + span := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{ + "trace.parent_id": "unused", + "index": i, + }, + APIKey: legacyAPIKey, + }, + } + coll.AddSpanFromPeer(span) + } + + // wait for all the events to be transmitted + assert.EventuallyWithT(t, func(collect *assert.CollectT) { + transmission.Mux.RLock() + assert.Equal(collect, spanlimit, len(transmission.Events), "hitting the spanlimit should send the trace") + transmission.Mux.RUnlock() + }, 5*time.Second, 100*time.Millisecond) + + // now we add the root span and verify that it got sent and that the root span had the span count + rootSpan := &types.Span{ + TraceID: traceID, + Event: types.Event{ + Dataset: "aoeu", + Data: map[string]interface{}{}, + APIKey: legacyAPIKey, + }, + } + coll.AddSpan(rootSpan) + + assert.EventuallyWithT(t, func(collect *assert.CollectT) { + transmission.Mux.RLock() + assert.Equal(collect, spanlimit+1, len(transmission.Events), "hitting the spanlimit should send the trace") + transmission.Mux.RUnlock() + }, 5*time.Second, 100*time.Millisecond) + + transmission.Mux.RLock() + require.Equal(t, spanlimit+1, len(transmission.Events), "adding a root span should send all spans in the trace") + assert.Equal(t, nil, transmission.Events[0].Data["meta.span_count"], "child span metadata should NOT be populated with span count") + assert.Equal(t, "trace_send_span_limit", transmission.Events[0].Data["meta.refinery.send_reason"], "child span send reason should be set to trace_send_span_limit") + assert.EqualValues(t, spanlimit+1, transmission.Events[spanlimit].Data["meta.span_count"], "root span metadata should be populated with span count") + assert.EqualValues(t, spanlimit+1, transmission.Events[spanlimit].Data["meta.event_count"], "root span metadata should be populated with event count") + assert.Equal(t, "deterministic/chance - late arriving span", transmission.Events[spanlimit].Data["meta.refinery.reason"], "the late root span should have meta.refinery.reason set to rules + late arriving span.") + assert.EqualValues(t, 2, transmission.Events[spanlimit].SampleRate, "the late root span should have the sample rate set") + assert.Equal(t, "trace_send_late_span", transmission.Events[spanlimit].Data["meta.refinery.send_reason"], "send reason should indicate a late arriving span") + transmission.Mux.RUnlock() +} diff --git a/collect/mockCollector.go b/collect/mockCollector.go new file mode 100644 index 0000000000..4ad0253ecb --- /dev/null +++ b/collect/mockCollector.go @@ -0,0 +1,51 @@ +package collect + +import ( + "github.com/honeycombio/refinery/types" +) + +type MockCollector struct { + Spans chan *types.Span +} + +func NewMockCollector() *MockCollector { + return &MockCollector{ + Spans: make(chan *types.Span, 100), + } +} + +func (m *MockCollector) AddSpan(span *types.Span) error { + m.Spans <- span + return nil +} + +func (m *MockCollector) AddSpanFromPeer(span *types.Span) error { + m.Spans <- span + return nil +} + +func (m *MockCollector) GetStressedSampleRate(traceID string) (rate uint, keep bool, reason string) { + return 0, false, "" +} + +func (m *MockCollector) ProcessSpanImmediately(sp *types.Span) (bool, bool) { + m.Spans <- sp + + return true, true +} + +func (m *MockCollector) Stressed() bool { + return false +} + +func (m *MockCollector) Flush() { + for { + select { + case <-m.Spans: + default: + return + } + } +} + +var _ Collector = (*MockCollector)(nil) diff --git a/collect/stressRelief.go b/collect/stressRelief.go index 4dfc4b0914..4e77caa265 
100644 --- a/collect/stressRelief.go +++ b/collect/stressRelief.go @@ -1,35 +1,55 @@ package collect import ( + "context" "fmt" "math" + "strconv" + "strings" "sync" "time" "github.com/dgryski/go-wyhash" + "github.com/facebookgo/startstop" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" + "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" + "github.com/jonboulle/clockwork" ) +const stressReliefTopic = "refinery-stress-relief" + type StressReliever interface { - Start() error - UpdateFromConfig(cfg config.StressReliefConfig) error - Recalc() - StressLevel() uint + UpdateFromConfig(cfg config.StressReliefConfig) + Recalc() uint Stressed() bool GetSampleRate(traceID string) (rate uint, keep bool, reason string) + ShouldSampleDeterministically(traceID string) bool + + startstop.Starter } -type MockStressReliever struct{} +var _ StressReliever = &MockStressReliever{} -func (m *MockStressReliever) Start() error { return nil } -func (m *MockStressReliever) UpdateFromConfig(cfg config.StressReliefConfig) error { return nil } -func (m *MockStressReliever) Recalc() {} -func (m *MockStressReliever) StressLevel() uint { return 0 } -func (m *MockStressReliever) Stressed() bool { return false } +type MockStressReliever struct { + IsStressed bool + SampleDeterministically bool + ShouldKeep bool + SampleRate uint +} + +func (m *MockStressReliever) Start() error { return nil } +func (m *MockStressReliever) UpdateFromConfig(cfg config.StressReliefConfig) {} +func (m *MockStressReliever) Recalc() uint { return 0 } +func (m *MockStressReliever) Stressed() bool { return m.IsStressed } func (m *MockStressReliever) GetSampleRate(traceID string) (rate uint, keep bool, reason string) { - return 1, false, "" + return m.SampleRate, m.ShouldKeep, "mock" +} +func (m *MockStressReliever) ShouldSampleDeterministically(traceID string) bool { + return m.SampleDeterministically } // hashSeed is a random value to seed the hash generator for the sampler. 
@@ -45,31 +65,61 @@ const ( Always ) +type stressReport struct { + key string + level uint + // we need to expire these reports after a certain amount of time + timestamp time.Time +} + +var _ StressReliever = &StressRelief{} + type StressRelief struct { - mode StressReliefMode - activateLevel uint - deactivateLevel uint - sampleRate uint64 - upperBound uint64 - stressLevel uint - reason string - formula string - stressed bool - stayOnUntil time.Time - minDuration time.Duration RefineryMetrics metrics.Metrics `inject:"metrics"` Logger logger.Logger `inject:""` + Health health.Recorder `inject:""` + PubSub pubsub.PubSub `inject:""` + Peer peer.Peers `inject:""` + Clock clockwork.Clock `inject:""` Done chan struct{} + mode StressReliefMode + hostID string + activateLevel uint + deactivateLevel uint + sampleRate uint64 + upperBound uint64 + overallStressLevel uint + reason string + formula string + stressed bool + stayOnUntil time.Time + minDuration time.Duration + algorithms map[string]func(string, string) float64 calcs []StressReliefCalculation - lock sync.RWMutex + + lock sync.RWMutex + stressLevels map[string]stressReport + // only used in tests + disableStressLevelReport bool } +const StressReliefHealthKey = "stress_relief" + func (s *StressRelief) Start() error { s.Logger.Debug().Logf("Starting StressRelief system") defer func() { s.Logger.Debug().Logf("Finished starting StressRelief system") }() + // register with health + s.Health.Register(StressReliefHealthKey, 3*time.Second) + + // register stress level metrics + s.RefineryMetrics.Register("cluster_stress_level", "gauge") + s.RefineryMetrics.Register("individual_stress_level", "gauge") + s.RefineryMetrics.Register("stress_level", "gauge") + s.RefineryMetrics.Register("stress_relief_activated", "gauge") + // We use an algorithms map so that we can name these algorithms, which makes it easier for several things: // - change our mind about which algorithm to use // - logging the algorithm actually used @@ -92,15 +142,54 @@ func (s *StressRelief) Start() error { {Numerator: "memory_heap_allocation", Denominator: "MEMORY_MAX_ALLOC", Algorithm: "sigmoid", Reason: "MaxAlloc"}, } + var err error + s.hostID, err = s.Peer.GetInstanceID() + if err != nil { + return fmt.Errorf("failed to get host ID: %w", err) + } + + s.stressLevels = make(map[string]stressReport) + + // Subscribe to the stress relief topic so we can react to stress level + // changes in the cluster. 
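+	// Peers publish their levels as "peerID|level" strings (see
+	// stressReliefMessage.String below); onStressLevelUpdate records each
+	// report so that clusterStressLevel can aggregate a cluster-wide view.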
+ s.PubSub.Subscribe(context.Background(), stressReliefTopic, s.onStressLevelUpdate) + // start our monitor goroutine that periodically calls recalc + // and also reports that it's healthy + go func(s *StressRelief) { - tick := time.NewTicker(100 * time.Millisecond) + // tests can turn off stress level reporting entirely + if s.disableStressLevelReport { + return + } + + // only publish the stress level when it changes, or after maxTicksBetweenReports quiet ticks + const maxTicksBetweenReports = 30 + var ( + lastLevel uint = 0 + tickCounter = 0 + ) + + tick := s.Clock.NewTicker(100 * time.Millisecond) defer tick.Stop() for { select { - case <-tick.C: - s.Recalc() + case <-tick.Chan(): + currentLevel := s.Recalc() + + if lastLevel != currentLevel || tickCounter == maxTicksBetweenReports { + err := s.PubSub.Publish(context.Background(), stressReliefTopic, newStressReliefMessage(currentLevel, s.hostID).String()) + if err != nil { + s.Logger.Error().Logf("failed to publish stress level: %s", err) + } + + lastLevel = currentLevel + tickCounter = 0 + } + + tickCounter++ + + s.Health.Ready(StressReliefHealthKey, true) case <-s.Done: + s.Health.Unregister(StressReliefHealthKey) s.Logger.Debug().Logf("Stopping StressRelief system") return } @@ -109,7 +198,51 @@ func (s *StressRelief) Start() error { return nil } -func (s *StressRelief) UpdateFromConfig(cfg config.StressReliefConfig) error { +type stressReliefMessage struct { + peerID string + level uint +} + +func newStressReliefMessage(level uint, peerID string) *stressReliefMessage { + return &stressReliefMessage{level: level, peerID: peerID} +} + +func (msg *stressReliefMessage) String() string { + return msg.peerID + "|" + fmt.Sprint(msg.level) +} + +func unmarshalStressReliefMessage(msg string) (*stressReliefMessage, error) { + parts := strings.SplitN(msg, "|", 2) + // guard against malformed messages; a missing separator would otherwise panic below + if len(parts) != 2 || parts[0] == "" { + return nil, fmt.Errorf("malformed stress relief message: %q", msg) + } + + level, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, err + } + + return newStressReliefMessage(uint(level), parts[0]), nil +} + +func (s *StressRelief) onStressLevelUpdate(ctx context.Context, msg string) { + stressMsg, err := unmarshalStressReliefMessage(msg) + if err != nil { + s.Logger.Error().Logf("failed to unmarshal stress relief message: %s", err) + return + } + + s.lock.Lock() + defer s.lock.Unlock() + + s.stressLevels[stressMsg.peerID] = stressReport{ + key: stressMsg.peerID, + level: stressMsg.level, + timestamp: s.Clock.Now(), + } +} + +func (s *StressRelief) UpdateFromConfig(cfg config.StressReliefConfig) { s.lock.Lock() defer s.lock.Unlock() @@ -117,17 +250,6 @@ func (s *StressRelief) UpdateFromConfig(cfg config.StressReliefConfig) error { case "never", "": s.mode = Never case "monitor": - // If we're switching into monitor mode from some other state (which - // happens on startup), we will start up in stressed mode for a - // configurable time to try to make sure that we can handle the load - // before we start processing it in earnest. This is to help address the - // problem of trying to bring a new node into an already-overloaded - // cluster. If the time is 0 we won't do this at all. - if s.mode != Monitor && cfg.MinimumStartupDuration != 0 { - s.stressed = true - s.stayOnUntil = time.Now().Add(time.Duration(cfg.MinimumStartupDuration)) - s.Logger.Warn().WithField("stress_level", s.stressLevel).WithField("reason", "MinimumStartupDuration").Logf("StressRelief has been activated") - } s.mode = Monitor case "always": s.mode = Always @@ -158,8 +280,6 @@ func (s *StressRelief) UpdateFromConfig(cfg config.StressReliefConfig) error { // uint64. 
In the case where the sample rate is 1, this should sample every // value. s.upperBound = math.MaxUint64 / s.sampleRate - - return nil } func clamp(f float64, min float64, max float64) float64 { @@ -263,28 +383,38 @@ type StressReliefCalculation struct { // We want to calculate the stress from various values around the system. Each key value // can be reported as a key-value. // This should be called periodically. -func (s *StressRelief) Recalc() { +func (s *StressRelief) Recalc() uint { // we have multiple queues to watch, and for each we calculate a stress level for that queue, which is // 100 * the fraction of its capacity in use. Our overall stress level is the max of those values. // We track the config value that is under stress as "reason". - var level float64 + var maximumLevel float64 var reason string var formula string for _, c := range s.calcs { stress := 100 * s.algorithms[c.Algorithm](c.Numerator, c.Denominator) - if stress > level { - level = stress + if stress > maximumLevel { + maximumLevel = stress reason = c.Reason formula = fmt.Sprintf("%s(%v/%v)=%v", c.Algorithm, c.Numerator, c.Denominator, stress) } } - s.Logger.Debug().WithField("stress_level", level).WithField("stress_formula", s.formula).WithField("reason", reason).Logf("calculated stress level") + s.Logger.Debug().WithField("individual_stress_level", maximumLevel).WithField("stress_formula", s.formula).WithField("reason", reason).Logf("calculated stress level") + + s.RefineryMetrics.Gauge("individual_stress_level", float64(maximumLevel)) + localLevel := uint(maximumLevel) + + clusterStressLevel := s.clusterStressLevel(localLevel) + s.RefineryMetrics.Gauge("cluster_stress_level", clusterStressLevel) s.lock.Lock() defer s.lock.Unlock() - s.stressLevel = uint(level) + // The overall stress level is the max of the individual and cluster stress levels + // If a single node is under significant stress, it can activate stress relief mode + s.overallStressLevel = uint(math.Max(float64(clusterStressLevel), float64(localLevel))) + s.RefineryMetrics.Gauge("stress_level", s.overallStressLevel) + s.reason = reason s.formula = formula @@ -295,28 +425,79 @@ func (s *StressRelief) Recalc() { s.stressed = true case Monitor: // If it's off, should we activate it? - if !s.stressed && s.stressLevel >= s.activateLevel { + if !s.stressed && s.overallStressLevel >= s.activateLevel { s.stressed = true - s.Logger.Warn().WithField("stress_level", s.stressLevel).WithField("stress_formula", s.formula).WithField("reason", s.reason).Logf("StressRelief has been activated") + s.Logger.Warn().WithFields(map[string]interface{}{ + "individual_stress_level": localLevel, + "cluster_stress_level": clusterStressLevel, + "stress_level": s.overallStressLevel, + "stress_formula": s.formula, + "reason": s.reason, + }).Logf("StressRelief has been activated") } // We want make sure that stress relief is below the deactivate level // for a minimum time after the last time we said it should be, so // whenever it's above that value we push the time out. - if s.stressed && s.stressLevel >= s.deactivateLevel { - s.stayOnUntil = time.Now().Add(s.minDuration) + if s.stressed && s.overallStressLevel >= s.deactivateLevel { + s.stayOnUntil = s.Clock.Now().Add(s.minDuration) } // If it's on, should we deactivate it? 
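+		// deactivation is hysteretic: the level must drop below deactivateLevel
+		// and must have stayed there for minDuration (tracked via stayOnUntil)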
- if s.stressed && s.stressLevel < s.deactivateLevel && time.Now().After(s.stayOnUntil) { + if s.stressed && s.overallStressLevel < s.deactivateLevel && s.Clock.Now().After(s.stayOnUntil) { s.stressed = false - s.Logger.Warn().WithField("stress_level", s.stressLevel).Logf("StressRelief has been deactivated") + s.Logger.Warn().WithFields(map[string]interface{}{ + "individual_stress_level": localLevel, + "cluster_stress_level": clusterStressLevel, + "stress_level": s.overallStressLevel, + }).Logf("StressRelief has been deactivated") } } + + if s.stressed { + s.RefineryMetrics.Gauge("stress_relief_activated", 1) + } else { + s.RefineryMetrics.Gauge("stress_relief_activated", 0) + } + + return localLevel } -func (s *StressRelief) StressLevel() uint { - s.lock.RLock() - defer s.lock.RUnlock() - return s.stressLevel +// clusterStressLevel calculates the overall stress level for the cluster +// by using the stress levels reported by each node. +// It uses the quadratic mean (root mean square) of the stress levels reported +// by each node to calculate the overall stress level for the cluster. +func (s *StressRelief) clusterStressLevel(localLevel uint) uint { + // record our own level alongside the reports from our peers, then + // aggregate across all of the current reports + report := stressReport{ + key: s.hostID, + level: localLevel, + timestamp: s.Clock.Now(), + } + + s.lock.Lock() + defer s.lock.Unlock() + + s.stressLevels[report.key] = report + var total float64 + availablePeers := 0 + for _, report := range s.stressLevels { + if s.Clock.Since(report.timestamp) > peer.PeerEntryTimeout { + delete(s.stressLevels, report.key) + continue + } + // we don't want to include peers that are just starting up + if report.level == 0 { + continue + } + availablePeers++ + total += float64(report.level * report.level) + } + + if availablePeers == 0 { + availablePeers = 1 + } + + return uint(math.Sqrt(total / float64(availablePeers))) } // Stressed() indicates whether the system should act as if it's stressed. @@ -336,3 +517,29 @@ func (s *StressRelief) GetSampleRate(traceID string) (rate uint, keep bool, reas hash := wyhash.Hash([]byte(traceID), hashSeed) return uint(s.sampleRate), hash <= s.upperBound, "stress_relief/deterministic/" + s.reason } + +// ShouldSampleDeterministically returns true if the trace should be deterministically sampled. +// It uses the traceID to calculate a hash and then divides it by the maximum possible value +// to get a percentage. If the percentage is less than the deterministic fraction, it returns true. +func (s *StressRelief) ShouldSampleDeterministically(traceID string) bool { + samplePercentage := s.deterministicFraction() + hash := wyhash.Hash([]byte(traceID), hashSeed) + + return float64(hash)/float64(math.MaxUint64)*100 < float64(samplePercentage) +} + +// deterministicFraction returns the fraction of traces that should be deterministically sampled. +// It calculates the result by using the stress level as the fraction between the activation
+// +// for example: +// - if the stress level is 90 and the activation level is 80, the result will be 50 +// - meaning that 50% of the traces should be deterministic sampled +func (s *StressRelief) deterministicFraction() uint { + if s.overallStressLevel < s.activateLevel { + return 0 + } + + // round to the nearest integer + return uint(float64(s.overallStressLevel-s.activateLevel)/float64(100-s.activateLevel)*100 + 0.5) +} diff --git a/collect/stress_relief_test.go b/collect/stress_relief_test.go new file mode 100644 index 0000000000..45b57a9958 --- /dev/null +++ b/collect/stress_relief_test.go @@ -0,0 +1,293 @@ +package collect + +import ( + "context" + "fmt" + "math" + "math/rand" + "testing" + "time" + + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" + "github.com/honeycombio/refinery/internal/peer" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" + "github.com/jonboulle/clockwork" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestStressRelief_Monitor tests that the Stressed method returns the correct value +// for a given metrics data. +func TestStressRelief_Monitor(t *testing.T) { + clock := clockwork.NewFakeClock() + sr, stop := newStressRelief(t, clock, nil) + defer stop() + require.NoError(t, sr.Start()) + + sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge") + + sr.RefineryMetrics.Store("INCOMING_CAP", 1200) + + cfg := config.StressReliefConfig{ + Mode: "monitor", + ActivationLevel: 80, + DeactivationLevel: 50, + SamplingRate: 2, + MinimumActivationDuration: config.Duration(5 * time.Second), + } + + // On startup, the stress relief should not be active + sr.UpdateFromConfig(cfg) + require.False(t, sr.Stressed()) + + // Test 1 + sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 100) + require.Eventually(t, func() bool { + clock.Advance(time.Second * 6) + return !sr.Stressed() + }, 2*time.Second, 100*time.Millisecond, "stress relief should be false") + + // Test 2 + // Set activation level to 80 and the current stress level to be more than + // 80. Check that the Stressed method returns true + sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 1000) + require.Eventually(t, func() bool { + clock.Advance(time.Second * 6) + return sr.Stressed() + }, 2*time.Second, 100*time.Millisecond, "stress relief should be true") + + sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 100) + require.Eventually(t, func() bool { + clock.Advance(time.Second * 6) + return !sr.Stressed() + }, 2*time.Second, 100*time.Millisecond, "stress relief should be false") +} + +// TestStressRelief_Peer tests stress relief activation and deactivation +// based on the stress level of a peer. 
+func TestStressRelief_Peer(t *testing.T) {
+ clock := clockwork.NewFakeClock()
+ metric := &metrics.MockMetrics{}
+ metric.Start()
+ channel := &pubsub.LocalPubSub{
+ Metrics: metric,
+ }
+ require.NoError(t, channel.Start())
+
+ sr, stop := newStressRelief(t, clock, channel)
+ defer stop()
+ require.NoError(t, sr.Start())
+
+ sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge")
+
+ sr.RefineryMetrics.Store("INCOMING_CAP", 1200)
+
+ cfg := config.StressReliefConfig{
+ Mode: "monitor",
+ ActivationLevel: 80,
+ DeactivationLevel: 65,
+ SamplingRate: 2,
+ MinimumActivationDuration: config.Duration(5 * time.Second),
+ }
+
+ // On startup, the stress relief should not be active
+ sr.UpdateFromConfig(cfg)
+ require.False(t, sr.Stressed())
+
+ // activate stress relief in one refinery
+ sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 965)
+ require.Eventually(t, func() bool {
+ clock.Advance(time.Second * 1)
+ return sr.Stressed()
+ }, 2*time.Second, 100*time.Millisecond, "stress relief should be active")
+
+ require.Eventually(t, func() bool {
+ // pretend another refinery just started up
+ msg := stressReliefMessage{
+ level: 10,
+ peerID: "peer1",
+ }
+ require.NoError(t, channel.Publish(context.Background(), stressReliefTopic, msg.String()))
+ clock.Advance(time.Second * 1)
+ return sr.Stressed()
+ }, 2*time.Second, 100*time.Millisecond, "stress relief should remain activated")
+
+ // now the peer has reported a valid stress level,
+ // so it should be taken into account for the overall stress level
+ sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 5)
+ require.Eventually(t, func() bool {
+ msg := stressReliefMessage{
+ level: 10,
+ peerID: "peer1",
+ }
+ require.NoError(t, channel.Publish(context.Background(), stressReliefTopic, msg.String()))
+
+ clock.Advance(time.Second * 1)
+ return !sr.Stressed()
+ }, 2*time.Second, 100*time.Millisecond, "stress relief should be false")
+}
+
+func TestStressRelief_OverallStressLevel(t *testing.T) {
+ clock := clockwork.NewFakeClock()
+ sr, stop := newStressRelief(t, clock, nil)
+ defer stop()
+
+ // disable the automatic stress level recalculation
+ sr.disableStressLevelReport = true
+ sr.Start()
+
+ sr.RefineryMetrics.Register("collector_incoming_queue_length", "gauge")
+
+ sr.RefineryMetrics.Store("INCOMING_CAP", 1200)
+
+ cfg := config.StressReliefConfig{
+ Mode: "monitor",
+ ActivationLevel: 80,
+ DeactivationLevel: 65,
+ MinimumActivationDuration: config.Duration(5 * time.Second),
+ }
+
+ // On startup, the stress relief should not be active
+ sr.UpdateFromConfig(cfg)
+ require.False(t, sr.Stressed())
+
+ // Test 1
+ // when a single peer's individual stress level is above the activation level
+ // the overall stress level should be above the activation level
+ // and the stress relief should be active
+ sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 965)
+ clock.Advance(time.Second * 1)
+ sr.stressLevels = make(map[string]stressReport, 100)
+ for i := 0; i < 100; i++ {
+ key := fmt.Sprintf("peer%d", i)
+ sr.stressLevels[key] = stressReport{
+ key: key,
+ level: 10,
+ timestamp: sr.Clock.Now(),
+ }
+ }
+
+ localLevel := sr.Recalc()
+ require.Equal(t, localLevel, sr.overallStressLevel)
+ require.True(t, sr.stressed)
+
+ // Test 2
+ // when a single peer's individual stress level is below the activation level
+ // and the rest of the cluster is above the activation level
+ // the single peer should remain in stress relief mode
+ sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 10)
+ for i := 0; i < 100; i++ {
+ key := fmt.Sprintf("peer%d", i)
+ sr.stressLevels[key] = stressReport{
+ key: key,
+ level: 85,
+ timestamp: sr.Clock.Now(),
+ }
+ }
+ localLevel = sr.Recalc()
+ require.Greater(t, sr.overallStressLevel, localLevel)
+ require.True(t, sr.stressed)
+
+ // Test 3
+ // Only when both the single peer's individual stress level and the cluster stress
+ // level are below the activation level should the stress relief be deactivated.
+ sr.RefineryMetrics.Gauge("collector_incoming_queue_length", 10)
+ for i := 0; i < 100; i++ {
+ key := fmt.Sprintf("peer%d", i)
+ sr.stressLevels[key] = stressReport{
+ key: key,
+ level: 1,
+ timestamp: sr.Clock.Now(),
+ }
+ }
+ clock.Advance(sr.minDuration * 2)
+ localLevel = sr.Recalc()
+ assert.Equal(t, sr.overallStressLevel, localLevel)
+ assert.False(t, sr.stressed)
+}
+
+// TestStressRelief_ShouldSampleDeterministically tests that traces are
+// sampled deterministically by traceID.
+// The test generates 10000 traceIDs and checks that the sampling rate is
+// within 10% of the expected value.
+func TestStressRelief_ShouldSampleDeterministically(t *testing.T) {
+ traceCount := 10000
+ traceIDs := make([]string, 0, traceCount)
+ for i := 0; i < traceCount; i++ {
+ traceIDs = append(traceIDs, fmt.Sprintf("%016x%016x", rand.Int63(), rand.Int63()))
+ }
+
+ sr := &StressRelief{
+ overallStressLevel: 90,
+ activateLevel: 60,
+ }
+
+ var sampled int
+ var dropped int
+ var sampledTraceID string
+ var droppedTraceID string
+ for _, traceID := range traceIDs {
+ if sr.ShouldSampleDeterministically(traceID) {
+ sampled++
+ if sampledTraceID == "" {
+ sampledTraceID = traceID
+ }
+ } else {
+ if droppedTraceID == "" {
+ droppedTraceID = traceID
+ }
+ dropped++
+ }
+ }
+
+ difference := float64(sampled)/float64(traceCount)*100 - float64(sr.deterministicFraction())
+ require.LessOrEqual(t, math.Floor(math.Abs(float64(difference))), float64(10), sampled)
+
+ // make sure that the same traceID always gets the same result
+ require.True(t, sr.ShouldSampleDeterministically(sampledTraceID))
+ require.False(t, sr.ShouldSampleDeterministically(droppedTraceID))
+}
+
+func newStressRelief(t *testing.T, clock clockwork.Clock, channel pubsub.PubSub) (*StressRelief, func()) {
+ // Create a new StressRelief object
+ metric := &metrics.MockMetrics{}
+ metric.Start()
+
+ if clock == nil {
+ clock = clockwork.NewRealClock()
+ }
+
+ if channel == nil {
+ channel = &pubsub.LocalPubSub{
+ Metrics: metric,
+ }
+ }
+ require.NoError(t, channel.Start())
+ logger := &logger.NullLogger{}
+ healthReporter := &health.Health{
+ Clock: clock,
+ Metrics: metric,
+ Logger: logger,
+ }
+ require.NoError(t, healthReporter.Start())
+
+ peer := &peer.MockPeers{}
+ require.NoError(t, peer.Start())
+
+ sr := &StressRelief{
+ Clock: clock,
+ Logger: logger,
+ RefineryMetrics: metric,
+ PubSub: channel,
+ Health: healthReporter,
+ Peer: peer,
+ }
+
+ return sr, func() {
+ require.NoError(t, healthReporter.Stop())
+ require.NoError(t, channel.Stop())
+ }
+}
diff --git a/config.md b/config.md
index 30c3dd8e88..a9ff45a694 100644
--- a/config.md
+++ b/config.md
@@ -1,7 +1,7 @@
 # Honeycomb Refinery Configuration Documentation

 This is the documentation for the configuration file for Honeycomb's Refinery.
-It was automatically generated on 2023-12-04 at 22:34:13 UTC.
+It was automatically generated on 2024-09-05 at 17:40:33 UTC.
## The Config file

@@ -37,6 +37,7 @@ The remainder of this document describes the sections within the file and the fi
 - [Prometheus Metrics](#prometheus-metrics)
 - [Legacy Metrics](#legacy-metrics)
 - [OpenTelemetry Metrics](#opentelemetry-metrics)
+- [OpenTelemetry Tracing](#opentelemetry-tracing)
 - [Peer Management](#peer-management)
 - [Redis Peer Management](#redis-peer-management)
 - [Collection Settings](#collection-settings)
@@ -90,7 +91,8 @@ ConfigReloadInterval is the average interval between attempts at reloading the c
 Refinery will attempt to read its configuration and check for changes at approximately this interval.
 This time is varied by a random amount up to 10% to avoid all instances refreshing together.
 In installations where configuration changes are handled by restarting Refinery, which is often the case when using Kubernetes, disable this feature with a value of `0s`.
-If the config file is being loaded from a URL, it may be wise to increase this value to avoid overloading the file server.
+As of Refinery v2.7, news of a configuration change is immediately propagated to all peers, and they will attempt to reload their configurations.
+Note that external factors (for example, Kubernetes ConfigMaps) may cause delays in propagating configuration changes.

 - Not eligible for live reload.
 - Type: `duration`
@@ -169,10 +171,41 @@ AcceptOnlyListedKeys is a boolean flag that causes events arriving with API keys
 If `true`, then only traffic using the keys listed in `ReceiveKeys` is accepted.
 Events arriving with API keys not in the `ReceiveKeys` list will be rejected with an HTTP `401` error.
 If `false`, then all traffic is accepted and `ReceiveKeys` is ignored.
+This setting is applied **before** the `SendKey` and `SendKeyMode` settings.

 - Eligible for live reload.
 - Type: `bool`

+### `SendKey`
+
+SendKey is an optional Honeycomb API key that Refinery can use to send data to Honeycomb, depending on configuration.
+
+If `SendKey` is set to a valid Honeycomb key, then Refinery can use the listed key to send data.
+The exact behavior depends on the value of `SendKeyMode`.
+
+- Eligible for live reload.
+- Type: `string`
+- Example: `SetThisToAHoneycombKey`
+
+### `SendKeyMode`
+
+SendKeyMode controls how SendKey is used to replace or augment API keys used in incoming telemetry.
+
+If `AcceptOnlyListedKeys` is `true`, then `SendKey` will only be used for events with keys listed in `ReceiveKeys`.
+`none` uses the incoming key for all telemetry (default).
+`all` overwrites all keys, even missing ones, with `SendKey`.
+`nonblank` overwrites all supplied keys but will not inject `SendKey` if the incoming key is blank.
+`listedonly` overwrites only the keys listed in `ReceiveKeys`.
+`unlisted` uses the `SendKey` for all events *except* those with keys listed in `ReceiveKeys`, which use their original keys.
+`missingonly` uses the SendKey only to inject keys into events with blank keys.
+All other events use their original keys.
+
+- Eligible for live reload.
+- Type: `string`
+- Default: `none`
+- Options: `none`, `all`, `nonblank`, `listedonly`, `unlisted`, `missingonly`
+
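The mode table above is dense, so a sketch may help. The following Go fragment is illustrative only: `resolveKey` and its parameters are invented for this example rather than taken from Refinery's implementation, and it assumes the `AcceptOnlyListedKeys` filtering has already happened upstream.

```go
package main

import "fmt"

// resolveKey is an illustrative model of the SendKeyMode table above. Given
// an incoming API key and whether that key appears in ReceiveKeys, it
// decides whether SendKey replaces it.
func resolveKey(incoming, sendKey, mode string, listed bool) string {
	switch mode {
	case "all":
		return sendKey // overwrite every key, even blank ones
	case "nonblank":
		if incoming != "" {
			return sendKey // overwrite supplied keys, never inject
		}
	case "listedonly":
		if listed {
			return sendKey // overwrite only keys listed in ReceiveKeys
		}
	case "unlisted":
		if !listed {
			return sendKey // listed keys keep their original value
		}
	case "missingonly":
		if incoming == "" {
			return sendKey // inject only when the incoming key is blank
		}
	}
	// "none" and every fall-through case keep the incoming key
	return incoming
}

func main() {
	fmt.Println(resolveKey("", "SEND", "nonblank", false))    // "" - nonblank never injects
	fmt.Println(resolveKey("", "SEND", "missingonly", false)) // "SEND" - missingonly only injects
}
```

## Refinery Telemetry

`RefineryTelemetry` contains configuration information for the telemetry that Refinery uses to record its own operation.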
@@ -183,8 +216,8 @@ AddRuleReasonToTrace controls whether to decorate traces with Refinery rule eval When enabled, this setting causes traces that are sent to Honeycomb to include the field `meta.refinery.reason`. This field contains text indicating which rule was evaluated that caused the trace to be included. This setting also includes the field `meta.refinery.send_reason`, which contains the reason that the trace was sent. -Possible values of this field are `trace_send_got_root`, which means that the root span arrived; `trace_send_expired`, which means that TraceTimeout was reached; `trace_send_ejected_full`, which means that the trace cache was full; and `trace_send_ejected_memsize`, which means that refinery was out of memory. -These names are also the names of metrics that refinery tracks. +Possible values of this field are `trace_send_got_root`, which means that the root span arrived; `trace_send_expired`, which means that `TraceTimeout` was reached; `trace_send_ejected_full`, which means that the trace cache was full; and `trace_send_ejected_memsize`, which means that Refinery was out of memory. +These names are also the names of metrics that Refinery tracks. We recommend enabling this setting whenever a rules-based sampler is in use, as it is useful for debugging and understanding the behavior of your Refinery installation. - Eligible for live reload. @@ -231,12 +264,12 @@ If `true`, then Refinery will add the following tag to all traces: - `meta.refin `Traces` contains configuration for how traces are managed. ### `SendDelay` -SendDelay is the duration to wait before sending a trace. +SendDelay is the duration to wait after the root span arrives before sending a trace. -This setting is a short timer that is triggered when a trace is complete. +This setting is a short timer that is triggered when a trace is marked complete by the arrival of the root span. Refinery waits for this duration before sending the trace. -The reason for this setting is to allow for small network delays or clock jitters to elapse and any final spans to arrive before sending the trace. -Set to "0" for immediate sending. +This setting exists to allow for asynchronous spans and small network delays to elapse before sending the trace. +`SendDelay` is not applied if the `TraceTimeout` expires or the `SpanLimit` is reached. - Eligible for live reload. - Type: `duration` @@ -258,7 +291,11 @@ TraceTimeout is the duration to wait before making the trace decision on an inco A long timer; it represents the outside boundary of how long to wait before making the trace decision about an incomplete trace. Normally trace decisions (send or drop) are made when the root span arrives. -Sometimes the root span never arrives (for example, due to crashes) and this timer ensures sending a trace even without having received the root span. +Sometimes the root span never arrives (for example, due to crashes). +Once this timer fires, Refinery will make a trace decision based on the spans that have arrived so far. +This ensures sending a trace even when the root span never arrives. +After the trace decision has been made, Refinery retains a record of that decision for a period of time. +When additional spans (including the root span) arrive, they will be kept or dropped based on the original decision. If particularly long-lived traces are present in your data, then you should increase this timer. Note that this increase will also increase the memory requirements for Refinery. 
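Read together, `SendDelay`, `TraceTimeout`, and the new `SpanLimit` (documented just below) give three separate triggers for a trace decision. The sketch that follows is a hedged model of that interaction, not Refinery's actual collector code: the function name, its parameters, and the `span_limit` label are illustrative, while `trace_send_got_root` and `trace_send_expired` are the send reasons named above.

```go
package main

import (
	"fmt"
	"time"
)

// decisionReason models the three triggers described in this section:
// reaching SpanLimit makes the trace eligible immediately (SendDelay is
// not applied), TraceTimeout fires when the root span never arrives, and
// an arrived root span starts the short SendDelay timer so that late
// asynchronous spans can still be collected.
func decisionReason(spans, spanLimit int, age, traceTimeout time.Duration, rootArrived bool, sinceRoot, sendDelay time.Duration) (string, bool) {
	switch {
	case spanLimit > 0 && spans >= spanLimit:
		return "span_limit", true
	case age >= traceTimeout:
		return "trace_send_expired", true
	case rootArrived && sinceRoot >= sendDelay:
		return "trace_send_got_root", true
	default:
		return "", false // keep collecting spans
	}
}

func main() {
	reason, ready := decisionReason(12, 0, 3*time.Second, 60*time.Second, true, 2100*time.Millisecond, 2*time.Second)
	fmt.Println(reason, ready) // trace_send_got_root true
}
```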
@@ -266,6 +303,17 @@ Note that this increase will also increase the memory requirements for Refinery. - Type: `duration` - Default: `60s` +### `SpanLimit` + +SpanLimit is the number of spans after which a trace becomes eligible for a trace decision. + +This setting helps to keep memory usage under control. +If a trace has more than this set number of spans, then it becomes eligible for a trace decision. +It's most helpful in a situation where a sudden burst of many spans in a large trace hits Refinery all at once, causing memory usage to spike and possibly crashing Refinery. + +- Eligible for live reload. +- Type: `int` + ### `MaxBatchSize` MaxBatchSize is the maximum number of events to be included in each batch for sending. @@ -306,7 +354,7 @@ If this value is not specified, then the debug service runs on the first open po ### `QueryAuthToken` -QueryAuthToken is the token that must be specified to access the `/query` endpoint. +QueryAuthToken is the token that must be specified to access the `/query` endpoint. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. This token must be specified with the header "X-Honeycomb-Refinery-Query" in order for a `/query` request to succeed. These `/query` requests are intended for debugging Refinery during setup and are not typically needed in normal operation. @@ -389,7 +437,7 @@ Refinery's internal logs will be sent to this host using the standard Honeycomb ### `APIKey` -APIKey is the API key used to send Refinery's logs to Honeycomb. +APIKey is the API key used to send Refinery's logs to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery logs. @@ -518,7 +566,7 @@ Refinery's internal metrics will be sent to this host using the standard Honeyco ### `APIKey` -APIKey is the API key used by Refinery to send its metrics to Honeycomb. +APIKey is the API key used by Refinery to send its metrics to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery metrics. @@ -575,7 +623,7 @@ Refinery's internal metrics will be sent to the `/v1/metrics` endpoint on this h ### `APIKey` -APIKey is the API key used to send Honeycomb metrics via OpenTelemetry. +APIKey is the API key used to send Honeycomb metrics via OpenTelemetry. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery metrics. If this is blank, then Refinery will not set the Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must be set to a valid OpenTelemetry endpoint. @@ -617,6 +665,61 @@ In rare circumstances, compression costs may outweigh the benefits, in which cas - Default: `gzip` - Options: `none`, `gzip` +## OpenTelemetry Tracing + +`OTelTracing` contains configuration for Refinery's own tracing. +### `Enabled` + +Enabled controls whether to send Refinery's own OpenTelemetry traces. + +The setting specifies if Refinery sends traces. + +- Not eligible for live reload. +- Type: `bool` + +### `APIHost` + +APIHost is the URL of the OpenTelemetry API to which traces will be sent. 
+
+Refinery's internal traces will be sent to the `/v1/traces` endpoint on this host.
+
+- Not eligible for live reload.
+- Type: `url`
+- Default: `https://api.honeycomb.io`
+
+### `APIKey`
+
+APIKey is the API key used to send Refinery's traces to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file.
+
+It is recommended that you create a separate team and key for Refinery telemetry.
+If this value is blank, then Refinery will not set the Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must be set to a valid OpenTelemetry endpoint.
+
+- Not eligible for live reload.
+- Type: `string`
+- Example: `SetThisToAHoneycombKey`
+- Environment variable: `REFINERY_HONEYCOMB_TRACES_API_KEY, REFINERY_HONEYCOMB_API_KEY`
+
+### `Dataset`
+
+Dataset is the Honeycomb dataset to which Refinery sends its OpenTelemetry traces.
+
+Only used if `APIKey` is specified.
+
+- Not eligible for live reload.
+- Type: `string`
+- Default: `Refinery Traces`
+
+### `SampleRate`
+
+SampleRate is the rate at which Refinery samples its own traces.
+
+This is the Honeycomb sample rate used to sample traces sent by Refinery.
+Since each incoming span generates multiple outgoing spans, a minimum sample rate of `100` is strongly advised.
+
+- Eligible for live reload.
+- Type: `int`
+- Default: `100`
+
 ## Peer Management

 `PeerManagement` controls how the Refinery cluster communicates between peers.
@@ -626,7 +729,10 @@ Type is the type of peer management to use.
 Peer management is the mechanism by which Refinery locates its peers.
 `file` means that Refinery gets its peer list from the Peers list in this config file.
-`redis` means that Refinery self-registers with a Redis instance and gets its peer list from there.
+It also prevents Refinery from using a publish/subscribe mechanism to propagate peer lists, stress levels, and configuration changes.
+`redis` means that Refinery uses a Publish/Subscribe mechanism, implemented on Redis, to propagate peer lists, stress levels, and notification of configuration changes much more quickly than the legacy mechanism.
+The recommended setting is `redis`, especially for new installations.
+If `redis` is specified, fields in `RedisPeerManagement` must also be set.

 - Not eligible for live reload.
 - Type: `string`
@@ -672,16 +778,16 @@ If this value is specified, then Refinery will use the first IPV6 unicast addres
 Peers is the list of peers to use when Type is "file", excluding self.
 This list is ignored when Type is "redis".
-The format is a list of strings of the form "host:port".
+The format is a list of strings of the form "scheme://host:port".

 - Not eligible for live reload.
 - Type: `stringarray`
-- Example: `192.168.1.11:8081,192.168.1.12:8081`
+- Example: `http://192.168.1.11:8081,http://192.168.1.12:8081`

 ## Redis Peer Management

 `RedisPeerManagement` controls how the Refinery cluster communicates between peers when using Redis.
-Only applies when `PeerManagement.Type` is "redis".
+Does not apply when `PeerManagement.Type` is "file".

 ### `Host`

@@ -694,9 +800,21 @@ Must be in the form `host:port`.
 - Example: `localhost:6379`
 - Environment variable: `REFINERY_REDIS_HOST`

+### `ClusterHosts`
+
+ClusterHosts is a list of host and port pairs for the instances in a Redis Cluster, and is used for managing peer cluster membership.
+
+This configuration enables Refinery to connect to a Redis deployment set up in Cluster Mode.
+Each entry in the list should follow the format `host:port`. +If `ClusterHosts` is specified, the `Host` setting will be ignored. + +- Not eligible for live reload. +- Type: `stringarray` +- Example: `- localhost:6379` + ### `Username` -Username is the username used to connect to Redis for peer cluster membership management. +Username is the username used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -706,7 +824,7 @@ Many Redis installations do not use this field. ### `Password` -Password is the password used to connect to Redis for peer cluster membership management. +Password is the password used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -716,7 +834,7 @@ Many Redis installations do not use this field. ### `AuthCode` -AuthCode is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. +AuthCode is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -724,29 +842,6 @@ Many Redis installations do not use this field. - Type: `string` - Environment variable: `REFINERY_REDIS_AUTH_CODE` -### `Prefix` - -Prefix is a string used as a prefix for the keys in Redis while storing the peer membership. - -It might be useful to override this in any situation where multiple Refinery clusters or multiple applications want to share a single Redis instance. -It may not be blank. - -- Not eligible for live reload. -- Type: `string` -- Default: `refinery` -- Example: `customPrefix` - -### `Database` - -Database is the database number to use for the Redis instance storing the peer membership. - -An integer from 0-15 indicating the database number to use for the Redis instance storing the peer membership. -It might be useful to set this in any situation where multiple Refinery clusters or multiple applications want to share a single Redis instance. - -- Not eligible for live reload. -- Type: `int` -- Example: `1` - ### `UseTLS` UseTLS enables TLS when connecting to Redis for peer cluster membership management. @@ -785,8 +880,10 @@ This is not recommended for production use since a burst of traffic could cause CacheCapacity is the number of traces to keep in the cache's circular buffer. -The collection cache is used to collect all spans into a trace as well as remember the sampling decision for any spans that might come in after the trace has been marked "complete" (either by timing out or seeing the root span). -The number of traces in the cache should be many multiples (100x to 1000x) of the total number of concurrently active traces (trace throughput * trace duration). +The collection cache is used to collect all active spans into traces. +It is organized as a circular buffer. +When the buffer wraps around, Refinery will try a few times to find an empty slot; if it fails, it starts ejecting traces from the cache earlier than would otherwise be necessary. 
+Ideally, the size of the cache should be many multiples (100x to 1000x) of the total number of concurrently active traces (average trace throughput * average trace duration).

 - Eligible for live reload.
 - Type: `int`
@@ -862,6 +959,28 @@ If set, `Collections.AvailableMemory` must not be defined.

 - Eligible for live reload.
 - Type: `memorysize`

+### `DisableRedistribution`
+
+DisableRedistribution controls whether to transmit traces in cache to remaining peers during a cluster scaling event.
+
+If `true`, Refinery will NOT forward live traces in its cache to the rest of the peers when peers join or leave the cluster.
+Disabling this behavior can help to prevent disruptive bursts of network traffic when large traces with a long `TraceTimeout` are redistributed.
+
+- Eligible for live reload.
+- Type: `bool`
+
+### `ShutdownDelay`
+
+ShutdownDelay controls the maximum time Refinery can use while draining traces at shutdown.
+
+This setting controls the duration that Refinery expects to have to drain in-process traces before shutting down an instance.
+When asked to shut down gracefully, Refinery stops accepting new spans immediately and drains the remaining traces by sending them to remaining peers.
+This value should be set to a bit less than the normal timeout period for shutting down without forcibly terminating the process.
+
+- Eligible for live reload.
+- Type: `duration`
+- Default: `15s`
+
 ## Buffer Sizes

 `BufferSizes` contains the settings that are relevant to the sizes of communications buffers.
@@ -966,6 +1085,7 @@ If `false`, then the gRPC server is not started and no gRPC traffic is accepted.

 - Not eligible for live reload.
 - Type: `bool`
+- Default: `true`

 ### `ListenAddr`

@@ -1045,7 +1165,7 @@ The size is expressed in bytes.

 - Not eligible for live reload.
 - Type: `memorysize`
-- Default: `5MB`
+- Default: `15MB`

 ### `MaxRecvMsgSize`

@@ -1056,7 +1176,7 @@ The size is expressed in bytes.

 - Not eligible for live reload.
 - Type: `memorysize`
-- Default: `5MB`
+- Default: `15MB`

 ## Sample Cache

@@ -1168,16 +1288,3 @@ This setting helps to prevent oscillations.
 - Type: `duration`
 - Default: `10s`

-### `MinimumStartupDuration`
-
-MinimumStartupDuration is the minimum time that Stress Relief will stay enabled.
-
-This setting is used when switching into Monitor mode.
-When Stress Relief is enabled, it will start up in stressed mode for at least this set duration of time to try to make sure that Refinery can handle the load before it begins processing it in earnest.
-This is to help address the problem of trying to bring a new node into an already-overloaded cluster.
-If this duration is `0`, then Refinery will not start in stressed mode, which will provide faster startup at the possible cost of startup instability.
-
-- Eligible for live reload.
-- Type: `duration`
-- Default: `3s`
-
diff --git a/config/cmdenv.go b/config/cmdenv.go
index df45f32ec8..18cbd01f73 100644
--- a/config/cmdenv.go
+++ b/config/cmdenv.go
@@ -26,21 +26,23 @@ import (
 // that this system uses reflection to establish the relationship between the
 // config struct and the command line options.
type CmdEnv struct { - ConfigLocation string `short:"c" long:"config" env:"REFINERY_CONFIG" default:"/etc/refinery/refinery.yaml" description:"config file or URL to load"` - RulesLocation string `short:"r" long:"rules_config" env:"REFINERY_RULES_CONFIG" default:"/etc/refinery/rules.yaml" description:"config file or URL to load"` + ConfigLocations []string `short:"c" long:"config" env:"REFINERY_CONFIG" env-delim:"," default:"/etc/refinery/refinery.yaml" description:"config file or URL to load; can be specified more than once"` + RulesLocations []string `short:"r" long:"rules_config" env:"REFINERY_RULES_CONFIG" env-delim:"," default:"/etc/refinery/rules.yaml" description:"config file or URL to load; can be specified more than once"` HTTPListenAddr string `long:"http-listen-address" env:"REFINERY_HTTP_LISTEN_ADDRESS" description:"HTTP listen address for incoming event traffic"` PeerListenAddr string `long:"peer-listen-address" env:"REFINERY_PEER_LISTEN_ADDRESS" description:"Peer listen address for communication between Refinery instances"` GRPCListenAddr string `long:"grpc-listen-address" env:"REFINERY_GRPC_LISTEN_ADDRESS" description:"gRPC listen address for OTLP traffic"` RedisHost string `long:"redis-host" env:"REFINERY_REDIS_HOST" description:"Redis host address"` - RedisUsername string `long:"redis-username" env:"REFINERY_REDIS_USERNAME" description:"Redis username"` - RedisPassword string `long:"redis-password" env:"REFINERY_REDIS_PASSWORD" description:"Redis password"` - RedisAuthCode string `long:"redis-auth-code" env:"REFINERY_REDIS_AUTH_CODE" description:"Redis AUTH code"` + RedisClusterHosts []string `long:"redis-cluster-hosts" env:"REFINERY_REDIS_CLUSTER_HOSTS" env-delim:"," description:"Redis cluster host addresses"` + RedisUsername string `long:"redis-username" env:"REFINERY_REDIS_USERNAME" description:"Redis username. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + RedisPassword string `long:"redis-password" env:"REFINERY_REDIS_PASSWORD" description:"Redis password. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + RedisAuthCode string `long:"redis-auth-code" env:"REFINERY_REDIS_AUTH_CODE" description:"Redis AUTH code. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` HoneycombAPI string `long:"honeycomb-api" env:"REFINERY_HONEYCOMB_API" description:"Honeycomb API URL"` - HoneycombAPIKey string `long:"honeycomb-api-key" env:"REFINERY_HONEYCOMB_API_KEY" description:"Honeycomb API key (for logger and metrics)"` - HoneycombLoggerAPIKey string `long:"logger-api-key" env:"REFINERY_HONEYCOMB_LOGGER_API_KEY" description:"Honeycomb logger API key"` - LegacyMetricsAPIKey string `long:"legacy-metrics-api-key" env:"REFINERY_HONEYCOMB_METRICS_API_KEY" description:"API key for legacy Honeycomb metrics"` - OTelMetricsAPIKey string `long:"otel-metrics-api-key" env:"REFINERY_OTEL_METRICS_API_KEY" description:"API key for OTel metrics if being sent to Honeycomb"` - QueryAuthToken string `long:"query-auth-token" env:"REFINERY_QUERY_AUTH_TOKEN" description:"Token for debug/management queries"` + HoneycombAPIKey string `long:"honeycomb-api-key" env:"REFINERY_HONEYCOMB_API_KEY" description:"Honeycomb API key (for logger and metrics). 
Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + HoneycombLoggerAPIKey string `long:"logger-api-key" env:"REFINERY_HONEYCOMB_LOGGER_API_KEY" description:"Honeycomb logger API key. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + LegacyMetricsAPIKey string `long:"legacy-metrics-api-key" env:"REFINERY_HONEYCOMB_METRICS_API_KEY" description:"API key for legacy Honeycomb metrics. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + OTelMetricsAPIKey string `long:"otel-metrics-api-key" env:"REFINERY_OTEL_METRICS_API_KEY" description:"API key for OTel metrics if being sent to Honeycomb. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + OTelTracesAPIKey string `long:"otel-traces-api-key" env:"REFINERY_OTEL_TRACES_API_KEY" description:"API key for OTel traces if being sent to Honeycomb. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` + QueryAuthToken string `long:"query-auth-token" env:"REFINERY_QUERY_AUTH_TOKEN" description:"Token for debug/management queries. Setting this value via a flag may expose credentials - it is recommended to use the env var or a configuration file."` AvailableMemory MemorySize `long:"available-memory" env:"REFINERY_AVAILABLE_MEMORY" description:"The maximum memory available for Refinery to use (ex: 4GiB)."` Debug bool `short:"d" long:"debug" description:"Runs debug service (on the first open port between localhost:6060 and :6069 by default)"` Version bool `short:"v" long:"version" description:"Print version number and exit"` @@ -77,6 +79,14 @@ func (c *CmdEnv) GetField(name string) reflect.Value { return reflect.ValueOf(c).Elem().FieldByName(name) } +func (c *CmdEnv) GetDelimiter(name string) string { + field, ok := reflect.TypeOf(c).Elem().FieldByName(name) + if !ok { + return "" + } + return field.Tag.Get("env-delim") +} + // ApplyTags uses reflection to apply the values from the CmdEnv struct to the // given struct. 
Any field in the struct that wants to be set from the command // line must have a `cmdenv` tag on it that names one or more fields in the @@ -89,6 +99,7 @@ func (c *CmdEnv) ApplyTags(s reflect.Value) error { type getFielder interface { GetField(name string) reflect.Value + GetDelimiter(name string) string } // applyCmdEnvTags is a helper function that applies the values from the given @@ -122,6 +133,30 @@ func applyCmdEnvTags(s reflect.Value, fielder getFielder) error { return fmt.Errorf("programming error -- types don't match for field: %s (%v and %v)", fieldType.Name, fieldType.Type, value.Type()) } + + if value.Kind() == reflect.Slice { + delimiter := fielder.GetDelimiter(tag) + if delimiter == "" { + return fmt.Errorf("programming error -- missing delimiter for slice field: %s", fieldType.Name) + } + + rawValue, ok := value.Index(0).Interface().(string) + if !ok { + return fmt.Errorf("programming error -- slice field must be a string: %s", fieldType.Name) + } + + // split the value on the delimiter + values := strings.Split(rawValue, delimiter) + // create a new slice of the same type as the field + slice := reflect.MakeSlice(field.Type(), len(values), len(values)) + // iterate over the values and set them + for i, v := range values { + slice.Index(i).SetString(v) + } + // set the field + field.Set(slice) + break + } // now we can set it field.Set(value) // and we're done with this field diff --git a/config/cmdenv_test.go b/config/cmdenv_test.go index e137183703..a68054e3d0 100644 --- a/config/cmdenv_test.go +++ b/config/cmdenv_test.go @@ -6,10 +6,11 @@ import ( ) type TestFielder struct { - S string - S2 string - I int - F float64 + S string + S2 string + I int + F float64 + STRS []string `env-delim:","` } // implement getFielder @@ -17,11 +18,20 @@ func (t *TestFielder) GetField(name string) reflect.Value { return reflect.ValueOf(t).Elem().FieldByName(name) } +func (t *TestFielder) GetDelimiter(name string) string { + field, ok := reflect.TypeOf(t).Elem().FieldByName(name) + if !ok { + return "" + } + return field.Tag.Get("env-delim") +} + type TestConfig struct { - St string `cmdenv:"S"` - It int `cmdenv:"I"` - Fl float64 `cmdenv:"F"` - No string + St string `cmdenv:"S"` + It int `cmdenv:"I"` + Fl float64 `cmdenv:"F"` + No string + Strs []string `cmdenv:"STRS"` } type FallbackConfig struct { @@ -43,11 +53,11 @@ func TestApplyCmdEnvTags(t *testing.T) { want any wantErr bool }{ - {"normal", &TestFielder{"foo", "bar", 1, 2.3}, &TestConfig{}, &TestConfig{"foo", 1, 2.3, ""}, false}, - {"bad", &TestFielder{"foo", "bar", 1, 2.3}, &BadTestConfig1{}, &BadTestConfig1{}, true}, - {"type mismatch", &TestFielder{"foo", "bar", 1, 2.3}, &BadTestConfig2{17}, &BadTestConfig2{17}, true}, - {"fallback1", &TestFielder{"foo", "bar", 1, 2.3}, &FallbackConfig{}, &FallbackConfig{"foo"}, false}, - {"fallback2", &TestFielder{"", "bar", 1, 2.3}, &FallbackConfig{}, &FallbackConfig{"bar"}, false}, + {"normal", &TestFielder{"foo", "bar", 1, 2.3, []string{"test,test1"}}, &TestConfig{}, &TestConfig{"foo", 1, 2.3, "", []string{"test", "test1"}}, false}, + {"bad", &TestFielder{"foo", "bar", 1, 2.3, []string{}}, &BadTestConfig1{}, &BadTestConfig1{}, true}, + {"type mismatch", &TestFielder{"foo", "bar", 1, 2.3, []string{}}, &BadTestConfig2{17}, &BadTestConfig2{17}, true}, + {"fallback1", &TestFielder{"foo", "bar", 1, 2.3, []string{}}, &FallbackConfig{}, &FallbackConfig{"foo"}, false}, + {"fallback2", &TestFielder{"", "bar", 1, 2.3, []string{}}, &FallbackConfig{}, &FallbackConfig{"bar"}, false}, } for _, tt := range 
tests { t.Run(tt.name, func(t *testing.T) { diff --git a/config/config.go b/config/config.go index 8419256932..1fa599829a 100644 --- a/config/config.go +++ b/config/config.go @@ -10,26 +10,32 @@ const ( // Config defines the interface the rest of the code uses to get items from the // config. There are different implementations of the config using different -// backends to store the config. FileConfig is the default and uses a -// TOML-formatted config file. RedisPeerFileConfig uses a redis cluster to store -// the list of peers and then falls back to a filesystem config file for all -// other config elements. +// backends to store the config. type Config interface { // RegisterReloadCallback takes a name and a function that will be called - // when the configuration is reloaded. This will happen infrequently. If + // whenever the configuration is reloaded. This will happen infrequently. If // consumers of configuration set config values on startup, they should // check their values haven't changed and re-start anything that needs - // restarting with the new values. - RegisterReloadCallback(callback func()) + // restarting with the new values. The callback is passed the two hashes + // for config and rules so that the caller can decide if they need to + // reconfigure anything. + RegisterReloadCallback(callback ConfigReloadCallback) + + // Reload forces the config to attempt to reload its values. If the config + // checksum has changed, the reload callbacks will be called. + Reload() + + // GetHashes returns the current config and rule hashes + GetHashes() (cfg string, rules string) // GetListenAddr returns the address and port on which to listen for // incoming events - GetListenAddr() (string, error) + GetListenAddr() string // GetPeerListenAddr returns the address and port on which to listen for // peer traffic - GetPeerListenAddr() (string, error) + GetPeerListenAddr() string // GetHTTPIdleTimeout returns the idle timeout for refinery's HTTP server GetHTTPIdleTimeout() time.Duration @@ -43,90 +49,52 @@ type Config interface { // GetGRPCListenAddr returns the address and port on which to listen for // incoming events over gRPC - GetGRPCListenAddr() (string, error) + GetGRPCListenAddr() string // Returns the entire GRPC config block GetGRPCConfig() GRPCServerParameters - // IsAPIKeyValid checks if the given API key is valid according to the rules - IsAPIKeyValid(key string) bool + // GetAccessKeyConfig returns the access key configuration + GetAccessKeyConfig() AccessKeyConfig // GetPeers returns a list of other servers participating in this proxy cluster - GetPeers() ([]string, error) - - GetPeerManagementType() (string, error) - - // GetRedisHost returns the address of a Redis instance to use for peer - // management. - GetRedisHost() (string, error) - - // GetRedisUsername returns the username of a Redis instance to use for peer - // management. - GetRedisUsername() (string, error) - - // GetRedisPassword returns the password of a Redis instance to use for peer - // management. - GetRedisPassword() (string, error) - - // GetRedisAuthCode returns the AUTH string to use for connecting to a Redis - // instance to use for peer management - GetRedisAuthCode() (string, error) + GetPeers() []string - // GetRedisPrefix returns the prefix string used in the keys for peer - // management. - GetRedisPrefix() string + GetPeerManagementType() string - // GetRedisDatabase returns the ID of the Redis database to use for peer management. 
- GetRedisDatabase() int - - // GetUseTLS returns true when TLS must be enabled to dial the Redis instance to - // use for peer management. - GetUseTLS() (bool, error) - - // UseTLSInsecure returns true when certificate checks are disabled - GetUseTLSInsecure() (bool, error) + GetRedisPeerManagement() RedisPeerManagementConfig // GetHoneycombAPI returns the base URL (protocol, hostname, and port) of // the upstream Honeycomb API server - GetHoneycombAPI() (string, error) - - // GetSendDelay returns the number of seconds to pause after a trace is - // complete before sending it, to allow stragglers to arrive - GetSendDelay() (time.Duration, error) - - // GetBatchTimeout returns how often to send off batches in seconds - GetBatchTimeout() time.Duration + GetHoneycombAPI() string - // GetTraceTimeout is how long to wait before sending a trace even if it's - // not complete. This should be longer than the longest expected trace - // duration. - GetTraceTimeout() (time.Duration, error) - - // GetMaxBatchSize is the number of events to be included in the batch for sending - GetMaxBatchSize() uint + GetTracesConfig() TracesConfig // GetLoggerType returns the type of the logger to use. Valid types are in // the logger package - GetLoggerType() (string, error) + GetLoggerType() string // GetLoggerLevel returns the level of the logger to use. GetLoggerLevel() Level // GetHoneycombLoggerConfig returns the config specific to the HoneycombLogger - GetHoneycombLoggerConfig() (HoneycombLoggerConfig, error) + GetHoneycombLoggerConfig() HoneycombLoggerConfig // GetStdoutLoggerConfig returns the config specific to the StdoutLogger - GetStdoutLoggerConfig() (StdoutLoggerConfig, error) + GetStdoutLoggerConfig() StdoutLoggerConfig // GetCollectionConfig returns the config specific to the InMemCollector - GetCollectionConfig() (CollectionConfig, error) + GetCollectionConfig() CollectionConfig // GetSamplerConfigForDestName returns the sampler type and name to use for // the given destination (environment, or dataset in classic) - GetSamplerConfigForDestName(string) (interface{}, string, error) + GetSamplerConfigForDestName(string) (interface{}, string) // GetAllSamplerRules returns all rules in a single map, including the default rules - GetAllSamplerRules() (*V2SamplerConfig, error) + GetAllSamplerRules() *V2SamplerConfig + + // GetGeneralConfig returns the config specific to General + GetGeneralConfig() GeneralConfig // GetLegacyMetricsConfig returns the config specific to LegacyMetrics GetLegacyMetricsConfig() LegacyMetricsConfig @@ -144,18 +112,17 @@ type Config interface { // libhoney client GetPeerBufferSize() int - GetIdentifierInterfaceName() (string, error) + GetIdentifierInterfaceName() string - GetUseIPV6Identifier() (bool, error) + GetOTelTracingConfig() OTelTracingConfig - GetRedisIdentifier() (string, error) + GetUseIPV6Identifier() bool - // GetSendTickerValue returns the duration to use to check for traces to send - GetSendTickerValue() time.Duration + GetRedisIdentifier() string // GetDebugServiceAddr sets the IP and port the debug service will run on (you must provide the // command line flag -d to start the debug service) - GetDebugServiceAddr() (string, error) + GetDebugServiceAddr() string GetIsDryRun() bool @@ -191,6 +158,8 @@ type Config interface { GetParentIdFieldNames() []string } +type ConfigReloadCallback func(configHash, ruleCfgHash string) + type ConfigMetadata struct { Type string `json:"type"` ID string `json:"id"` diff --git a/config/configLoadHelpers.go 
b/config/configLoadHelpers.go
index 14f7602ba4..bc662dda77 100644
--- a/config/configLoadHelpers.go
+++ b/config/configLoadHelpers.go
@@ -12,6 +12,8 @@ import (
 "os"
 "path/filepath"
 "reflect"
+ "strconv"
+ "strings"

 "github.com/creasty/defaults"
 "github.com/pelletier/go-toml/v2"
@@ -107,17 +109,84 @@ func load(r io.Reader, format Format, into any) error {
 }
 }

-func validateConfig(opts *CmdEnv) ([]string, error) {
- location := opts.ConfigLocation
- r, format, err := getReaderFor(location)
- if err != nil {
- return nil, err
+// This loads all the named configs into destination in the order they are listed.
+// It returns the MD5 hash of the collected configs as a string (if there's only one
+// config, this is the hash of that config; if there are multiple, it's the hash of
+// all of them concatenated together).
+func loadConfigsInto(dest any, locations []string) (string, error) {
+ // start a hash of the configs we read
+ h := md5.New()
+ for _, location := range locations {
+ // trim leading and trailing whitespace just in case
+ location := strings.TrimSpace(location)
+ r, format, err := getReaderFor(location)
+ if err != nil {
+ return "", err
+ }
+ defer r.Close()
+ // write the data to the hash as we read it
+ rdr := io.TeeReader(r, h)
+
+ // when working on a struct, load only overwrites destination values that are
+ // explicitly named. So we can just keep loading successive files into
+ // the same object without losing data we've already specified.
+ if err := load(rdr, format, dest); err != nil {
+ return "", fmt.Errorf("loadConfigsInto unable to load config %s: %w", location, err)
+ }
 }
- defer r.Close()
+ hash := hex.EncodeToString(h.Sum(nil))
+ return hash, nil
+}

- var userData map[string]any
- if err := load(r, format, &userData); err != nil {
- return nil, fmt.Errorf("validateConfig unable to load config %s: %w", location, err)
+func loadConfigsIntoMap(dest map[string]any, locations []string) error {
+ for _, location := range locations {
+ // trim leading and trailing whitespace just in case
+ location := strings.TrimSpace(location)
+ r, format, err := getReaderFor(location)
+ if err != nil {
+ return err
+ }
+ defer r.Close()
+
+ // when working on a map, when loading a nested object, load will overwrite the entire destination
+ // value, so we can't just keep loading successive files into the same object. Instead, we
+ // need to load into a new object and then merge it into the map.
+ temp := make(map[string]any)
+ if err := load(r, format, &temp); err != nil {
+ return fmt.Errorf("loadConfigsIntoMap unable to load config %s: %w", location, err)
+ }
+ for k, v := range temp {
+ switch vm := v.(type) {
+ case map[string]any:
+ // if the value is a map, we need to merge its value into the existing map value, if any.
+ if dest[k] == nil {
+ // no existing value, just copy it over
+ dest[k] = vm
+ } else {
+ // this works without needing recursion because we know that
+ // configurations can never be more than two levels deep.
+ for kk, vv := range vm {
+ dest[k].(map[string]any)[kk] = vv
+ }
+ }
+ default:
+ // everything else just gets copied over, including slices
+ dest[k] = v
+ }
+ }
+ }
+ return nil
+}
+
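The two-level merge in `loadConfigsIntoMap` is the subtle part of layered configuration: nested sections are merged key by key rather than replaced wholesale. Here is a stripped-down sketch of the same idea; `mergeLayer` is an illustrative name, and like the code above it relies on the invariant that configs never nest more than two levels deep:

```go
package main

import "fmt"

// mergeLayer applies one config layer onto dest, merging second-level maps
// key by key and overwriting everything else - the same shape as the loop
// in loadConfigsIntoMap above.
func mergeLayer(dest, layer map[string]any) {
	for k, v := range layer {
		if vm, ok := v.(map[string]any); ok {
			if existing, ok := dest[k].(map[string]any); ok {
				for kk, vv := range vm {
					existing[kk] = vv
				}
				continue
			}
		}
		dest[k] = v
	}
}

func main() {
	base := map[string]any{"General": map[string]any{"ConfigurationVersion": 2, "ConfigReloadInterval": "1s"}}
	overlay := map[string]any{"General": map[string]any{"ConfigReloadInterval": "2s"}}
	mergeLayer(base, overlay)
	// General keeps ConfigurationVersion:2; the interval becomes 2s.
	fmt.Println(base)
}
```

+// validateConfigs reads the configs from the given locations and validates them.
+// It returns a list of failures; if the list is empty, the config is valid.
+// err is non-nil only for significant errors like a missing file.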
+func validateConfigs(opts *CmdEnv) ([]string, error) { + // first read the configs into a map so we can validate them + userData := make(map[string]any) + err := loadConfigsIntoMap(userData, opts.ConfigLocations) + if err != nil { + return nil, err } metadata, err := LoadConfigMetadata() @@ -130,21 +199,14 @@ func validateConfig(opts *CmdEnv) ([]string, error) { return failures, nil } - // Basic validation worked. Now we need to reload it into the struct so that + // Basic validation worked. Now we need to reload everything into our struct so that // we can apply defaults and options, and then validate a second time. - - // we need a new reader for the source data - r2, _, err := getReaderFor(location) + var config configContents + _, err = loadConfigsInto(&config, opts.ConfigLocations) if err != nil { return nil, err } - defer r2.Close() - var config configContents - if err := load(r2, format, &config); err != nil { - // this should never happen, since we already validated the config - return nil, fmt.Errorf("validateConfig unable to RELOAD config %s: %w", location, err) - } // apply defaults and options if err := defaults.Set(&config); err != nil { return nil, fmt.Errorf("readConfigInto unable to apply defaults: %w", err) @@ -165,8 +227,12 @@ func validateConfig(opts *CmdEnv) ([]string, error) { if config.OTelMetrics.APIKey == "" { config.OTelMetrics.APIKey = "InvalidHoneycombAPIKey" } + if config.OTelTracing.APIKey == "" { + config.OTelTracing.APIKey = "InvalidHoneycombAPIKey" + } - // write it out to a YAML buffer + // The validator needs a map[string]any to work with, so we need to + // write it out to a buffer (we always use YAML) and then reload it. buf := new(bytes.Buffer) encoder := yaml.NewEncoder(buf) encoder.SetIndent(2) @@ -175,7 +241,7 @@ func validateConfig(opts *CmdEnv) ([]string, error) { } var rewrittenUserData map[string]any - if err := load(buf, format, &rewrittenUserData); err != nil { + if err := load(buf, FormatYAML, &rewrittenUserData); err != nil { return nil, fmt.Errorf("validateConfig unable to reload hydrated config from buffer: %w", err) } @@ -184,17 +250,13 @@ func validateConfig(opts *CmdEnv) ([]string, error) { return failures, nil } -func validateRules(location string) ([]string, error) { - r, format, err := getReaderFor(location) +func validateRules(locations []string) ([]string, error) { + // first read the configs into a map so we can validate them + userData := make(map[string]any) + err := loadConfigsIntoMap(userData, locations) if err != nil { return nil, err } - defer r.Close() - - var userData map[string]any - if err := load(r, format, &userData); err != nil { - return nil, fmt.Errorf("validateRules unable to load config %s: %w", location, err) - } metadata, err := LoadRulesMetadata() if err != nil { @@ -206,22 +268,11 @@ func validateRules(location string) ([]string, error) { } // readConfigInto reads the config from the given location and applies it to the given struct. 
-func readConfigInto(dest any, location string, opts *CmdEnv) (string, error) {
- r, format, err := getReaderFor(location)
+func readConfigInto(dest any, locations []string, opts *CmdEnv) (string, error) {
+ hash, err := loadConfigsInto(dest, locations)
 if err != nil {
- return "", err
+ return hash, err
 }
- defer r.Close()
-
- // we're going to use a TeeReader to calculate the hash while also reading the data
- h := md5.New()
- rdr := io.TeeReader(r, h)
-
- if err := load(rdr, format, dest); err != nil {
- return "", fmt.Errorf("readConfigInto unable to load config %s: %w", location, err)
- }
- // the hash is now the MD5 of the config file
- hash := hex.EncodeToString(h.Sum(nil))

 // don't apply options and defaults if we're not given any
 if opts == nil {
@@ -240,3 +291,21 @@
 return hash, nil
 }
+
+// ConfigHashMetrics takes a config hash and returns an integer value for use in metrics.
+// The value is the last 4 characters of the config hash, converted to an integer.
+// If the config hash is too short, or if there is an error converting the hash to an integer,
+// it returns 0.
+func ConfigHashMetrics(hash string) int64 {
+ // get last 4 characters of config hash
+ if len(hash) < 4 {
+ return 0
+ }
+ suffix := hash[len(hash)-4:]
+ CfgDecimal, err := strconv.ParseInt(suffix, 16, 64)
+ if err != nil {
+ return 0
+ }
+
+ return CfgDecimal
+}
diff --git a/config/configLoadHelpers_test.go b/config/configLoadHelpers_test.go
index c029e1ee13..ff9a18dd34 100644
--- a/config/configLoadHelpers_test.go
+++ b/config/configLoadHelpers_test.go
@@ -2,10 +2,15 @@ package config

 import (
 "net/http"
+ "os"
 "reflect"
 "strings"
 "testing"
 "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ "gopkg.in/yaml.v3"
 )

 func Test_formatFromFilename(t *testing.T) {
@@ -127,3 +132,144 @@ func Test_loadMemsize(t *testing.T) {
 })
 }
 }
+
+func Test_ConfigHashMetrics(t *testing.T) {
+ testcases := []struct {
+ name string
+ hash string
+ expected int64
+ }{
+ {name: "valid hash", hash: "7f1237f7db723f4e874a7a8269081a77", expected: 6775},
+ {name: "invalid length", hash: "1a8", expected: 0},
+ }
+
+ for _, tc := range testcases {
+ t.Run(tc.name, func(t *testing.T) {
+ result := ConfigHashMetrics(tc.hash)
+ require.Equal(t, tc.expected, result)
+ })
+ }
+}
+
+// Creates temporary yaml files from the strings passed in and returns a slice of their filenames.
+// Because we use t.TempDir() the files will be cleaned up automatically.
+func createTempConfigs(t *testing.T, cfgs ...string) []string { + tmpDir := t.TempDir() + + var cfgFiles []string + for _, cfg := range cfgs { + + configFile, err := os.CreateTemp(tmpDir, "cfg_*.yaml") + assert.NoError(t, err) + + _, err = configFile.WriteString(cfg) + assert.NoError(t, err) + configFile.Close() + cfgFiles = append(cfgFiles, configFile.Name()) + } + return cfgFiles +} + +func setMap(m map[string]any, key string, value any) { + if strings.Contains(key, ".") { + parts := strings.Split(key, ".") + if _, ok := m[parts[0]]; !ok { + m[parts[0]] = make(map[string]any) + } + setMap(m[parts[0]].(map[string]any), strings.Join(parts[1:], "."), value) + return + } + m[key] = value +} + +func makeYAML(args ...interface{}) string { + m := make(map[string]any) + for i := 0; i < len(args); i += 2 { + setMap(m, args[i].(string), args[i+1]) + } + b, err := yaml.Marshal(m) + if err != nil { + panic(err) + } + return string(b) +} + +func Test_loadConfigsInto(t *testing.T) { + cm1 := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:8080") + cm2 := makeYAML("General.ConfigReloadInterval", Duration(2*time.Second), "General.DatasetPrefix", "hello") + cfgfiles := createTempConfigs(t, cm1, cm2) + + cfg := configContents{} + hash, err := loadConfigsInto(&cfg, cfgfiles) + require.NoError(t, err) + require.Equal(t, "2381a6563085f50ac56663b67ca85299", hash) + require.Equal(t, 2, cfg.General.ConfigurationVersion) + require.Equal(t, Duration(2*time.Second), cfg.General.ConfigReloadInterval) + require.Equal(t, "0.0.0.0:8080", cfg.Network.ListenAddr) + require.Equal(t, "hello", cfg.General.DatasetPrefix) +} + +func Test_loadConfigsIntoMap(t *testing.T) { + cm1 := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:8080") + cm2 := makeYAML("General.ConfigReloadInterval", Duration(2*time.Second), "General.DatasetPrefix", "hello") + cfgfiles := createTempConfigs(t, cm1, cm2) + + cfg := map[string]any{} + err := loadConfigsIntoMap(cfg, cfgfiles) + require.NoError(t, err) + gen := cfg["General"].(map[string]any) + require.Equal(t, 2, gen["ConfigurationVersion"]) + require.Equal(t, "2s", gen["ConfigReloadInterval"]) + require.Equal(t, "hello", gen["DatasetPrefix"]) + net := cfg["Network"].(map[string]any) + require.Equal(t, "0.0.0.0:8080", net["ListenAddr"]) +} + +func Test_validateConfigs(t *testing.T) { + emptySlice := []string{} + tests := []struct { + name string + cfgs []string + want []string + wantErr bool + }{ + { + "test1", []string{ + makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.1.2.3:8080"), + }, + emptySlice, + false, + }, + { + "test2", []string{ + makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.1.2.3:8080"), + makeYAML("General.ConfigReloadInterval", Duration(2*time.Second)), + }, + emptySlice, + false, + }, + { + "test3", []string{ + makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.1.2.3:8080"), + makeYAML("General.ConfigReloadInterval", Duration(2*time.Second), "General.DatasetPrefix", 7), + }, + []string{"field General.DatasetPrefix must be a string but 7 is int"}, + false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfgfiles := createTempConfigs(t, tt.cfgs...) 
+ opts := &CmdEnv{ConfigLocations: cfgfiles} + got, err := validateConfigs(opts) + if (err != nil) != tt.wantErr { + t.Errorf("validateConfigs() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("validateConfigs() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/config/config_test.go b/config/config_test.go index f39d06b23e..55086cbab7 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1,4 +1,4 @@ -package config +package config_test import ( "fmt" @@ -8,22 +8,25 @@ import ( "testing" "time" + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/configwatcher" + "github.com/honeycombio/refinery/pubsub" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "gopkg.in/yaml.v3" ) -func getConfig(args []string) (Config, error) { - opts, err := NewCmdEnvOptions(args) +func getConfig(args []string) (config.Config, error) { + opts, err := config.NewCmdEnvOptions(args) if err != nil { return nil, err } - return NewConfig(opts, func(err error) {}) + return config.NewConfig(opts, func(err error) {}) } // creates two temporary yaml files from the strings passed in and returns their filenames func createTempConfigs(t *testing.T, configBody, rulesBody string) (string, string) { - tmpDir, err := os.MkdirTemp("", "") - assert.NoError(t, err) + tmpDir := t.TempDir() configFile, err := os.CreateTemp(tmpDir, "cfg_*.yaml") assert.NoError(t, err) @@ -69,13 +72,12 @@ func makeYAML(args ...interface{}) string { func TestGRPCListenAddrEnvVar(t *testing.T) { const address = "127.0.0.1:4317" const envVarName = "REFINERY_GRPC_LISTEN_ADDRESS" - os.Setenv(envVarName, address) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, address) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if a, _ := c.GetGRPCListenAddr(); a != address { + if a := c.GetGRPCListenAddr(); a != address { t.Error("received", a, "expected", address) } } @@ -83,13 +85,12 @@ func TestGRPCListenAddrEnvVar(t *testing.T) { func TestRedisHostEnvVar(t *testing.T) { const host = "redis.magic:1337" const envVarName = "REFINERY_REDIS_HOST" - os.Setenv(envVarName, host) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, host) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if d, _ := c.GetRedisHost(); d != host { + if d := c.GetRedisPeerManagement().Host; d != host { t.Error("received", d, "expected", host) } } @@ -97,13 +98,12 @@ func TestRedisHostEnvVar(t *testing.T) { func TestRedisUsernameEnvVar(t *testing.T) { const username = "admin" const envVarName = "REFINERY_REDIS_USERNAME" - os.Setenv(envVarName, username) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, username) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if d, _ := c.GetRedisUsername(); d != username { + if d := c.GetRedisPeerManagement().Username; d != username { t.Error("received", d, "expected", username) } } @@ -111,13 +111,12 @@ func TestRedisUsernameEnvVar(t *testing.T) { func TestRedisPasswordEnvVar(t *testing.T) { const password = "admin1234" const envVarName = "REFINERY_REDIS_PASSWORD" - os.Setenv(envVarName, password) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, password) c, err := getConfig([]string{"--no-validate", 
"--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if d, _ := c.GetRedisPassword(); d != password { + if d := c.GetRedisPeerManagement().Password; d != password { t.Error("received", d, "expected", password) } } @@ -125,13 +124,12 @@ func TestRedisPasswordEnvVar(t *testing.T) { func TestRedisAuthCodeEnvVar(t *testing.T) { const authCode = "A:LKNGSDKLSHOE&SDLFKN" const envVarName = "REFINERY_REDIS_AUTH_CODE" - os.Setenv(envVarName, authCode) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, authCode) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if d, _ := c.GetRedisAuthCode(); d != authCode { + if d := c.GetRedisPeerManagement().AuthCode; d != authCode { t.Error("received", d, "expected", authCode) } } @@ -156,8 +154,7 @@ func TestMetricsAPIKeyEnvVar(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - os.Setenv(tc.envVar, tc.key) - defer os.Unsetenv(tc.envVar) + t.Setenv(tc.envVar, tc.key) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) if err != nil { @@ -177,10 +174,8 @@ func TestMetricsAPIKeyMultipleEnvVar(t *testing.T) { const fallbackKey = "this should not be set in the config" const fallbackEnvVarName = "REFINERY_HONEYCOMB_API_KEY" - os.Setenv(specificEnvVarName, specificKey) - defer os.Unsetenv(specificEnvVarName) - os.Setenv(fallbackEnvVarName, fallbackKey) - defer os.Unsetenv(fallbackEnvVarName) + t.Setenv(specificEnvVarName, specificKey) + t.Setenv(fallbackEnvVarName, fallbackKey) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) @@ -193,8 +188,7 @@ func TestMetricsAPIKeyMultipleEnvVar(t *testing.T) { func TestMetricsAPIKeyFallbackEnvVar(t *testing.T) { const key = "abc1234" const envVarName = "REFINERY_HONEYCOMB_API_KEY" - os.Setenv(envVarName, key) - defer os.Unsetenv(envVarName) + t.Setenv(envVarName, key) c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) @@ -205,15 +199,25 @@ func TestMetricsAPIKeyFallbackEnvVar(t *testing.T) { } func TestReload(t *testing.T) { - cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:8080") + cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", config.Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:8080") rm := makeYAML("ConfigVersion", 2) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) - if d, _ := c.GetListenAddr(); d != "0.0.0.0:8080" { + pubsub := &pubsub.LocalPubSub{ + Config: c, + } + pubsub.Start() + defer pubsub.Stop() + watcher := &configwatcher.ConfigWatcher{ + Config: c, + PubSub: pubsub, + } + watcher.Start() + defer watcher.Stop() + + if d := c.GetListenAddr(); d != "0.0.0.0:8080" { t.Error("received", d, "expected", "0.0.0.0:8080") } @@ -221,7 +225,7 @@ func TestReload(t *testing.T) { ch := make(chan interface{}, 1) - c.RegisterReloadCallback(func() { + c.RegisterReloadCallback(func(cfgHash, ruleHash string) { 
close(ch) }) @@ -248,46 +252,45 @@ func TestReload(t *testing.T) { case <-ch: case <-time.After(5 * time.Second): t.Error("No callback") + close(ch) } }() - if file, err := os.OpenFile(config, os.O_RDWR, 0644); err == nil { - cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:9000") + if file, err := os.OpenFile(cfg, os.O_RDWR, 0644); err == nil { + cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", config.Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:9000") file.WriteString(cm) file.Close() } wg.Wait() - if d, _ := c.GetListenAddr(); d != "0.0.0.0:9000" { + if d := c.GetListenAddr(); d != "0.0.0.0:9000" { t.Error("received", d, "expected", "0.0.0.0:9000") } } func TestReloadDisabled(t *testing.T) { - cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(0*time.Second), "Network.ListenAddr", "0.0.0.0:8080") + cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", config.Duration(0*time.Second), "Network.ListenAddr", "0.0.0.0:8080") rm := makeYAML("ConfigVersion", 2) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) - if d, _ := c.GetListenAddr(); d != "0.0.0.0:8080" { + if d := c.GetListenAddr(); d != "0.0.0.0:8080" { t.Error("received", d, "expected", "0.0.0.0:8080") } - if file, err := os.OpenFile(config, os.O_RDWR, 0644); err == nil { + if file, err := os.OpenFile(cfg, os.O_RDWR, 0644); err == nil { // Since we disabled reload checking this should not change anything - cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", Duration(0*time.Second), "Network.ListenAddr", "0.0.0.0:9000") + cm := makeYAML("General.ConfigurationVersion", 2, "General.ConfigReloadInterval", config.Duration(0*time.Second), "Network.ListenAddr", "0.0.0.0:9000") file.WriteString(cm) file.Close() } time.Sleep(5 * time.Second) - if d, _ := c.GetListenAddr(); d != "0.0.0.0:8080" { + if d := c.GetListenAddr(); d != "0.0.0.0:8080" { t.Error("received", d, "expected", "0.0.0.0:8080") } } @@ -296,23 +299,23 @@ func TestReadDefaults(t *testing.T) { c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules.yaml"}) assert.NoError(t, err) - if d, _ := c.GetSendDelay(); d != 2*time.Second { + if d := c.GetTracesConfig().GetSendDelay(); d != 2*time.Second { t.Error("received", d, "expected", 2*time.Second) } - if d, _ := c.GetTraceTimeout(); d != 60*time.Second { + if d := c.GetTracesConfig().GetTraceTimeout(); d != 60*time.Second { t.Error("received", d, "expected", 60*time.Second) } - if d := c.GetSendTickerValue(); d != 100*time.Millisecond { + if d := c.GetTracesConfig().GetSendTickerValue(); d != 100*time.Millisecond { t.Error("received", d, "expected", 100*time.Millisecond) } - if d, _ := c.GetPeerManagementType(); d != "file" { + if d := c.GetPeerManagementType(); d != "file" { t.Error("received", d, "expected", "file") } - if d, _ := c.GetUseIPV6Identifier(); d != false { + if d := c.GetUseIPV6Identifier(); d != false { t.Error("received", d, "expected", false) } @@ -328,9 +331,8 @@ func TestReadDefaults(t *testing.T) { t.Error("received", d, "expected", 
time.Hour) } - d, name, err := c.GetSamplerConfigForDestName("dataset-doesnt-exist") - assert.NoError(t, err) - assert.IsType(t, &DeterministicSamplerConfig{}, d) + d, name := c.GetSamplerConfigForDestName("dataset-doesnt-exist") + assert.IsType(t, &config.DeterministicSamplerConfig{}, d) assert.Equal(t, "DeterministicSampler", name) } @@ -338,39 +340,36 @@ func TestReadRulesConfig(t *testing.T) { c, err := getConfig([]string{"--no-validate", "--config", "../config.yaml", "--rules_config", "../rules_complete.yaml"}) assert.NoError(t, err) - d, name, err := c.GetSamplerConfigForDestName("doesnt-exist") - assert.NoError(t, err) - assert.IsType(t, &DeterministicSamplerConfig{}, d) + d, name := c.GetSamplerConfigForDestName("doesnt-exist") + assert.IsType(t, &config.DeterministicSamplerConfig{}, d) assert.Equal(t, "DeterministicSampler", name) - d, name, err = c.GetSamplerConfigForDestName("env1") - assert.NoError(t, err) - assert.IsType(t, &DynamicSamplerConfig{}, d) + d, name = c.GetSamplerConfigForDestName("env1") + assert.IsType(t, &config.DynamicSamplerConfig{}, d) assert.Equal(t, "DynamicSampler", name) - d, name, err = c.GetSamplerConfigForDestName("env4") - assert.NoError(t, err) + d, name = c.GetSamplerConfigForDestName("env4") switch r := d.(type) { - case *RulesBasedSamplerConfig: - assert.Len(t, r.Rules, 6) + case *config.RulesBasedSamplerConfig: + assert.Len(t, r.Rules, 7) - var rule *RulesBasedSamplerRule + var rule *config.RulesBasedSamplerRule rule = r.Rules[0] assert.True(t, rule.Drop) assert.Equal(t, 0, rule.SampleRate) assert.Len(t, rule.Conditions, 1) - rule = r.Rules[1] + rule = r.Rules[2] assert.Equal(t, 1, rule.SampleRate) - assert.Equal(t, "keep slow 500 errors", rule.Name) + assert.Equal(t, "keep slow 500 errors across semantic conventions", rule.Name) assert.Len(t, rule.Conditions, 2) - rule = r.Rules[3] + rule = r.Rules[4] assert.Equal(t, 5, rule.SampleRate) assert.Equal(t, "span", rule.Scope) - rule = r.Rules[5] + rule = r.Rules[6] assert.Equal(t, 10, rule.SampleRate) assert.Equal(t, "", rule.Scope) @@ -391,20 +390,18 @@ func TestPeerManagementType(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - if d, _ := c.GetPeerManagementType(); d != "redis" { + if d := c.GetPeerManagementType(); d != "redis" { t.Error("received", d, "expected", "redis") } - if s := c.GetRedisPrefix(); s != "testPrefix" { + if s := c.GetRedisPeerManagement().Prefix; s != "testPrefix" { t.Error("received", s, "expected", "testPrefix") } - if db := c.GetRedisDatabase(); db != 9 { + if db := c.GetRedisPeerManagement().Database; db != 9 { t.Error("received", db, "expected", 9) } } @@ -413,12 +410,10 @@ func TestDebugServiceAddr(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2, "Debugging.DebugServiceAddr", "localhost:8085") rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - if d, _ := c.GetDebugServiceAddr(); d != "localhost:8085" { + if d := c.GetDebugServiceAddr(); d != "localhost:8085" { t.Error("received", d, "expected", "localhost:8085") } } @@ -427,8 +422,6 @@ func TestHTTPIdleTimeout(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2, 
"Network.HTTPIdleTimeout", "60s") rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -441,8 +434,6 @@ func TestDryRun(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2, "Debugging.DryRun", true) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -451,18 +442,34 @@ func TestDryRun(t *testing.T) { } } -func TestMaxAlloc(t *testing.T) { - cm := makeYAML("General.ConfigurationVersion", 2, "Collection.CacheCapacity", 1000, "Collection.MaxAlloc", 17179869184) +func TestRedisClusterHosts(t *testing.T) { + clusterHosts := []string{"localhost:7001", "localhost:7002"} + cm := makeYAML( + "General.ConfigurationVersion", 2, + "PeerManagement.Type", "redis", + "RedisPeerManagement.ClusterHosts", clusterHosts, + "RedisPeerManagement.Prefix", "test", + "RedisPeerManagement.Database", 9, + ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - expected := MemorySize(16 * 1024 * 1024 * 1024) - inMemConfig, err := c.GetCollectionConfig() + d := c.GetRedisPeerManagement().ClusterHosts + require.NotNil(t, d) + require.EqualValues(t, clusterHosts, d) +} + +func TestMaxAlloc(t *testing.T) { + cm := makeYAML("General.ConfigurationVersion", 2, "Collection.CacheCapacity", 1000, "Collection.MaxAlloc", 17179869184) + rm := makeYAML("ConfigVersion", 2) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) + + expected := config.MemorySize(16 * 1024 * 1024 * 1024) + inMemConfig := c.GetCollectionConfig() assert.Equal(t, expected, inMemConfig.MaxAlloc) } @@ -502,13 +509,10 @@ func TestPeerAndIncomingQueueSize(t *testing.T) { for _, tc := range testcases { rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, tc.configYAML, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - inMemConfig, err := c.GetCollectionConfig() - assert.NoError(t, err) + inMemConfig := c.GetCollectionConfig() assert.Equal(t, tc.expectedForPeer, inMemConfig.GetPeerQueueSize()) assert.Equal(t, tc.expectedForIncoming, inMemConfig.GetIncomingQueueSize()) } @@ -517,14 +521,12 @@ func TestPeerAndIncomingQueueSize(t *testing.T) { func TestAvailableMemoryCmdLine(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2, "Collection.CacheCapacity", 1000, "Collection.AvailableMemory", 2_000_000_000) rm := makeYAML("ConfigVersion", 2) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules, "--available-memory", "2.5Gib"}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules, "--available-memory", "2.5Gib"}) assert.NoError(t, err) - expected := MemorySize(2*1024*1024*1024 + 512*1024*1024) - 
inMemConfig, err := c.GetCollectionConfig() + expected := config.MemorySize(2*1024*1024*1024 + 512*1024*1024) + inMemConfig := c.GetCollectionConfig() assert.NoError(t, err) assert.Equal(t, expected, inMemConfig.AvailableMemory) } @@ -550,34 +552,32 @@ func TestGetSamplerTypes(t *testing.T) { "Samplers.dataset4.TotalThroughputSampler.GoalThroughputPerSec", 100, "Samplers.dataset4.TotalThroughputSampler.FieldList", []string{"request.method"}, ) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) - if d, name, err := c.GetSamplerConfigForDestName("dataset-doesnt-exist"); assert.Equal(t, nil, err) { - assert.IsType(t, &DeterministicSamplerConfig{}, d) + if d, name := c.GetSamplerConfigForDestName("dataset-doesnt-exist"); assert.Equal(t, nil, err) { + assert.IsType(t, &config.DeterministicSamplerConfig{}, d) assert.Equal(t, "DeterministicSampler", name) } - if d, name, err := c.GetSamplerConfigForDestName("dataset 1"); assert.Equal(t, nil, err) { - assert.IsType(t, &DynamicSamplerConfig{}, d) + if d, name := c.GetSamplerConfigForDestName("dataset 1"); assert.Equal(t, nil, err) { + assert.IsType(t, &config.DynamicSamplerConfig{}, d) assert.Equal(t, "DynamicSampler", name) } - if d, name, err := c.GetSamplerConfigForDestName("dataset2"); assert.Equal(t, nil, err) { - assert.IsType(t, &DeterministicSamplerConfig{}, d) + if d, name := c.GetSamplerConfigForDestName("dataset2"); assert.Equal(t, nil, err) { + assert.IsType(t, &config.DeterministicSamplerConfig{}, d) assert.Equal(t, "DeterministicSampler", name) } - if d, name, err := c.GetSamplerConfigForDestName("dataset3"); assert.Equal(t, nil, err) { - assert.IsType(t, &EMADynamicSamplerConfig{}, d) + if d, name := c.GetSamplerConfigForDestName("dataset3"); assert.Equal(t, nil, err) { + assert.IsType(t, &config.EMADynamicSamplerConfig{}, d) assert.Equal(t, "EMADynamicSampler", name) } - if d, name, err := c.GetSamplerConfigForDestName("dataset4"); assert.Equal(t, nil, err) { - assert.IsType(t, &TotalThroughputSamplerConfig{}, d) + if d, name := c.GetSamplerConfigForDestName("dataset4"); assert.Equal(t, nil, err) { + assert.IsType(t, &config.TotalThroughputSamplerConfig{}, d) assert.Equal(t, "TotalThroughputSampler", name) } } @@ -586,19 +586,15 @@ func TestDefaultSampler(t *testing.T) { t.Skip("This tests for a default sampler, but we are currently not requiring explicit default samplers.") cm := makeYAML("General.ConfigurationVersion", 2) rm := makeYAML("ConfigVersion", 2) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) - s, name, err := c.GetSamplerConfigForDestName("nonexistent") + s, name := c.GetSamplerConfigForDestName("nonexistent") - assert.NoError(t, err) assert.Equal(t, "DeterministicSampler", name) - - assert.IsType(t, &DeterministicSamplerConfig{}, s) + assert.IsType(t, &config.DeterministicSamplerConfig{}, s) } func TestHoneycombLoggerConfig(t *testing.T) { @@ -613,19 +609,19 @@ func TestHoneycombLoggerConfig(t *testing.T) { ) rm := 
makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) + // Set the environment variable to test that it overrides the config + oldenv := os.Getenv("REFINERY_HONEYCOMB_API_KEY") + os.Setenv("REFINERY_HONEYCOMB_API_KEY", "321cba") + defer os.Setenv("REFINERY_HONEYCOMB_API_KEY", oldenv) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - loggerConfig, err := c.GetHoneycombLoggerConfig() - - assert.NoError(t, err) + loggerConfig := c.GetHoneycombLoggerConfig() assert.Equal(t, "http://honeycomb.io", loggerConfig.APIHost) - assert.Equal(t, "1234", loggerConfig.APIKey) + assert.Equal(t, "321cba", loggerConfig.APIKey) assert.Equal(t, "loggerDataset", loggerConfig.Dataset) - assert.Equal(t, true, loggerConfig.SamplerEnabled) + assert.Equal(t, true, loggerConfig.GetSamplerEnabled()) assert.Equal(t, 5, loggerConfig.SamplerThroughput) } @@ -639,16 +635,12 @@ func TestHoneycombLoggerConfigDefaults(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - loggerConfig, err := c.GetHoneycombLoggerConfig() + loggerConfig := c.GetHoneycombLoggerConfig() - assert.NoError(t, err) - - assert.Equal(t, true, loggerConfig.SamplerEnabled) + assert.Equal(t, true, loggerConfig.GetSamplerEnabled()) assert.Equal(t, 10, loggerConfig.SamplerThroughput) } @@ -659,28 +651,25 @@ func TestHoneycombGRPCConfigDefaults(t *testing.T) { "GRPCServerParameters.ListenAddr", "localhost:4343", ) rm := makeYAML("ConfigVersion", 2) - config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) - c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + cfg, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) assert.Equal(t, true, c.GetGRPCEnabled()) - a, err := c.GetGRPCListenAddr() - assert.NoError(t, err) + a := c.GetGRPCListenAddr() assert.Equal(t, "localhost:4343", a) grpcConfig := c.GetGRPCConfig() - assert.Equal(t, true, grpcConfig.Enabled) + assert.Equal(t, config.DefaultTrue(true), *grpcConfig.Enabled) assert.Equal(t, "localhost:4343", grpcConfig.ListenAddr) assert.Equal(t, 1*time.Minute, time.Duration(grpcConfig.MaxConnectionIdle)) assert.Equal(t, 3*time.Minute, time.Duration(grpcConfig.MaxConnectionAge)) assert.Equal(t, 1*time.Minute, time.Duration(grpcConfig.MaxConnectionAgeGrace)) assert.Equal(t, 1*time.Minute, time.Duration(grpcConfig.KeepAlive)) assert.Equal(t, 20*time.Second, time.Duration(grpcConfig.KeepAliveTimeout)) - assert.Equal(t, MemorySize(5*1_000_000), grpcConfig.MaxSendMsgSize) - assert.Equal(t, MemorySize(5*1_000_000), grpcConfig.MaxRecvMsgSize) + assert.Equal(t, config.MemorySize(15*1_000_000), grpcConfig.MaxSendMsgSize) + assert.Equal(t, config.MemorySize(15*1_000_000), grpcConfig.MaxRecvMsgSize) } func TestStdoutLoggerConfig(t *testing.T) { @@ -694,14 +683,10 @@ func TestStdoutLoggerConfig(t *testing.T) { rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) fmt.Println(config) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - loggerConfig, err := 
c.GetStdoutLoggerConfig() - - assert.NoError(t, err) + loggerConfig := c.GetStdoutLoggerConfig() assert.True(t, loggerConfig.Structured) assert.True(t, loggerConfig.SamplerEnabled) @@ -714,14 +699,10 @@ func TestStdoutLoggerConfigDefaults(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) - loggerConfig, err := c.GetStdoutLoggerConfig() - - assert.NoError(t, err) + loggerConfig := c.GetStdoutLoggerConfig() assert.False(t, loggerConfig.Structured) assert.False(t, loggerConfig.SamplerEnabled) @@ -734,8 +715,6 @@ func TestDatasetPrefix(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -749,8 +728,6 @@ func TestQueryAuthToken(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -770,8 +747,6 @@ func TestGRPCServerParameters(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -783,8 +758,7 @@ func TestGRPCServerParameters(t *testing.T) { assert.Equal(t, 4*time.Minute, time.Duration(gc.KeepAlive)) assert.Equal(t, 5*time.Minute, time.Duration(gc.KeepAliveTimeout)) assert.Equal(t, true, c.GetGRPCEnabled()) - addr, err := c.GetGRPCListenAddr() - assert.NoError(t, err) + addr := c.GetGRPCListenAddr() assert.Equal(t, "localhost:4317", addr) } @@ -795,8 +769,6 @@ func TestHoneycombAdditionalErrorConfig(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -807,8 +779,6 @@ func TestHoneycombAdditionalErrorDefaults(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -819,8 +789,6 @@ func TestSampleCacheParameters(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -839,8 +807,6 @@ func TestSampleCacheParametersCuckoo(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -861,8 +827,6 @@ func TestAdditionalAttributes(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - 
defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -877,8 +841,6 @@ func TestHoneycombIdFieldsConfig(t *testing.T) { ) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -890,8 +852,6 @@ func TestHoneycombIdFieldsConfigDefault(t *testing.T) { cm := makeYAML("General.ConfigurationVersion", 2) rm := makeYAML("ConfigVersion", 2) config, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(config) c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) assert.NoError(t, err) @@ -899,11 +859,34 @@ func TestHoneycombIdFieldsConfigDefault(t *testing.T) { assert.Equal(t, []string{"trace.parent_id", "parentId"}, c.GetParentIdFieldNames()) } +func TestOverrideConfigDefaults(t *testing.T) { + /// Check that fields that default to true can be set to false + cm := makeYAML( + "General.ConfigurationVersion", 2, + "RefineryTelemetry.AddSpanCountToRoot", false, + "RefineryTelemetry.AddHostMetadataToTrace", false, + "HoneycombLogger.SamplerEnabled", false, + "Specialized.CompressPeerCommunication", false, + "GRPCServerParameters.Enabled", false, + ) + rm := makeYAML("ConfigVersion", 2) + config, rules := createTempConfigs(t, cm, rm) + c, err := getConfig([]string{"--no-validate", "--config", config, "--rules_config", rules}) + assert.NoError(t, err) + + assert.Equal(t, false, c.GetAddSpanCountToRoot()) + assert.Equal(t, false, c.GetAddHostMetadataToTrace()) + loggerConfig := c.GetHoneycombLoggerConfig() + assert.Equal(t, false, loggerConfig.GetSamplerEnabled()) + assert.Equal(t, false, c.GetCompressPeerCommunication()) + assert.Equal(t, false, c.GetGRPCEnabled()) +} + func TestMemorySizeUnmarshal(t *testing.T) { tests := []struct { name string input string - expected MemorySize + expected config.MemorySize }{ { name: "single letter", @@ -1013,7 +996,7 @@ func TestMemorySizeUnmarshal(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - var m MemorySize + var m config.MemorySize err := m.UnmarshalText([]byte(tt.input)) assert.NoError(t, err) assert.Equal(t, tt.expected, m) @@ -1037,9 +1020,9 @@ func TestMemorySizeUnmarshalInvalid(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - var m MemorySize + var m config.MemorySize err := m.UnmarshalText([]byte(tt.input)) - assert.Contains(t, err.Error(), fmt.Sprintf(invalidSizeError, tt.input)) + assert.Contains(t, err.Error(), fmt.Sprintf(config.InvalidSizeError, tt.input)) }) } } @@ -1047,7 +1030,7 @@ func TestMemorySizeUnmarshalInvalid(t *testing.T) { func TestMemorySizeMarshal(t *testing.T) { tests := []struct { name string - input MemorySize + input config.MemorySize expected string }{ { @@ -1057,57 +1040,57 @@ func TestMemorySizeMarshal(t *testing.T) { }, { name: "ei", - input: MemorySize(3 * Ei), + input: config.MemorySize(3 * config.Ei), expected: "3Ei", }, { name: "e", - input: MemorySize(3 * E), + input: config.MemorySize(3 * config.E), expected: "3E", }, { name: "pi", - input: MemorySize(3 * Pi), + input: config.MemorySize(3 * config.Pi), expected: "3Pi", }, { name: "p", - input: MemorySize(3 * P), + input: config.MemorySize(3 * config.P), expected: "3P", }, { name: "gi", - input: MemorySize(3 * Gi), + input: 
config.MemorySize(3 * config.Gi), expected: "3Gi", }, { name: "g", - input: MemorySize(3 * G), + input: config.MemorySize(3 * config.G), expected: "3G", }, { name: "mi", - input: MemorySize(3 * Mi), + input: config.MemorySize(3 * config.Mi), expected: "3Mi", }, { name: "m", - input: MemorySize(3 * M), + input: config.MemorySize(3 * config.M), expected: "3M", }, { name: "ki", - input: MemorySize(3 * Ki), + input: config.MemorySize(3 * config.Ki), expected: "3Ki", }, { name: "k", - input: MemorySize(3 * K), + input: config.MemorySize(3 * config.K), expected: "3K", }, { name: "b", - input: MemorySize(3), + input: config.MemorySize(3), expected: "3", }, } diff --git a/config/config_test_reload_error_test.go b/config/config_test_reload_error_test.go index 0f923d0e3c..bfd7023476 100644 --- a/config/config_test_reload_error_test.go +++ b/config/config_test_reload_error_test.go @@ -1,6 +1,6 @@ //go:build all || !race -package config +package config_test import ( "os" @@ -8,13 +8,16 @@ import ( "testing" "time" + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/configwatcher" + "github.com/honeycombio/refinery/pubsub" "github.com/stretchr/testify/assert" ) func TestErrorReloading(t *testing.T) { cm := makeYAML( "General.ConfigurationVersion", 2, - "General.ConfigReloadInterval", Duration(1*time.Second), + "General.ConfigReloadInterval", config.Duration(1*time.Second), "Network.ListenAddr", "0.0.0.0:8080", "HoneycombLogger.APIKey", "SetThisToAHoneycombKey", ) @@ -22,19 +25,31 @@ func TestErrorReloading(t *testing.T) { "RulesVersion", 2, "Samplers.__default__.DeterministicSampler.SampleRate", 5, ) - config, rules := createTempConfigs(t, cm, rm) + cfg, rules := createTempConfigs(t, cm, rm) defer os.Remove(rules) - defer os.Remove(config) + defer os.Remove(cfg) - opts, err := NewCmdEnvOptions([]string{"--config", config, "--rules_config", rules}) + opts, err := config.NewCmdEnvOptions([]string{"--config", cfg, "--rules_config", rules}) assert.NoError(t, err) ch := make(chan interface{}, 1) - c, err := NewConfig(opts, func(err error) { ch <- 1 }) + c, err := config.NewConfig(opts, func(err error) { ch <- 1 }) assert.NoError(t, err) - d, name, _ := c.GetSamplerConfigForDestName("dataset5") - if _, ok := d.(DeterministicSamplerConfig); ok { + pubsub := &pubsub.LocalPubSub{ + Config: c, + } + pubsub.Start() + defer pubsub.Stop() + watcher := &configwatcher.ConfigWatcher{ + Config: c, + PubSub: pubsub, + } + watcher.Start() + defer watcher.Stop() + + d, name := c.GetSamplerConfigForDestName("dataset5") + if _, ok := d.(config.DeterministicSamplerConfig); ok { t.Error("type received", d, "expected", "DeterministicSampler") } if name != "DeterministicSampler" { @@ -66,8 +81,8 @@ func TestErrorReloading(t *testing.T) { wg.Wait() // config should error and not update sampler to invalid type - d, _, _ = c.GetSamplerConfigForDestName("dataset5") - if _, ok := d.(DeterministicSamplerConfig); ok { + d, _ = c.GetSamplerConfigForDestName("dataset5") + if _, ok := d.(config.DeterministicSamplerConfig); ok { t.Error("received", d, "expected", "DeterministicSampler") } } diff --git a/config/file_config.go b/config/file_config.go index a771ae05a5..5689b08bc9 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -1,15 +1,15 @@ package config import ( - "errors" "fmt" - "math/rand" "net" "os" + "strconv" "strings" "sync" "time" + "golang.org/x/exp/slices" "gopkg.in/yaml.v3" ) @@ -36,14 +36,15 @@ type fileConfig struct { rulesConfig 
*V2SamplerConfig rulesHash string opts *CmdEnv - callbacks []func() + callbacks []ConfigReloadCallback errorCallback func(error) - done chan struct{} - ticker *time.Ticker mux sync.RWMutex lastLoadTime time.Time } +// ensure that fileConfig implements Config +var _ Config = (*fileConfig)(nil) + type configContents struct { General GeneralConfig `yaml:"General"` Network NetworkConfig `yaml:"Network"` @@ -57,6 +58,7 @@ type configContents struct { PrometheusMetrics PrometheusMetricsConfig `yaml:"PrometheusMetrics"` LegacyMetrics LegacyMetricsConfig `yaml:"LegacyMetrics"` OTelMetrics OTelMetricsConfig `yaml:"OTelMetrics"` + OTelTracing OTelTracingConfig `yaml:"OTelTracing"` PeerManagement PeerManagementConfig `yaml:"PeerManagement"` RedisPeerManagement RedisPeerManagementConfig `yaml:"RedisPeerManagement"` Collection CollectionConfig `yaml:"Collection"` @@ -84,15 +86,102 @@ type NetworkConfig struct { type AccessKeyConfig struct { ReceiveKeys []string `yaml:"ReceiveKeys" default:"[]"` + SendKey string `yaml:"SendKey"` + SendKeyMode string `yaml:"SendKeyMode" default:"none"` AcceptOnlyListedKeys bool `yaml:"AcceptOnlyListedKeys"` - keymap map[string]struct{} +} + +// truncate the key to 8 characters for logging +func (a *AccessKeyConfig) sanitize(key string) string { + return fmt.Sprintf("%.8s...", key) +} + +// CheckAndMaybeReplaceKey checks the given API key against the configuration +// and possibly replaces it with the configured SendKey, if the settings so indicate. +// It returns the key to use, or an error if the key is invalid given the settings. +func (a *AccessKeyConfig) CheckAndMaybeReplaceKey(apiKey string) (string, error) { + // Apply AcceptOnlyListedKeys logic BEFORE we consider replacement + if a.AcceptOnlyListedKeys && !slices.Contains(a.ReceiveKeys, apiKey) { + err := fmt.Errorf("api key %s not found in list of authorized keys", a.sanitize(apiKey)) + return "", err + } + + if a.SendKey != "" { + overwriteWith := "" + switch a.SendKeyMode { + case "none": + // don't replace keys at all + // (SendKey is disabled) + case "all": + // overwrite all keys, even missing ones, with the configured one + overwriteWith = a.SendKey + case "nonblank": + // only replace nonblank keys with the configured one + if apiKey != "" { + overwriteWith = a.SendKey + } + case "listedonly": + // only replace keys that are listed in the `ReceiveKeys` list, + // otherwise use original key + overwriteWith = apiKey + if slices.Contains(a.ReceiveKeys, apiKey) { + overwriteWith = a.SendKey + } + case "missingonly": + // only inject keys into telemetry that doesn't have a key at all + // otherwise use original key + overwriteWith = apiKey + if apiKey == "" { + overwriteWith = a.SendKey + } + case "unlisted": + // only replace nonblank keys that are NOT listed in the `ReceiveKeys` list + // otherwise use original key + if apiKey != "" { + overwriteWith = apiKey + if !slices.Contains(a.ReceiveKeys, apiKey) { + overwriteWith = a.SendKey + } + } + } + apiKey = overwriteWith + } + + if apiKey == "" { + return "", fmt.Errorf("blank API key is not permitted with this configuration") + } + return apiKey, nil +} + +type DefaultTrue bool + +func (dt *DefaultTrue) Get() (enabled bool) { + if dt == nil { + return true + } + return bool(*dt) +} + +func (dt *DefaultTrue) MarshalText() ([]byte, error) { + return []byte(strconv.FormatBool(bool(*dt))), nil +} + +func (dt *DefaultTrue) UnmarshalText(text []byte) error { + trueBool, err := strconv.ParseBool(string(text)) + if err != nil { + return err + } + + *dt = 
DefaultTrue(trueBool) + + return nil } type RefineryTelemetryConfig struct { - AddRuleReasonToTrace bool `yaml:"AddRuleReasonToTrace"` - AddSpanCountToRoot bool `yaml:"AddSpanCountToRoot" default:"true"` - AddCountsToRoot bool `yaml:"AddCountsToRoot"` - AddHostMetadataToTrace bool `yaml:"AddHostMetadataToTrace" default:"true"` + AddRuleReasonToTrace bool `yaml:"AddRuleReasonToTrace"` + AddSpanCountToRoot *DefaultTrue `yaml:"AddSpanCountToRoot" default:"true"` // Avoid pointer woe on access, use GetAddSpanCountToRoot() instead. + AddCountsToRoot bool `yaml:"AddCountsToRoot"` + AddHostMetadataToTrace *DefaultTrue `yaml:"AddHostMetadataToTrace" default:"true"` // Avoid pointer woe on access, use GetAddHostMetadataToTrace() instead. } type TracesConfig struct { @@ -101,6 +190,27 @@ type TracesConfig struct { TraceTimeout Duration `yaml:"TraceTimeout" default:"60s"` MaxBatchSize uint `yaml:"MaxBatchSize" default:"500"` SendTicker Duration `yaml:"SendTicker" default:"100ms"` + SpanLimit uint `yaml:"SpanLimit"` +} + +func (t TracesConfig) GetSendDelay() time.Duration { + return time.Duration(t.SendDelay) +} + +func (t TracesConfig) GetBatchTimeout() time.Duration { + return time.Duration(t.BatchTimeout) +} + +func (t TracesConfig) GetTraceTimeout() time.Duration { + return time.Duration(t.TraceTimeout) +} + +func (t TracesConfig) GetMaxBatchSize() uint { + return t.MaxBatchSize +} + +func (t TracesConfig) GetSendTickerValue() time.Duration { + return time.Duration(t.SendTicker) } type DebuggingConfig struct { @@ -116,11 +226,17 @@ type LoggerConfig struct { } type HoneycombLoggerConfig struct { - APIHost string `yaml:"APIHost" default:"https://api.honeycomb.io"` - APIKey string `yaml:"APIKey" cmdenv:"HoneycombLoggerAPIKey,HoneycombAPIKey"` - Dataset string `yaml:"Dataset" default:"Refinery Logs"` - SamplerEnabled bool `yaml:"SamplerEnabled" default:"true"` - SamplerThroughput int `yaml:"SamplerThroughput" default:"10"` + APIHost string `yaml:"APIHost" default:"https://api.honeycomb.io"` + APIKey string `yaml:"APIKey" cmdenv:"HoneycombLoggerAPIKey,HoneycombAPIKey"` + Dataset string `yaml:"Dataset" default:"Refinery Logs"` + SamplerEnabled *DefaultTrue `yaml:"SamplerEnabled" default:"true"` // Avoid pointer woe on access, use GetSamplerEnabled() instead. + SamplerThroughput int `yaml:"SamplerThroughput" default:"10"` +} + +// GetSamplerEnabled returns whether configuration has enabled sampling of +// Refinery's own logs destined for Honeycomb. 
+func (c *HoneycombLoggerConfig) GetSamplerEnabled() (enabled bool) { + return c.SamplerEnabled.Get() } type StdoutLoggerConfig struct { @@ -151,6 +267,14 @@ type OTelMetricsConfig struct { ReportingInterval Duration `yaml:"ReportingInterval" default:"30s"` } +type OTelTracingConfig struct { + Enabled bool `yaml:"Enabled" default:"false"` + APIHost string `yaml:"APIHost" default:"https://api.honeycomb.io"` + APIKey string `yaml:"APIKey" cmdenv:"OTelTracesAPIKey,HoneycombAPIKey"` + Dataset string `yaml:"Dataset" default:"Refinery Traces"` + SampleRate uint64 `yaml:"SampleRate" default:"100"` +} + type PeerManagementConfig struct { Type string `yaml:"Type" default:"file"` Identifier string `yaml:"Identifier"` @@ -161,6 +285,7 @@ type PeerManagementConfig struct { type RedisPeerManagementConfig struct { Host string `yaml:"Host" cmdenv:"RedisHost"` + ClusterHosts []string `yaml:"ClusterHosts" cmdenv:"RedisClusterHosts"` Username string `yaml:"Username" cmdenv:"RedisUsername"` Password string `yaml:"Password" cmdenv:"RedisPassword"` AuthCode string `yaml:"AuthCode" cmdenv:"RedisAuthCode"` @@ -173,12 +298,14 @@ type RedisPeerManagementConfig struct { type CollectionConfig struct { // CacheCapacity must be less than math.MaxInt32 - CacheCapacity int `yaml:"CacheCapacity" default:"10_000"` - PeerQueueSize int `yaml:"PeerQueueSize"` - IncomingQueueSize int `yaml:"IncomingQueueSize"` - AvailableMemory MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"` - MaxMemoryPercentage int `yaml:"MaxMemoryPercentage" default:"75"` - MaxAlloc MemorySize `yaml:"MaxAlloc"` + CacheCapacity int `yaml:"CacheCapacity" default:"10_000"` + PeerQueueSize int `yaml:"PeerQueueSize"` + IncomingQueueSize int `yaml:"IncomingQueueSize"` + AvailableMemory MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"` + MaxMemoryPercentage int `yaml:"MaxMemoryPercentage" default:"75"` + MaxAlloc MemorySize `yaml:"MaxAlloc"` + DisableRedistribution bool `yaml:"DisableRedistribution"` + ShutdownDelay Duration `yaml:"ShutdownDelay" default:"15s"` } // GetMaxAlloc returns the maximum amount of memory to use for the cache. @@ -217,7 +344,7 @@ type BufferSizeConfig struct { type SpecializedConfig struct { EnvironmentCacheTTL Duration `yaml:"EnvironmentCacheTTL" default:"1h"` - CompressPeerCommunication bool `yaml:"CompressPeerCommunication" default:"true"` + CompressPeerCommunication *DefaultTrue `yaml:"CompressPeerCommunication" default:"true"` // Avoid pointer woe on access, use GetCompressPeerCommunication() instead. AdditionalAttributes map[string]string `yaml:"AdditionalAttributes" default:"{}"` } @@ -230,15 +357,15 @@ type IDFieldsConfig struct { // by refinery's own GRPC server: // https://pkg.go.dev/google.golang.org/grpc/keepalive#ServerParameters type GRPCServerParameters struct { - Enabled bool `yaml:"Enabled" default:"true"` - ListenAddr string `yaml:"ListenAddr" cmdenv:"GRPCListenAddr"` - MaxConnectionIdle Duration `yaml:"MaxConnectionIdle" default:"1m"` - MaxConnectionAge Duration `yaml:"MaxConnectionAge" default:"3m"` - MaxConnectionAgeGrace Duration `yaml:"MaxConnectionAgeGrace" default:"1m"` - KeepAlive Duration `yaml:"KeepAlive" default:"1m"` - KeepAliveTimeout Duration `yaml:"KeepAliveTimeout" default:"20s"` - MaxSendMsgSize MemorySize `yaml:"MaxSendMsgSize" default:"5MB"` - MaxRecvMsgSize MemorySize `yaml:"MaxRecvMsgSize" default:"5MB"` + Enabled *DefaultTrue `yaml:"Enabled" default:"true"` // Avoid pointer woe on access, use GetGRPCEnabled() instead. 
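+ // A nil Enabled (field omitted from YAML) reads as true via DefaultTrue.Get(), so gRPC stays on by default while an explicit "Enabled: false" still turns it off.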
+ ListenAddr string `yaml:"ListenAddr" cmdenv:"GRPCListenAddr"` + MaxConnectionIdle Duration `yaml:"MaxConnectionIdle" default:"1m"` + MaxConnectionAge Duration `yaml:"MaxConnectionAge" default:"3m"` + MaxConnectionAgeGrace Duration `yaml:"MaxConnectionAgeGrace" default:"1m"` + KeepAlive Duration `yaml:"KeepAlive" default:"1m"` + KeepAliveTimeout Duration `yaml:"KeepAliveTimeout" default:"20s"` + MaxSendMsgSize MemorySize `yaml:"MaxSendMsgSize" default:"15MB"` + MaxRecvMsgSize MemorySize `yaml:"MaxRecvMsgSize" default:"15MB"` } type SampleCacheConfig struct { @@ -253,22 +380,22 @@ type StressReliefConfig struct { DeactivationLevel uint `yaml:"DeactivationLevel" default:"75"` SamplingRate uint64 `yaml:"SamplingRate" default:"100"` MinimumActivationDuration Duration `yaml:"MinimumActivationDuration" default:"10s"` - MinimumStartupDuration Duration `yaml:"MinimumStartupDuration" default:"3s"` } type FileConfigError struct { - ConfigLocation string - ConfigFailures []string - RulesLocation string - RulesFailures []string + ConfigLocations []string + ConfigFailures []string + RulesLocations []string + RulesFailures []string } func (e *FileConfigError) Error() string { var msg strings.Builder if len(e.ConfigFailures) > 0 { - msg.WriteString("Validation failed for config file ") - msg.WriteString(e.ConfigLocation) - msg.WriteString(":\n") + loc := strings.Join(e.ConfigLocations, ", ") + msg.WriteString("Validation failed for config [") + msg.WriteString(loc) + msg.WriteString("]:\n") for _, fail := range e.ConfigFailures { msg.WriteString(" ") msg.WriteString(fail) @@ -276,9 +403,10 @@ func (e *FileConfigError) Error() string { } } if len(e.RulesFailures) > 0 { - msg.WriteString("Validation failed for rules file ") - msg.WriteString(e.RulesLocation) - msg.WriteString(":\n") + loc := strings.Join(e.RulesLocations, ", ") + msg.WriteString("Validation failed for config [") + msg.WriteString(loc) + msg.WriteString("]:\n") for _, fail := range e.RulesFailures { msg.WriteString(" ") msg.WriteString(fail) @@ -296,35 +424,35 @@ func (e *FileConfigError) Error() string { func newFileConfig(opts *CmdEnv) (*fileConfig, error) { // If we're not validating, skip this part if !opts.NoValidate { - cfgFails, err := validateConfig(opts) + cfgFails, err := validateConfigs(opts) if err != nil { return nil, err } - ruleFails, err := validateRules(opts.RulesLocation) + ruleFails, err := validateRules(opts.RulesLocations) if err != nil { return nil, err } if len(cfgFails) > 0 || len(ruleFails) > 0 { return nil, &FileConfigError{ - ConfigLocation: opts.ConfigLocation, - ConfigFailures: cfgFails, - RulesLocation: opts.RulesLocation, - RulesFailures: ruleFails, + ConfigLocations: opts.ConfigLocations, + ConfigFailures: cfgFails, + RulesLocations: opts.RulesLocations, + RulesFailures: ruleFails, } } } // Now load the files mainconf := &configContents{} - mainhash, err := readConfigInto(mainconf, opts.ConfigLocation, opts) + mainhash, err := readConfigInto(mainconf, opts.ConfigLocations, opts) if err != nil { return nil, err } var rulesconf *V2SamplerConfig - ruleshash, err := readConfigInto(&rulesconf, opts.RulesLocation, nil) + ruleshash, err := readConfigInto(&rulesconf, opts.RulesLocations, nil) if err != nil { return nil, err } @@ -380,90 +508,75 @@ func NewConfig(opts *CmdEnv, errorCallback func(error)) (Config, error) { os.Exit(0) } - cfg.callbacks = make([]func(), 0) + cfg.callbacks = make([]ConfigReloadCallback, 0) cfg.errorCallback = errorCallback - if cfg.mainConfig.General.ConfigReloadInterval > 0 { - go 
cfg.monitor() - } - return cfg, err } -func (f *fileConfig) monitor() { - f.done = make(chan struct{}) - // adjust the time by +/- 10% to avoid everyone reloading at the same time - reload := time.Duration(float64(f.mainConfig.General.ConfigReloadInterval) * (0.9 + 0.2*rand.Float64())) - f.ticker = time.NewTicker(time.Duration(reload)) - for { - select { - case <-f.done: - return - case <-f.ticker.C: - // reread the configs - cfg, err := newFileConfig(f.opts) - if err != nil { - f.errorCallback(err) - continue - } +// Reload attempts to reload the configuration; if it has changed, it stores the +// new data and calls the reload callbacks. +func (f *fileConfig) Reload() { + // reread the configs + cfg, err := newFileConfig(f.opts) + if err != nil { + f.errorCallback(err) + return + } - // if nothing's changed, we're fine - if f.mainHash == cfg.mainHash && f.rulesHash == cfg.rulesHash { - continue - } + // if nothing's changed, we're fine + if f.mainHash == cfg.mainHash && f.rulesHash == cfg.rulesHash { + return + } - // otherwise, update our state and call the callbacks - f.mux.Lock() - f.mainConfig = cfg.mainConfig - f.mainHash = cfg.mainHash - f.rulesConfig = cfg.rulesConfig - f.rulesHash = cfg.rulesHash - f.mux.Unlock() // can't defer -- routine never ends, and callbacks will deadlock - for _, cb := range f.callbacks { - cb() - } - } + // otherwise, update our state and call the callbacks + f.mux.Lock() + f.mainConfig = cfg.mainConfig + f.mainHash = cfg.mainHash + f.rulesConfig = cfg.rulesConfig + f.rulesHash = cfg.rulesHash + f.mux.Unlock() // can't defer -- we don't want callbacks to deadlock + + for _, cb := range f.callbacks { + cb(cfg.mainHash, cfg.rulesHash) } } -// Stop halts the monitor goroutine -func (f *fileConfig) Stop() { - if f.ticker != nil { - f.ticker.Stop() - } - if f.done != nil { - close(f.done) - f.done = nil - } +// GetHashes returns the current hash values for the main and rules configs. 
+func (f *fileConfig) GetHashes() (cfg string, rules string) { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainHash, f.rulesHash } -func (f *fileConfig) RegisterReloadCallback(cb func()) { +func (f *fileConfig) RegisterReloadCallback(cb ConfigReloadCallback) { f.mux.Lock() defer f.mux.Unlock() f.callbacks = append(f.callbacks, cb) } -func (f *fileConfig) GetListenAddr() (string, error) { +func (f *fileConfig) GetListenAddr() string { f.mux.RLock() defer f.mux.RUnlock() _, _, err := net.SplitHostPort(f.mainConfig.Network.ListenAddr) if err != nil { - return "", err + return "" } - return f.mainConfig.Network.ListenAddr, nil + return f.mainConfig.Network.ListenAddr } -func (f *fileConfig) GetPeerListenAddr() (string, error) { +func (f *fileConfig) GetPeerListenAddr() string { f.mux.RLock() defer f.mux.RUnlock() _, _, err := net.SplitHostPort(f.mainConfig.Network.PeerListenAddr) if err != nil { - return "", err + return "" } - return f.mainConfig.Network.PeerListenAddr, nil + return f.mainConfig.Network.PeerListenAddr } func (f *fileConfig) GetHTTPIdleTimeout() time.Duration { @@ -477,16 +590,17 @@ func (f *fileConfig) GetCompressPeerCommunication() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Specialized.CompressPeerCommunication + return f.mainConfig.Specialized.CompressPeerCommunication.Get() } func (f *fileConfig) GetGRPCEnabled() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.GRPCServerParameters.Enabled + + return f.mainConfig.GRPCServerParameters.Enabled.Get() } -func (f *fileConfig) GetGRPCListenAddr() (string, error) { +func (f *fileConfig) GetGRPCListenAddr() string { f.mux.RLock() defer f.mux.RUnlock() @@ -494,10 +608,10 @@ func (f *fileConfig) GetGRPCListenAddr() (string, error) { if f.mainConfig.GRPCServerParameters.ListenAddr != "" { _, _, err := net.SplitHostPort(f.mainConfig.GRPCServerParameters.ListenAddr) if err != nil { - return "", err + return "" } } - return f.mainConfig.GRPCServerParameters.ListenAddr, nil + return f.mainConfig.GRPCServerParameters.ListenAddr } func (f *fileConfig) GetGRPCConfig() GRPCServerParameters { @@ -507,52 +621,60 @@ func (f *fileConfig) GetGRPCConfig() GRPCServerParameters { return f.mainConfig.GRPCServerParameters } -func (f *fileConfig) IsAPIKeyValid(key string) bool { +func (f *fileConfig) GetTracesConfig() TracesConfig { f.mux.RLock() defer f.mux.RUnlock() - if !f.mainConfig.AccessKeys.AcceptOnlyListedKeys { - return true - } + return f.mainConfig.Traces +} - // if we haven't built the keymap yet, do it now - if f.mainConfig.AccessKeys.keymap == nil { - f.mainConfig.AccessKeys.keymap = make(map[string]struct{}) - for _, key := range f.mainConfig.AccessKeys.ReceiveKeys { - f.mainConfig.AccessKeys.keymap[key] = struct{}{} - } - } +func (f *fileConfig) GetAccessKeyConfig() AccessKeyConfig { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.AccessKeys +} + +func (f *fileConfig) GetPeerManagementType() string { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.PeerManagement.Type +} - _, ok := f.mainConfig.AccessKeys.keymap[key] - return ok +func (f *fileConfig) GetPeers() []string { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.PeerManagement.Peers } -func (f *fileConfig) GetPeerManagementType() (string, error) { +func (f *fileConfig) GetRedisPeerManagement() RedisPeerManagementConfig { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.PeerManagement.Type, nil + return f.mainConfig.RedisPeerManagement } -func (f *fileConfig) GetPeers() 
([]string, error) { +func (f *fileConfig) GetRedisHost() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.PeerManagement.Peers, nil + return f.mainConfig.RedisPeerManagement.Host } -func (f *fileConfig) GetRedisHost() (string, error) { +func (f *fileConfig) GetRedisClusterHosts() []string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.Host, nil + return f.mainConfig.RedisPeerManagement.ClusterHosts } -func (f *fileConfig) GetRedisUsername() (string, error) { +func (f *fileConfig) GetRedisUsername() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.Username, nil + return f.mainConfig.RedisPeerManagement.Username } func (f *fileConfig) GetRedisPrefix() string { @@ -562,18 +684,18 @@ func (f *fileConfig) GetRedisPrefix() string { return f.mainConfig.RedisPeerManagement.Prefix } -func (f *fileConfig) GetRedisPassword() (string, error) { +func (f *fileConfig) GetRedisPassword() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.Password, nil + return f.mainConfig.RedisPeerManagement.Password } -func (f *fileConfig) GetRedisAuthCode() (string, error) { +func (f *fileConfig) GetRedisAuthCode() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.AuthCode, nil + return f.mainConfig.RedisPeerManagement.AuthCode } func (f *fileConfig) GetRedisDatabase() int { @@ -583,46 +705,46 @@ func (f *fileConfig) GetRedisDatabase() int { return f.mainConfig.RedisPeerManagement.Database } -func (f *fileConfig) GetUseTLS() (bool, error) { +func (f *fileConfig) GetUseTLS() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.UseTLS, nil + return f.mainConfig.RedisPeerManagement.UseTLS } -func (f *fileConfig) GetUseTLSInsecure() (bool, error) { +func (f *fileConfig) GetUseTLSInsecure() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.RedisPeerManagement.UseTLSInsecure, nil + return f.mainConfig.RedisPeerManagement.UseTLSInsecure } -func (f *fileConfig) GetIdentifierInterfaceName() (string, error) { +func (f *fileConfig) GetIdentifierInterfaceName() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.PeerManagement.IdentifierInterfaceName, nil + return f.mainConfig.PeerManagement.IdentifierInterfaceName } -func (f *fileConfig) GetUseIPV6Identifier() (bool, error) { +func (f *fileConfig) GetUseIPV6Identifier() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.PeerManagement.UseIPV6Identifier, nil + return f.mainConfig.PeerManagement.UseIPV6Identifier } -func (f *fileConfig) GetRedisIdentifier() (string, error) { +func (f *fileConfig) GetRedisIdentifier() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.PeerManagement.Identifier, nil + return f.mainConfig.PeerManagement.Identifier } -func (f *fileConfig) GetHoneycombAPI() (string, error) { +func (f *fileConfig) GetHoneycombAPI() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Network.HoneycombAPI, nil + return f.mainConfig.Network.HoneycombAPI } func (f *fileConfig) GetLoggerLevel() Level { @@ -632,40 +754,40 @@ func (f *fileConfig) GetLoggerLevel() Level { return f.mainConfig.Logger.Level } -func (f *fileConfig) GetLoggerType() (string, error) { +func (f *fileConfig) GetLoggerType() string { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Logger.Type, nil + return f.mainConfig.Logger.Type } -func (f *fileConfig) GetHoneycombLoggerConfig() (HoneycombLoggerConfig, error) { +func (f 
*fileConfig) GetHoneycombLoggerConfig() HoneycombLoggerConfig { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.HoneycombLogger, nil + return f.mainConfig.HoneycombLogger } -func (f *fileConfig) GetStdoutLoggerConfig() (StdoutLoggerConfig, error) { +func (f *fileConfig) GetStdoutLoggerConfig() StdoutLoggerConfig { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.StdoutLogger, nil + return f.mainConfig.StdoutLogger } -func (f *fileConfig) GetAllSamplerRules() (*V2SamplerConfig, error) { +func (f *fileConfig) GetAllSamplerRules() *V2SamplerConfig { f.mux.RLock() defer f.mux.RUnlock() // This is probably good enough for debug; if not we can extend it. - return f.rulesConfig, nil + return f.rulesConfig } // GetSamplerConfigForDestName returns the sampler config for the given // destination (environment, or dataset in classic mode), as well as the name of // the sampler type. If the specific destination is not found, it returns the // default sampler config. -func (f *fileConfig) GetSamplerConfigForDestName(destname string) (any, string, error) { +func (f *fileConfig) GetSamplerConfigForDestName(destname string) (any, string) { f.mux.RLock() defer f.mux.RUnlock() @@ -674,23 +796,19 @@ func (f *fileConfig) GetSamplerConfigForDestName(destname string) (any, string, nameToUse = destname } - err := errors.New("no sampler found and no default configured") name := "not found" var cfg any if sampler, ok := f.rulesConfig.Samplers[nameToUse]; ok { cfg, name = sampler.Sampler() - if cfg != nil { - err = nil - } } - return cfg, name, err + return cfg, name } -func (f *fileConfig) GetCollectionConfig() (CollectionConfig, error) { +func (f *fileConfig) GetCollectionConfig() CollectionConfig { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Collection, nil + return f.mainConfig.Collection } func (f *fileConfig) GetLegacyMetricsConfig() LegacyMetricsConfig { @@ -714,34 +832,6 @@ func (f *fileConfig) GetOTelMetricsConfig() OTelMetricsConfig { return f.mainConfig.OTelMetrics } -func (f *fileConfig) GetSendDelay() (time.Duration, error) { - f.mux.RLock() - defer f.mux.RUnlock() - - return time.Duration(f.mainConfig.Traces.SendDelay), nil -} - -func (f *fileConfig) GetBatchTimeout() time.Duration { - f.mux.RLock() - defer f.mux.RUnlock() - - return time.Duration(f.mainConfig.Traces.BatchTimeout) -} - -func (f *fileConfig) GetTraceTimeout() (time.Duration, error) { - f.mux.RLock() - defer f.mux.RUnlock() - - return time.Duration(f.mainConfig.Traces.TraceTimeout), nil -} - -func (f *fileConfig) GetMaxBatchSize() uint { - f.mux.RLock() - defer f.mux.RUnlock() - - return f.mainConfig.Traces.MaxBatchSize -} - func (f *fileConfig) GetUpstreamBufferSize() int { f.mux.RLock() defer f.mux.RUnlock() @@ -756,22 +846,15 @@ func (f *fileConfig) GetPeerBufferSize() int { return f.mainConfig.BufferSizes.PeerBufferSize } -func (f *fileConfig) GetSendTickerValue() time.Duration { - f.mux.RLock() - defer f.mux.RUnlock() - - return time.Duration(f.mainConfig.Traces.SendTicker) -} - -func (f *fileConfig) GetDebugServiceAddr() (string, error) { +func (f *fileConfig) GetDebugServiceAddr() string { f.mux.RLock() defer f.mux.RUnlock() _, _, err := net.SplitHostPort(f.mainConfig.Debugging.DebugServiceAddr) if err != nil { - return "", err + return "" } - return f.mainConfig.Debugging.DebugServiceAddr, nil + return f.mainConfig.Debugging.DebugServiceAddr } func (f *fileConfig) GetIsDryRun() bool { @@ -785,7 +868,7 @@ func (f *fileConfig) GetAddHostMetadataToTrace() bool { f.mux.RLock() defer f.mux.RUnlock() - 
return f.mainConfig.Telemetry.AddHostMetadataToTrace + return f.mainConfig.Telemetry.AddHostMetadataToTrace.Get() } func (f *fileConfig) GetAddRuleReasonToTrace() bool { @@ -802,6 +885,13 @@ func (f *fileConfig) GetEnvironmentCacheTTL() time.Duration { return time.Duration(f.mainConfig.Specialized.EnvironmentCacheTTL) } +func (f *fileConfig) GetOTelTracingConfig() OTelTracingConfig { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.OTelTracing +} + func (f *fileConfig) GetDatasetPrefix() string { f.mux.RLock() defer f.mux.RUnlock() @@ -809,6 +899,13 @@ func (f *fileConfig) GetDatasetPrefix() string { return f.mainConfig.General.DatasetPrefix } +func (f *fileConfig) GetGeneralConfig() GeneralConfig { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.mainConfig.General +} + func (f *fileConfig) GetQueryAuthToken() string { f.mux.RLock() defer f.mux.RUnlock() @@ -834,7 +931,7 @@ func (f *fileConfig) GetAddSpanCountToRoot() bool { f.mux.RLock() defer f.mux.RUnlock() - return f.mainConfig.Telemetry.AddSpanCountToRoot + return f.mainConfig.Telemetry.AddSpanCountToRoot.Get() } func (f *fileConfig) GetAddCountsToRoot() bool { @@ -876,13 +973,13 @@ func (f *fileConfig) GetConfigMetadata() []ConfigMetadata { ret := make([]ConfigMetadata, 2) ret[0] = ConfigMetadata{ Type: "config", - ID: f.opts.ConfigLocation, + ID: strings.Join(f.opts.ConfigLocations, ", "), Hash: f.mainHash, LoadedAt: f.lastLoadTime.Format(time.RFC3339), } ret[1] = ConfigMetadata{ Type: "rules", - ID: f.opts.RulesLocation, + ID: strings.Join(f.opts.RulesLocations, ", "), Hash: f.rulesHash, LoadedAt: f.lastLoadTime.Format(time.RFC3339), } diff --git a/config/file_config_test.go b/config/file_config_test.go new file mode 100644 index 0000000000..3c1b868fd5 --- /dev/null +++ b/config/file_config_test.go @@ -0,0 +1,81 @@ +package config + +import "testing" + +func TestAccessKeyConfig_CheckAndMaybeReplaceKey(t *testing.T) { + type fields struct { + ReceiveKeys []string + SendKey string + SendKeyMode string + AcceptOnlyListedKeys bool + } + + fNone := fields{} + fRcvAccept := fields{ + ReceiveKeys: []string{"key1", "key2"}, + AcceptOnlyListedKeys: true, + } + fSendAll := fields{ + ReceiveKeys: []string{"key1", "key2"}, + SendKey: "sendkey", + SendKeyMode: "all", + } + fListed := fields{ + ReceiveKeys: []string{"key1", "key2"}, + SendKey: "sendkey", + SendKeyMode: "listedonly", + } + fMissing := fields{ + ReceiveKeys: []string{"key1", "key2"}, + SendKey: "sendkey", + SendKeyMode: "missingonly", + } + fUnlisted := fields{ + ReceiveKeys: []string{"key1", "key2"}, + SendKey: "sendkey", + SendKeyMode: "unlisted", + } + + tests := []struct { + name string + fields fields + apiKey string + want string + wantErr bool + }{ + {"empty", fNone, "userkey", "userkey", false}, + {"acceptonly known key", fRcvAccept, "key1", "key1", false}, + {"acceptonly unknown key", fRcvAccept, "badkey", "", true}, + {"acceptonly missing key", fRcvAccept, "", "", true}, + {"send all known", fSendAll, "key1", "sendkey", false}, + {"send all unknown", fSendAll, "userkey", "sendkey", false}, + {"send all missing", fSendAll, "", "sendkey", false}, + {"listed known", fListed, "key1", "sendkey", false}, + {"listed unknown", fListed, "userkey", "userkey", false}, + {"listed missing", fListed, "", "", true}, + {"missing known", fMissing, "key1", "key1", false}, + {"missing unknown", fMissing, "userkey", "userkey", false}, + {"missing missing", fMissing, "", "sendkey", false}, + {"unlisted known", fUnlisted, "key1", "key1", false}, + {"unlisted unknown", 
fUnlisted, "userkey", "sendkey", false}, + {"unlisted missing", fUnlisted, "", "", true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &AccessKeyConfig{ + ReceiveKeys: tt.fields.ReceiveKeys, + SendKey: tt.fields.SendKey, + SendKeyMode: tt.fields.SendKeyMode, + AcceptOnlyListedKeys: tt.fields.AcceptOnlyListedKeys, + } + got, err := a.CheckAndMaybeReplaceKey(tt.apiKey) + if (err != nil) != tt.wantErr { + t.Errorf("AccessKeyConfig.CheckAndMaybeReplaceKey() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("AccessKeyConfig.CheckAndMaybeReplaceKey() = '%v', want '%v'", got, tt.want) + } + }) + } +} diff --git a/config/memorysize.go b/config/memorysize.go index cb6c3c708d..3199c2d1e8 100644 --- a/config/memorysize.go +++ b/config/memorysize.go @@ -25,7 +25,7 @@ const ( Pi = 1024 * Ti Ei = 1024 * Pi - invalidSizeError = "invalid size: %s" + InvalidSizeError = "invalid size: %s" ) var unitSlice = []uint64{ @@ -113,10 +113,10 @@ func (m MemorySize) MarshalText() ([]byte, error) { func (m *MemorySize) UnmarshalText(text []byte) error { txt := string(text) - r := regexp.MustCompile(`^\s*(?P[0-9\._]+)(?P[a-zA-Z]*)\s*$`) + r := regexp.MustCompile(`^\s*(?P[0-9._]+)(?P[a-zA-Z]*)\s*$`) matches := r.FindStringSubmatch(strings.ToLower(txt)) if matches == nil { - return fmt.Errorf(invalidSizeError, txt) + return fmt.Errorf(InvalidSizeError, txt) } var number float64 @@ -131,7 +131,7 @@ func (m *MemorySize) UnmarshalText(text []byte) error { case "number": number, err = strconv.ParseFloat(matches[i], 64) if err != nil { - return fmt.Errorf(invalidSizeError, text) + return fmt.Errorf(InvalidSizeError, text) } case "unit": unit = matches[i] @@ -146,7 +146,7 @@ func (m *MemorySize) UnmarshalText(text []byte) error { } else { scalar, ok := unitMap[unit] if !ok { - return fmt.Errorf(invalidSizeError, text) + return fmt.Errorf(InvalidSizeError, text) } *m = MemorySize(number * float64(scalar)) } diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index d066f605d2..43ee1425dc 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -72,8 +72,11 @@ groups: up to 10% to avoid all instances refreshing together. In installations where configuration changes are handled by restarting Refinery, which is often the case when using Kubernetes, disable this feature with a - value of `0s`. If the config file is being loaded from a URL, it may - be wise to increase this value to avoid overloading the file server. + value of `0s`. As of Refinery v2.7, news of a configuration change is + immediately propagated to all peers, and they will attempt to reload + their configurations. Note that external factors (for example, + Kubernetes ConfigMaps) may cause delays in propagating configuration + changes. - name: Network title: "Network Configuration" @@ -114,8 +117,8 @@ groups: - name: HTTPIdleTimeout type: duration valuetype: nondefault - firstversion: v2.2 reload: false + firstversion: v2.2 default: 0s validations: - type: minOrZero @@ -178,6 +181,44 @@ groups: If `false`, then all traffic is accepted and `ReceiveKeys` is ignored. + This setting is applied **before** the `SendKey` and `SendKeyMode` settings. 
+ + - name: SendKey + type: string + pattern: apikey + valuetype: nondefault + default: "" + example: "SetThisToAHoneycombKey" + reload: true + validations: + - type: format + arg: apikeyOrBlank + summary: is an optional Honeycomb API key that Refinery can use to send data to Honeycomb, depending on configuration. + description: > + If `SendKey` is set to a valid Honeycomb key, then Refinery can use + the listed key to send data. + The exact behavior depends on the value of `SendKeyMode`. + + - name: SendKeyMode + type: string + valuetype: choice + choices: ["none", "all", "nonblank", "listedonly", "unlisted", "missingonly"] + default: "none" + reload: true + summary: controls how SendKey is used to replace or augment API keys used in incoming telemetry. + description: > + Controls how SendKey is used to replace or supply API keys used in + incoming telemetry. If `AcceptOnlyListedKeys` is `true`, then + `SendKey` will only be used for events with keys listed in + `ReceiveKeys`. + + `none` uses the incoming key for all telemetry (default). + `all` overwrites all keys, even missing ones, with `SendKey`. + `nonblank` overwrites all supplied keys but will not inject `SendKey` if the incoming key is blank. + `listedonly` overwrites only the keys listed in `ReceiveKeys`. + `unlisted` uses the `SendKey` for all events *except* those with keys listed in `ReceiveKeys`, which use their original keys. + `missingonly` uses the `SendKey` only to inject keys into events with blank keys. All other events use their original keys. + - name: RefineryTelemetry title: "Refinery Telemetry" description: contains configuration information for the telemetry that Refinery uses to record its own operation. @@ -198,18 +239,18 @@ groups: This setting also includes the field `meta.refinery.send_reason`, which contains the reason that the trace was sent. Possible values of this field are `trace_send_got_root`, which means that the root span - arrived; `trace_send_expired`, which means that TraceTimeout was reached; + arrived; `trace_send_expired`, which means that `TraceTimeout` was reached; `trace_send_ejected_full`, which means that the trace cache was full; and - `trace_send_ejected_memsize`, which means that refinery was out of memory. + `trace_send_ejected_memsize`, which means that Refinery was out of memory. - These names are also the names of metrics that refinery tracks. + These names are also the names of metrics that Refinery tracks. We recommend enabling this setting whenever a rules-based sampler is in use, as it is useful for debugging and understanding the behavior of your Refinery installation. - name: AddSpanCountToRoot - type: bool + type: defaulttrue valuetype: nondefault default: true reload: true @@ -226,6 +267,7 @@ groups: - name: AddCountsToRoot type: bool valuetype: nondefault + firstversion: v2.2 default: false reload: true summary: controls whether to add metadata fields to root spans that indicate the number of child spans, span events, span links, and honeycomb events. @@ -242,7 +284,7 @@ groups: - `meta.event_count`: the number of honeycomb events on the trace - name: AddHostMetadataToTrace - type: bool + type: defaulttrue valuetype: nondefault default: true reload: true @@ -260,14 +302,18 @@ groups: type: duration valuetype: nondefault default: 2s + validations: + - type: minimum + arg: 100ms reload: true - summary: is the duration to wait before sending a trace. + summary: is the duration to wait after the root span arrives before sending a trace.
description: > - This setting is a short timer that is triggered when a trace is - complete. Refinery waits for this duration before sending the trace. - The reason for this setting is to allow for small network delays or - clock jitters to elapse and any final spans to arrive before sending - the trace. Set to "0" for immediate sending. + This setting is a short timer that is triggered when a trace is marked + complete by the arrival of the root span. Refinery waits for this + duration before sending the trace. This setting exists to allow for + asynchronous spans and small network delays to elapse before sending + the trace. `SendDelay` is not applied if the `TraceTimeout` expires or + the `SpanLimit` is reached. - name: BatchTimeout type: duration @@ -282,6 +328,9 @@ groups: type: duration valuetype: nondefault default: 60s + validations: + - type: minimum + arg: 1s reload: true summary: is the duration to wait before making the trace decision on an incomplete trace. description: > @@ -290,13 +339,34 @@ groups: Normally trace decisions (send or drop) are made when the root span arrives. Sometimes the root span never arrives (for example, due to - crashes) and this timer ensures sending a trace even without having - received the root span. + crashes). Once this timer fires, Refinery will make a trace decision + based on the spans that have arrived so far. This ensures sending a + trace even when the root span never arrives. + + After the trace decision has been made, Refinery retains a record of + that decision for a period of time. When additional spans (including + the root span) arrive, they will be kept or dropped based on the + original decision. If particularly long-lived traces are present in your data, then you should increase this timer. Note that this increase will also increase the memory requirements for Refinery. + - name: SpanLimit + type: int + valuetype: nondefault + default: 0 + reload: true + summary: is the number of spans after which a trace becomes eligible for a trace decision. + description: > + This setting helps to keep memory usage under control. If a trace has + more than this set number of spans, then it becomes eligible for a + trace decision. + + It's most helpful in a situation where a sudden burst of many spans in + a large trace hits Refinery all at once, causing memory usage to spike + and possibly crashing Refinery. + - name: MaxBatchSize type: int valuetype: nondefault @@ -351,7 +421,7 @@ groups: reload: false envvar: REFINERY_QUERY_AUTH_TOKEN commandline: query-auth-token - summary: is the token that must be specified to access the `/query` endpoint. + summary: is the token that must be specified to access the `/query` endpoint. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > This token must be specified with the header "X-Honeycomb-Refinery-Query" in order for a `/query` request to @@ -462,7 +532,7 @@ groups: validations: - type: format arg: apikey - summary: is the API key used to send Refinery's logs to Honeycomb. + summary: is the API key used to send Refinery's logs to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > It is recommended that you create a separate team and key for Refinery logs. 
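Stepping back to the `Traces` settings above: the interplay of `SendDelay`, `TraceTimeout`, and `SpanLimit` is easier to see as pseudologic. The sketch below is inferred solely from the descriptions in this file; Refinery's actual collector is more involved, and the span-limit reason string shown is a placeholder, not a documented value:

```go
package main

import "time"

// traceState is a schematic stand-in for Refinery's internal trace record.
type traceState struct {
	firstSpanAt time.Time // when the first span of the trace arrived
	rootSpanAt  time.Time // zero until the root span arrives
	spanCount   int
}

// decide sketches when a trace becomes eligible for a sampling decision.
// The first two reason strings follow meta.refinery.send_reason as
// documented earlier; the span-limit one is a placeholder name.
func decide(t traceState, now time.Time, sendDelay, traceTimeout time.Duration, spanLimit int) (string, bool) {
	// SpanLimit makes a trace eligible immediately; SendDelay is not applied.
	if spanLimit > 0 && t.spanCount > spanLimit {
		return "span_limit (placeholder)", true
	}
	// TraceTimeout fires even without a root span; SendDelay is not applied.
	if now.Sub(t.firstSpanAt) >= traceTimeout {
		return "trace_send_expired", true
	}
	// Root span arrived: wait SendDelay for straggling spans, then send.
	if !t.rootSpanAt.IsZero() && now.Sub(t.rootSpanAt) >= sendDelay {
		return "trace_send_got_root", true
	}
	return "", false
}

func main() {
	t := traceState{
		firstSpanAt: time.Now().Add(-3 * time.Second),
		rootSpanAt:  time.Now().Add(-2500 * time.Millisecond),
		spanCount:   12,
	}
	reason, ready := decide(t, time.Now(), 2*time.Second, 60*time.Second, 0)
	_, _ = reason, ready // "trace_send_got_root", true
}
```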
@@ -483,7 +553,7 @@ groups: - name: SamplerEnabled v1group: HoneycombLogger v1name: LoggerSamplerEnabled - type: bool + type: defaulttrue valuetype: nondefault default: true reload: false @@ -525,6 +595,7 @@ groups: valuetype: nondefault default: false reload: false + firstversion: v2.2 summary: controls whether logs are sampled before sending to `stdout`. description: > The sample rate is controlled by the `SamplerThroughput` setting. @@ -535,12 +606,14 @@ groups: default: 10 example: 10 reload: false + firstversion: v2.2 summary: is the sampling throughput for logs in events per second. description: > The sampling algorithm attempts to make sure that the average throughput approximates this value, while also ensuring that all unique logs arrive at `stdout` at least once per sampling period. + - name: PrometheusMetrics title: "Prometheus Metrics" description: contains configuration for Refinery's internally-generated metrics as made available through Prometheus. @@ -615,7 +688,7 @@ groups: arg: apikey envvar: REFINERY_HONEYCOMB_METRICS_API_KEY, HONEYCOMB_API_KEY commandline: legacy-metrics-api-key - summary: is the API key used by Refinery to send its metrics to Honeycomb. + summary: is the API key used by Refinery to send its metrics to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > It is recommended that you create a separate team and key for Refinery metrics. @@ -686,7 +759,7 @@ groups: envvar: REFINERY_OTEL_METRICS_API_KEY, HONEYCOMB_API_KEY commandline: otel-metrics-api-key firstversion: v2.0 - summary: is the API key used to send Honeycomb metrics via OpenTelemetry. + summary: is the API key used to send Honeycomb metrics via OpenTelemetry. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > It is recommended that you create a separate team and key for Refinery metrics. @@ -730,6 +803,79 @@ groups: compression costs may outweigh the benefits, in which case `none` may be used. + - name: OTelTracing + title: "OpenTelemetry Tracing" + description: contains configuration for Refinery's own tracing. + fields: + - name: Enabled + type: bool + valuetype: nondefault + default: false + reload: false + firstversion: v2.6 + summary: controls whether to send Refinery's own OpenTelemetry traces. + description: > + The setting specifies if Refinery sends traces. + + - name: APIHost + type: url + valuetype: nondefault + default: "https://api.honeycomb.io" + reload: false + firstversion: v2.6 + summary: is the URL of the OpenTelemetry API to which traces will be sent. + description: > + Refinery's internal traces will be sent to the `/v1/traces` + endpoint on this host. + + - name: APIKey + type: string + pattern: apikey + valuetype: nondefault + default: "" + example: "SetThisToAHoneycombKey" + reload: false + firstversion: v2.6 + envvar: REFINERY_HONEYCOMB_TRACES_API_KEY, REFINERY_HONEYCOMB_API_KEY + commandline: otel-traces-api-key + validations: + - type: format + arg: apikey + summary: is the API key used to send Refinery's traces to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. + description: > + It is recommended that you create a separate team and key for + Refinery telemetry. 
+ + If this value is blank, then Refinery will not set the + Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must + be set to a valid OpenTelemetry endpoint. + + - name: Dataset + type: string + valuetype: nondefault + default: "Refinery Traces" + reload: false + validations: + - type: notempty + firstversion: v2.6 + summary: is the Honeycomb dataset to which Refinery sends its OpenTelemetry metrics. + description: > + Only used if `APIKey` is specified. + + - name: SampleRate + type: int + valuetype: nondefault + default: 100 + validations: + - type: minimum + arg: 1 + reload: true + summary: is the rate at which Refinery samples its own traces. + description: > + This is the Honeycomb sample rate used to sample traces sent by Refinery. Since each + incoming span generates multiple outgoing spans, a minimum sample rate of `100` is + strongly advised. + - name: PeerManagement title: "Peer Management" description: controls how the Refinery cluster communicates between peers. @@ -748,11 +894,18 @@ groups: description: > Peer management is the mechanism by which Refinery locates its peers. - `file` means that Refinery gets its peer list from - the Peers list in this config file. + `file` means that Refinery gets its peer list from the Peers list in + this config file. It also prevents Refinery from using a publish/subscribe + mechanism to propagate peer lists, stress levels, and configuration changes. - `redis` means that Refinery self-registers with a Redis instance and - gets its peer list from there. + `redis` means that Refinery uses a Publish/Subscribe mechanism, + implemented on Redis, to propagate peer lists, stress levels, and + notification of configuration changes much more quickly than the + legacy mechanism. + + The recommended setting is `redis`, especially for new + installations. If `redis` is specified, fields in `RedisPeerManagement` + must also be set. `fly-dns` means that Refinery uses Fly internal DNS to find its peers @@ -808,22 +961,21 @@ groups: v1name: Peers type: stringarray valuetype: stringarray - example: "192.168.1.11:8081,192.168.1.12:8081" + example: "http://192.168.1.11:8081,http://192.168.1.12:8081" reload: false validations: - type: elementType - arg: hostport + arg: url summary: is the list of peers to use when Type is "file", excluding self. description: > This list is ignored when Type is "redis". The format is a list of - strings of the form "host:port". + strings of the form "scheme://host:port". - name: RedisPeerManagement title: "Redis Peer Management" description: > - controls how the Refinery cluster communicates - between peers when using Redis. Only applies when `PeerManagement.Type` - is "redis". + controls how the Refinery cluster communicates between peers when using + Redis. Does not apply when `PeerManagement.Type` is "file". fields: - name: Host @@ -839,6 +991,21 @@ groups: description: > Must be in the form `host:port`. + - name: ClusterHosts + type: stringarray + valuetype: stringarray + example: "- localhost:6379" + firstversion: v2.8 + validations: + - type: elementType + arg: hostport + reload: false + summary: is a list of host and port pairs for the instances in a Redis Cluster, used for managing peer cluster membership. + description: > + This configuration enables Refinery to connect to a Redis deployment set up in Cluster Mode. + Each entry in the list should follow the format `host:port`. + If `ClusterHosts` is specified, the `Host` setting will be ignored.
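The `ClusterHosts`-over-`Host` precedence documented here can be illustrated with a small constructor. This sketch uses the go-redis v9 client purely for illustration; it is not Refinery's actual connection code:

```go
package main

import "github.com/redis/go-redis/v9"

// newPeerStoreClient illustrates the documented precedence: when
// ClusterHosts is non-empty, Host is ignored and a cluster client is
// created instead of a single-node client.
func newPeerStoreClient(host string, clusterHosts []string, username, password string) redis.UniversalClient {
	if len(clusterHosts) > 0 {
		return redis.NewClusterClient(&redis.ClusterOptions{
			Addrs:    clusterHosts, // each entry is "host:port"
			Username: username,
			Password: password,
		})
	}
	return redis.NewClient(&redis.Options{
		Addr:     host, // "host:port"
		Username: username,
		Password: password,
	})
}

func main() {
	c := newPeerStoreClient("localhost:6379", nil, "", "")
	defer c.Close()
}
```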
+ - name: Username v1group: PeerManagement v1name: RedisUsername @@ -848,7 +1015,7 @@ groups: reload: false envvar: REFINERY_REDIS_USERNAME commandline: redis-username - summary: is the username used to connect to Redis for peer cluster membership management. + summary: is the username used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > Many Redis installations do not use this field. @@ -861,7 +1028,7 @@ groups: reload: false envvar: REFINERY_REDIS_PASSWORD commandline: redis-password - summary: is the password used to connect to Redis for peer cluster membership management. + summary: is the password used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > Many Redis installations do not use this field. @@ -872,9 +1039,10 @@ groups: default: "" valuetype: nonemptystring reload: false + firstversion: v2.2 envvar: REFINERY_REDIS_AUTH_CODE commandline: redis-auth-code - summary: is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. + summary: is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. description: > Many Redis installations do not use this field. @@ -886,6 +1054,7 @@ groups: default: "refinery" example: "customPrefix" reload: false + lastversion: v2.6 validations: - type: notempty summary: is a string used as a prefix for the keys in Redis while @@ -903,6 +1072,7 @@ groups: default: 0 example: 1 reload: false + lastversion: v2.6 validations: - type: minimum arg: 0 @@ -996,18 +1166,20 @@ groups: arg: 1000 summary: is the number of traces to keep in the cache's circular buffer. description: > - The collection cache is used to collect all spans into a trace as - well as remember the sampling decision for any spans that might come - in after the trace has been marked "complete" (either by timing out - or seeing the root span). The number of traces in the cache should be - many multiples (100x to 1000x) of the total number of concurrently - active traces (trace throughput * trace duration). + The collection cache is used to collect all active spans into traces. + It is organized as a circular buffer. When the buffer wraps around, + Refinery will try a few times to find an empty slot; if it fails, it + starts ejecting traces from the cache earlier than would otherwise be + necessary. Ideally, the size of the cache should be many + multiples (100x to 1000x) of the total number of concurrently active + traces (average trace throughput * average trace duration). - name: PeerQueueSize type: int default: 30_000 valuetype: nondefault reload: false + firstversion: v2.2 summary: is the maximum number of in-flight spans redirected from other peers stored in the peer span queue. description: > The peer span queue serves as a buffer for spans redirected from other peers before they are processed. @@ -1021,6 +1193,7 @@ groups: default: 30_000 valuetype: nondefault reload: false + firstversion: v2.2 summary: is the number of in-flight spans to keep in the incoming span queue. 
description: > The incoming span queue is used to buffer spans before they are processed. @@ -1084,6 +1257,34 @@ supported. See `MaxMemoryPercentage` for more details. If set, `Collections.AvailableMemory` must not be defined. + - name: DisableRedistribution + type: bool + valuetype: nondefault + firstversion: v2.8 + default: false + reload: true + summary: controls whether to transmit traces in cache to remaining peers during a cluster scaling event. + description: > + If `true`, Refinery will NOT forward live traces in its cache to the rest of the peers when peers join or leave the cluster. + Disabling this behavior can help prevent disruptive bursts of network traffic when large traces with long `TraceTimeout` + are redistributed. + + - name: ShutdownDelay + type: duration + valuetype: nondefault + firstversion: v2.8 + default: 15s + reload: true + summary: controls the maximum time Refinery can use while draining traces at shutdown. + description: > + This setting controls the duration that Refinery expects to have to + drain in-process traces before shutting down an instance. When asked + to shut down gracefully, Refinery stops accepting new spans + immediately and drains the remaining traces by sending them to remaining + peers. + This value should be set to a bit less than the normal timeout period + for shutting down without forcibly terminating the process. + - name: BufferSizes title: "Buffer Sizes" description: > @@ -1141,7 +1342,7 @@ increase this value. - name: CompressPeerCommunication - type: bool + type: defaulttrue default: true valuetype: nondefault reload: false @@ -1246,10 +1447,10 @@ receive OpenTelemetry data in gRPC format. fields: - name: Enabled - type: bool + type: defaulttrue valuetype: conditional extra: "nonempty GRPCListenAddr" - default: false + default: true reload: false summary: specifies whether the gRPC server is enabled. description: > @@ -1354,7 +1555,7 @@ - name: MaxSendMsgSize type: memorysize valuetype: memorysize - default: 5MB + default: 15MB reload: false firstversion: v2.2 validations: @@ -1371,7 +1572,7 @@ - name: MaxRecvMsgSize type: memorysize valuetype: memorysize - default: 5MB + default: 15MB reload: false firstversion: v2.2 validations: @@ -1574,6 +1775,7 @@ v1name: MinimumStartupDuration type: duration valuetype: nondefault + lastversion: v2.6 default: 3s reload: true summary: is the minimum time that Stress Relief will stay enabled. diff --git a/config/metadata/rulesMeta.yaml b/config/metadata/rulesMeta.yaml index b23fb55245..3406a5892b 100644 --- a/config/metadata/rulesMeta.yaml +++ b/config/metadata/rulesMeta.yaml @@ -40,6 +40,9 @@ groups: The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. + A `SampleRate` of `1` or less will keep all traces. + + Specifying this value is required. - name: DynamicSampler title: Dynamic Sampler sortorder: 20 @@ -67,6 +70,7 @@ groups: description: > The duration after which the Dynamic Sampler should reset its internal counters. It should be specified as a duration string. For example, "30s" or "1m". + Defaults to "30s". - name: FieldList type: stringarray validations: @@ -126,9 +130,9 @@ groups: Indicates whether to include the trace length (number of spans in the trace) as part of the key. The number of spans is exact, so if there are normally small variations in trace length, we recommend setting - this field to `false`.
If your traces are consistent lengths and - changes in trace length is a useful indicator to view in Honeycomb, - then set this field to `true`. + this field to `false` (the default). If your traces are consistent + lengths and changes in trace length is a useful indicator to view in + Honeycomb, then set this field to `true`. - name: EMADynamicSampler title: EMA Dynamic Sampler @@ -169,7 +173,7 @@ description: > The duration after which the EMA Dynamic Sampler should recalculate its internal counters. It should be specified as a duration string. - For example, "30s" or "1m". + For example, `30s` or `1m`. Defaults to `15s`. - name: Weight type: float validations: @@ -181,10 +185,11 @@ description: > The weight to use when calculating the EMA. It should be a number between `0` and `1`. Larger values weight the average more toward - recent observations. In other words, a larger weight will cause - sample rates more quickly adapt to traffic patterns, while a smaller - weight will result in sample rates that are less sensitive to bursts - or drops in traffic and thus more consistent over time. + recent observations. In other words, a larger weight will cause sample + rates to adapt more quickly to traffic patterns, while a smaller weight + will result in sample rates that are less sensitive to bursts or drops + in traffic and thus more consistent over time. The default value is + `0.5`. - name: AgeOutValue type: float validations: @@ -198,7 +203,7 @@ The EMA of any key will approach `0` if it is not repeatedly observed, but will never truly reach it, so this field determines what constitutes "zero". Keys with averages below this threshold will be - removed from the EMA. Default is the same as `Weight`, as this + removed from the EMA. Default is the value of `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. This value should generally be less than (<=) `Weight`, unless you have very specific reasons to set it higher. @@ -282,7 +287,7 @@ description: > Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the - number of instances in the cluster. If `false`, then the goal + number of instances in the cluster. If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. - name: InitialSampleRate type: int @@ -291,7 +296,7 @@ `InitialSampleRate` is the sample rate to use during startup, before the sampler has accumulated enough data to calculate a reasonable throughput. This is mainly useful in situations where unsampled - throughput is high enough to cause problems. + throughput is high enough to cause problems. Default value is `10`. - name: AdjustmentInterval type: duration summary: is how often the sampler will recalculate the sample rate. @@ -364,9 +369,9 @@ A standard configuration would be to set `UpdateFrequency` to `1s` and `LookbackFrequency` to `30s`. In this configuration, for every second, we look back at the last 30 seconds of data in order to compute the new - sampling rate. The actual sampling rate computation is nearly identical - to the original Throughput Sampler, but this variant has better support - for floating point numbers. + sampling rate. The actual sampling rate computation is nearly identical to + the original Throughput Sampler, but this variant has better support for + floating point numbers and does a better job with less-common keys.
fields: - name: GoalThroughputPerSec @@ -390,14 +395,14 @@ groups: description: > Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the - number of instances in the cluster. If `false`, then the goal - throughput will be the value specified in `GoalThroughputPerSec`. + number of instances in the cluster. If `false` (the default), then the + goal throughput will be the value specified in `GoalThroughputPerSec`. - name: UpdateFrequency type: duration summary: is how often the sampling rate is recomputed. description: > The duration between sampling rate computations. It should be - specified as a duration string. For example, "30s" or "1m". + specified as a duration string. For example, `30s` or `1m`. Defaults to `1s`. - name: LookbackFrequency type: duration summary: how far back in time to look when computing the sampling rate. @@ -462,11 +467,12 @@ groups: description: > Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the - number of instances in the cluster. If `false`, then the goal + number of instances in the cluster. If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. - name: ClearFrequency type: duration - summary: is the duration over which the sampler will calculate the throughput. + summary: is the duration over which the sampler will calculate the throughput. It should be + specified as a duration string. For example, `30s` or `1m`. Defaults to `30s`. description: $DynamicSampler.ClearFrequency - name: FieldList type: stringarray @@ -508,14 +514,15 @@ groups: summary: indicates whether to expand nested JSON when evaluating rules. description: > Indicates whether to expand nested JSON when evaluating rules. If - false, nested JSON will be treated as a string. If `true`, nested JSON - will be expanded into a `map[string]interface{}` and the value of the - field will be the value of the nested field. For example, if you have - a field called `http.request.headers` and you want to check the value - of the `User-Agent` header, then you would set this to `true` and use - `http.request.headers.User-Agent` as the field name in your rule. This - is a computationally expensive option and may cause performance - problems if you have a large number of spans with nested JSON. + false (the default), nested JSON will be treated as a string. If + `true`, nested JSON will be expanded into a `map[string]interface{}` + and the value of the field will be the value of the nested field. For + example, if you have a field called `http.request.headers` and you + want to check the value of the `User-Agent` header, then you would set + this to `true` and use `http.request.headers.User-Agent` as the field + name in your rule. This is a computationally expensive option and may + cause performance problems if you have a large number of spans with + nested JSON. - name: Rules title: Rules for Rules-based Samplers @@ -584,9 +591,9 @@ groups: - span summary: controls the scope of the rule. description: > - Controls the scope of the rule evaluation. If set to "trace" (the + Controls the scope of the rule evaluation. If set to `trace` (the default), then each condition can apply to any span in the trace - independently. If set to "span", then all of the conditions in the + independently. 
If set to `span`, then all of the conditions in the rule will be evaluated against each span in the trace and the rule only succeeds if all of the conditions match on a single span together. @@ -603,10 +610,51 @@ - name: Field type: string summary: is the field to check. + validations: + - type: conflictsWith + arg: Fields description: > - The field to check. This can be any field in the trace. If the field + The field to check. This can name any field in the trace. If the field is not present, then the condition will not match. The comparison is case-sensitive. + + The field can also include a prefix that changes the span used for evaluation of the field. + The only prefix currently supported is `root`, as in `root.http.status`. Specifying `root.` + causes the condition to be evaluated against the root span. + + For example, if the `Field` is `root.url`, then the condition will be processed using the url + field from the root span. + + The setting `Scope: span` for a rule does not change the meaning of this prefix -- + the condition is still evaluated on the root span and is treated as if it were part of the + span being processed. + + When using the `root.` prefix on a field with a `not-exists` operator, include the `has-root-span: true` condition in the rule. + The `not-exists` condition on a `root.`-prefixed field will evaluate to false if the existence of the root span is not checked and the root span does not exist. + The primary reason a root span is not present on a trace when a sampling decision is being made is when the root span takes longer to complete than the configured TraceTimeout. + + - name: Fields + type: stringarray + valuetype: stringarray + validations: + - type: elementType + arg: string + - type: conflictsWith + arg: Field + description: > + An array of field names to check. These can name any field in the + trace. The fields are checked in the order defined here, and the first + named field that contains a value will be used for the condition. Only + the first populated field will be used, even if the condition fails. + + If a `root.` prefix is present on a field, but the root span is not on + the trace, that field will be skipped. + + If none of the fields are present, then the condition will not match. + The comparison is case-sensitive. + + All fields are checked as individual fields before any of them are + checked as nested fields (see `CheckNestedFields`). - name: Operator type: string valuetype: choice @@ -626,16 +674,25 @@ - not-exists - has-root-span - matches + - in + - not-in summary: is the comparison operator to use. description: > The comparison operator to use. String comparisons are case-sensitive. + For most cases, use negative operators (`!=`, `does-not-contain`, + `not-exists`, and `not-in`) in a rule with a scope of "span". + WARNING: Rules can have `Scope: trace` or `Scope: span`. Using a negative + operator with `Scope: trace` will cause the condition to be true if **any** + single span in the entire trace matches. Use `Scope: span` with negative + operators. - name: Value - type: anyscalar + type: sliceorscalar summary: is the value to compare against. description: > The value to compare against. If `Datatype` is not specified, then the value and the field will be compared based on the type of the - field. The `in` and `not-in` operators can accept a list of values, + which should all be of the same datatype.
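The new `in`/`not-in` operators and the list-valued `Value` are implemented in `config/sampler_config.go` later in this diff. A short usage sketch against that API (the field name and values here are hypothetical):

```go
package main

import (
	"fmt"

	"github.com/honeycombio/refinery/config"
)

func main() {
	// An `in` condition over an int list. Datatype is given explicitly,
	// following the best-practice note under Datatype below.
	cond := &config.RulesBasedSamplerCondition{
		Field:    "http.status_code",
		Operator: config.In,
		Value:    []any{500, 502, 503},
		Datatype: "int",
	}
	// Init wires up the Matches function for the chosen operator.
	if err := cond.Init(); err != nil {
		panic(err)
	}
	fmt.Println(cond.Matches(503, true)) // true
	fmt.Println(cond.Matches(200, true)) // false
}
```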
- name: Datatype type: string validations: @@ -654,3 +711,7 @@ groups: especially useful when a field like `http status code` may be rendered as strings by some environments and as numbers or booleans by others. + + The best practice is to always specify `Datatype`; this avoids + ambiguity, allows for more accurate comparisons, and offers a + minor performance improvement. diff --git a/config/mock.go b/config/mock.go index 9ea645fc2a..34497a3a10 100644 --- a/config/mock.go +++ b/config/mock.go @@ -1,7 +1,6 @@ package config import ( - "fmt" "sync" "time" ) @@ -9,64 +8,35 @@ import ( // MockConfig will respond with whatever config it's set to do during // initialization type MockConfig struct { - Callbacks []func() - IsAPIKeyValidFunc func(string) bool - GetCollectorTypeErr error + Callbacks []ConfigReloadCallback + GetAccessKeyConfigVal AccessKeyConfig GetCollectorTypeVal string - GetCollectionConfigErr error GetCollectionConfigVal CollectionConfig - GetHoneycombAPIErr error + GetTracesConfigVal TracesConfig GetHoneycombAPIVal string - GetListenAddrErr error GetListenAddrVal string - GetPeerListenAddrErr error GetPeerListenAddrVal string GetHTTPIdleTimeoutVal time.Duration GetCompressPeerCommunicationsVal bool GetGRPCEnabledVal bool - GetGRPCListenAddrErr error GetGRPCListenAddrVal string GetGRPCServerParameters GRPCServerParameters - GetLoggerTypeErr error GetLoggerTypeVal string - GetHoneycombLoggerConfigErr error GetHoneycombLoggerConfigVal HoneycombLoggerConfig - GetStdoutLoggerConfigErr error GetStdoutLoggerConfigVal StdoutLoggerConfig GetLoggerLevelVal Level - GetPeersErr error GetPeersVal []string - GetRedisHostErr error - GetRedisHostVal string - GetRedisUsernameErr error - GetRedisUsernameVal string - GetRedisPasswordErr error - GetRedisPasswordVal string - GetRedisAuthCodeErr error - GetRedisAuthCodeVal string - GetRedisDatabaseVal int - GetRedisPrefixVal string - GetUseTLSErr error - GetUseTLSVal bool - GetUseTLSInsecureErr error - GetUseTLSInsecureVal bool - GetSamplerTypeErr error + GetRedisPeerManagementVal RedisPeerManagementConfig GetSamplerTypeName string GetSamplerTypeVal interface{} - GetMetricsTypeErr error GetMetricsTypeVal string + GetGeneralConfigVal GeneralConfig GetLegacyMetricsConfigVal LegacyMetricsConfig GetPrometheusMetricsConfigVal PrometheusMetricsConfig GetOTelMetricsConfigVal OTelMetricsConfig - GetSendDelayErr error - GetSendDelayVal time.Duration - GetBatchTimeoutVal time.Duration - GetTraceTimeoutErr error - GetTraceTimeoutVal time.Duration - GetMaxBatchSizeVal uint + GetOTelTracingConfigVal OTelTracingConfig GetUpstreamBufferSizeVal int GetPeerBufferSizeVal int - SendTickerVal time.Duration IdentifierInterfaceName string UseIPV6Identifier bool RedisIdentifier string @@ -90,188 +60,160 @@ type MockConfig struct { TraceIdFieldNames []string ParentIdFieldNames []string CfgMetadata []ConfigMetadata + CfgHash string + RulesHash string Mux sync.RWMutex } -func (m *MockConfig) ReloadConfig() { +// assert that MockConfig implements Config +var _ Config = (*MockConfig)(nil) + +func (m *MockConfig) Reload() { m.Mux.RLock() defer m.Mux.RUnlock() for _, callback := range m.Callbacks { - callback() + callback("", "") } } -func (m *MockConfig) RegisterReloadCallback(callback func()) { +func (m *MockConfig) RegisterReloadCallback(callback ConfigReloadCallback) { m.Mux.Lock() m.Callbacks = append(m.Callbacks, callback) m.Mux.Unlock() } -func (m *MockConfig) IsAPIKeyValid(key string) bool { - m.Mux.RLock() - defer m.Mux.RUnlock() - - // if no function is set, assume the 
key is valid - if m.IsAPIKeyValidFunc == nil { - return true - } - - return m.IsAPIKeyValidFunc(key) -} - -func (m *MockConfig) GetCollectorType() (string, error) { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetCollectorTypeVal, m.GetCollectorTypeErr -} - -func (m *MockConfig) GetCollectionConfig() (CollectionConfig, error) { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetCollectionConfigVal, m.GetCollectionConfigErr -} - -func (m *MockConfig) GetHoneycombAPI() (string, error) { +func (m *MockConfig) GetHashes() (string, string) { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetHoneycombAPIVal, m.GetHoneycombAPIErr + return m.CfgHash, m.RulesHash } -func (m *MockConfig) GetListenAddr() (string, error) { +func (m *MockConfig) GetAccessKeyConfig() AccessKeyConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetListenAddrVal, m.GetListenAddrErr + return m.GetAccessKeyConfigVal } -func (m *MockConfig) GetPeerListenAddr() (string, error) { +func (m *MockConfig) GetCollectorType() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetPeerListenAddrVal, m.GetPeerListenAddrErr + return m.GetCollectorTypeVal } -func (m *MockConfig) GetHTTPIdleTimeout() time.Duration { +func (m *MockConfig) GetCollectionConfig() CollectionConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetHTTPIdleTimeoutVal + return m.GetCollectionConfigVal } -func (m *MockConfig) GetCompressPeerCommunication() bool { +func (m *MockConfig) GetTracesConfig() TracesConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetCompressPeerCommunicationsVal + return m.GetTracesConfigVal } -func (m *MockConfig) GetGRPCEnabled() bool { +func (m *MockConfig) GetHoneycombAPI() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetGRPCEnabledVal -} -func (m *MockConfig) GetGRPCListenAddr() (string, error) { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetGRPCListenAddrVal, m.GetGRPCListenAddrErr + return m.GetHoneycombAPIVal } -func (m *MockConfig) GetLoggerType() (string, error) { +func (m *MockConfig) GetListenAddr() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetLoggerTypeVal, m.GetLoggerTypeErr + return m.GetListenAddrVal } -func (m *MockConfig) GetHoneycombLoggerConfig() (HoneycombLoggerConfig, error) { +func (m *MockConfig) GetPeerListenAddr() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetHoneycombLoggerConfigVal, m.GetHoneycombLoggerConfigErr + return m.GetPeerListenAddrVal } -func (m *MockConfig) GetStdoutLoggerConfig() (StdoutLoggerConfig, error) { +func (m *MockConfig) GetHTTPIdleTimeout() time.Duration { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetStdoutLoggerConfigVal, m.GetStdoutLoggerConfigErr + return m.GetHTTPIdleTimeoutVal } -func (m *MockConfig) GetLoggerLevel() Level { +func (m *MockConfig) GetCompressPeerCommunication() bool { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetLoggerLevelVal + return m.GetCompressPeerCommunicationsVal } -func (m *MockConfig) GetPeers() ([]string, error) { +func (m *MockConfig) GetGRPCEnabled() bool { m.Mux.RLock() defer m.Mux.RUnlock() - - return m.GetPeersVal, m.GetPeersErr + return m.GetGRPCEnabledVal } -func (m *MockConfig) GetRedisHost() (string, error) { +func (m *MockConfig) GetGRPCListenAddr() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisHostVal, m.GetRedisHostErr + return m.GetGRPCListenAddrVal } -func (m *MockConfig) GetRedisUsername() (string, error) { +func (m *MockConfig) GetLoggerType() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisUsernameVal, 
m.GetRedisUsernameErr + return m.GetLoggerTypeVal } -func (m *MockConfig) GetRedisPassword() (string, error) { +func (m *MockConfig) GetHoneycombLoggerConfig() HoneycombLoggerConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisPasswordVal, m.GetRedisPasswordErr + return m.GetHoneycombLoggerConfigVal } -func (m *MockConfig) GetRedisAuthCode() (string, error) { +func (m *MockConfig) GetStdoutLoggerConfig() StdoutLoggerConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisAuthCodeVal, m.GetRedisAuthCodeErr + return m.GetStdoutLoggerConfigVal } -func (m *MockConfig) GetRedisPrefix() string { +func (m *MockConfig) GetLoggerLevel() Level { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisPrefixVal + return m.GetLoggerLevelVal } -func (m *MockConfig) GetRedisDatabase() int { +func (m *MockConfig) GetPeers() []string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetRedisDatabaseVal + return m.GetPeersVal } -func (m *MockConfig) GetUseTLS() (bool, error) { +func (m *MockConfig) GetRedisPeerManagement() RedisPeerManagementConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetUseTLSVal, m.GetUseTLSErr + return m.GetRedisPeerManagementVal } -func (m *MockConfig) GetUseTLSInsecure() (bool, error) { +func (m *MockConfig) GetGeneralConfig() GeneralConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetUseTLSInsecureVal, m.GetUseTLSInsecureErr + return m.GetGeneralConfigVal } func (m *MockConfig) GetLegacyMetricsConfig() LegacyMetricsConfig { @@ -295,45 +237,24 @@ func (m *MockConfig) GetOTelMetricsConfig() OTelMetricsConfig { return m.GetOTelMetricsConfigVal } -func (m *MockConfig) GetSendDelay() (time.Duration, error) { +func (m *MockConfig) GetOTelTracingConfig() OTelTracingConfig { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetSendDelayVal, m.GetSendDelayErr -} - -func (m *MockConfig) GetBatchTimeout() time.Duration { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetBatchTimeoutVal -} - -func (m *MockConfig) GetTraceTimeout() (time.Duration, error) { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetTraceTimeoutVal, m.GetTraceTimeoutErr -} - -func (m *MockConfig) GetMaxBatchSize() uint { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.GetMaxBatchSizeVal + return m.GetOTelTracingConfigVal } // TODO: allow per-dataset mock values -func (m *MockConfig) GetSamplerConfigForDestName(dataset string) (interface{}, string, error) { +func (m *MockConfig) GetSamplerConfigForDestName(dataset string) (interface{}, string) { m.Mux.RLock() defer m.Mux.RUnlock() - return m.GetSamplerTypeVal, m.GetSamplerTypeName, m.GetSamplerTypeErr + return m.GetSamplerTypeVal, m.GetSamplerTypeName } // GetAllSamplerRules normally returns all dataset rules, including the default // In this mock, it returns only the rules for "dataset1" according to the type of the value field -func (m *MockConfig) GetAllSamplerRules() (*V2SamplerConfig, error) { +func (m *MockConfig) GetAllSamplerRules() *V2SamplerConfig { m.Mux.RLock() defer m.Mux.RUnlock() @@ -350,14 +271,14 @@ func (m *MockConfig) GetAllSamplerRules() (*V2SamplerConfig, error) { case *TotalThroughputSamplerConfig: choice.TotalThroughputSampler = sampler default: - return nil, fmt.Errorf("unable to determine data format") + return nil } v := &V2SamplerConfig{ Samplers: map[string]*V2SamplerChoice{"dataset1": choice}, } - return v, m.GetSamplerTypeErr + return v } func (m *MockConfig) GetUpstreamBufferSize() int { @@ -374,46 +295,39 @@ func (m *MockConfig) GetPeerBufferSize() int { return m.GetPeerBufferSizeVal } -func 
(m *MockConfig) GetIdentifierInterfaceName() (string, error) { - m.Mux.RLock() - defer m.Mux.RUnlock() - - return m.IdentifierInterfaceName, nil -} - -func (m *MockConfig) GetUseIPV6Identifier() (bool, error) { +func (m *MockConfig) GetIdentifierInterfaceName() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.UseIPV6Identifier, nil + return m.IdentifierInterfaceName } -func (m *MockConfig) GetRedisIdentifier() (string, error) { +func (m *MockConfig) GetUseIPV6Identifier() bool { m.Mux.RLock() defer m.Mux.RUnlock() - return m.RedisIdentifier, nil + return m.UseIPV6Identifier } -func (m *MockConfig) GetSendTickerValue() time.Duration { +func (m *MockConfig) GetRedisIdentifier() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.SendTickerVal + return m.RedisIdentifier } -func (m *MockConfig) GetPeerManagementType() (string, error) { +func (m *MockConfig) GetPeerManagementType() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.PeerManagementType, nil + return m.PeerManagementType } -func (m *MockConfig) GetDebugServiceAddr() (string, error) { +func (m *MockConfig) GetDebugServiceAddr() string { m.Mux.RLock() defer m.Mux.RUnlock() - return m.DebugServiceAddr, nil + return m.DebugServiceAddr } func (m *MockConfig) GetIsDryRun() bool { diff --git a/config/sampler_config.go b/config/sampler_config.go index 19ea3c0a9b..37917b2a32 100644 --- a/config/sampler_config.go +++ b/config/sampler_config.go @@ -5,6 +5,8 @@ import ( "regexp" "strconv" "strings" + + "github.com/honeycombio/refinery/generics" ) // Define some constants for rule comparison operators @@ -30,6 +32,16 @@ const ( NotIn = "not-in" ) +// ComputedField is a virtual field. Its value is calculated during rule evaluation. +// We use the `?.` prefix to distinguish computed fields from regular fields. +type ComputedField string + +const ( + // ComputedFieldPrefix is the prefix for computed fields. + ComputedFieldPrefix = "?." + NUM_DESCENDANTS ComputedField = ComputedFieldPrefix + "NUM_DESCENDANTS" +) + // The json tags in this file are used for conversion from the old format (see tools/convert for details). // They are deliberately all lowercase. // The yaml tags are used for the new format and are PascalCase. @@ -206,6 +218,7 @@ type RulesBasedDownstreamSampler struct { EMAThroughputSampler *EMAThroughputSamplerConfig `json:"emathroughputsampler" yaml:"EMAThroughputSampler,omitempty"` WindowedThroughputSampler *WindowedThroughputSamplerConfig `json:"windowedthroughputsampler" yaml:"WindowedThroughputSampler,omitempty"` TotalThroughputSampler *TotalThroughputSamplerConfig `json:"totalthroughputsampler" yaml:"TotalThroughputSampler,omitempty"` + DeterministicSampler *DeterministicSamplerConfig `json:"deterministicsampler" yaml:"DeterministicSampler,omitempty"` } type RulesBasedSamplerRule struct { @@ -223,7 +236,8 @@ func (r *RulesBasedSamplerRule) String() string { } type RulesBasedSamplerCondition struct { - Field string `json:"field" yaml:"Field" validate:"required"` + Field string `json:"field" yaml:"Field"` + Fields []string `json:"fields" yaml:"Fields,omitempty"` Operator string `json:"operator" yaml:"Operator" validate:"required"` Value any `json:"value" yaml:"Value" ` Datatype string `json:"datatype" yaml:"Datatype,omitempty"` @@ -231,6 +245,16 @@ type RulesBasedSamplerCondition struct { } func (r *RulesBasedSamplerCondition) Init() error { + // if Field is specified, we move it into Fields so that we don't have to deal with checking both.
+ if r.Field != "" { + // we're going to check that both aren't defined -- this should have been caught by validation + // but we'll also check here just in case. + if len(r.Fields) > 0 { + return fmt.Errorf("both Field and Fields are defined in a single condition") + } + // now we know it's safe to move Field into Fields + r.Fields = []string{r.Field} + } return r.setMatchesFunction() } @@ -238,6 +262,14 @@ func (r *RulesBasedSamplerCondition) String() string { return fmt.Sprintf("%+v", *r) } +func (r *RulesBasedSamplerCondition) GetComputedField() (ComputedField, bool) { + if strings.HasPrefix(r.Field, ComputedFieldPrefix) { + return ComputedField(r.Field), true + } + return "", false + +} + func (r *RulesBasedSamplerCondition) setMatchesFunction() error { switch r.Operator { case Exists: @@ -257,6 +289,11 @@ func (r *RulesBasedSamplerCondition) setMatchesFunction() error { if err != nil { return err } + case In, NotIn: + err := setInBasedOperators(r, r.Operator) + if err != nil { + return err + } case MatchesRegexp: err := setRegexStringMatchOperator(r) if err != nil { @@ -314,16 +351,13 @@ func tryConvertToFloat(v any) (float64, bool) { // "standard" format, which we are defining as whatever Go does with the %v // operator to sprintf. This will make sure that no matter how people encode // their values, they compare on an equal footing. -func tryConvertToString(v any) (string, bool) { - return fmt.Sprintf("%v", v), true +// This function can never fail, so it's not named "tryConvert" like the others. +func convertToString(v any) string { + return fmt.Sprintf("%v", v) } func TryConvertToBool(v any) bool { - value, ok := tryConvertToString(v) - if !ok { - return false - } - str, err := strconv.ParseBool(value) + str, err := strconv.ParseBool(convertToString(v)) if err != nil { return false } @@ -337,58 +371,37 @@ func TryConvertToBool(v any) bool { func setCompareOperators(r *RulesBasedSamplerCondition, condition string) error { switch r.Datatype { case "string": - conditionValue, ok := tryConvertToString(r.Value) - if !ok { - return fmt.Errorf("could not convert %v to string", r.Value) - } + conditionValue := convertToString(r.Value) switch condition { case NEQ: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n != conditionValue - } - return false + return convertToString(spanValue) != conditionValue } return nil case EQ: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n == conditionValue - } - return false + return convertToString(spanValue) == conditionValue } return nil case GT: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n > conditionValue - } - return false + return convertToString(spanValue) > conditionValue } return nil case GTE: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n >= conditionValue - } - return false + return convertToString(spanValue) >= conditionValue } return nil case LT: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n < conditionValue - } - return false + return convertToString(spanValue) < conditionValue } return nil case LTE: r.Matches = func(spanValue any, exists bool) bool { - if n, ok := tryConvertToString(spanValue); exists && ok { - return n <= conditionValue - } - return false + return 
convertToString(spanValue) <= conditionValue } return nil } @@ -534,35 +547,92 @@ func setCompareOperators(r *RulesBasedSamplerCondition, condition string) error } func setMatchStringBasedOperators(r *RulesBasedSamplerCondition, condition string) error { - conditionValue, ok := tryConvertToString(r.Value) - if !ok { - return fmt.Errorf("%s value must be a string, but was '%s'", condition, r.Value) - } + conditionValue := convertToString(r.Value) switch condition { case StartsWith: r.Matches = func(spanValue any, exists bool) bool { - s, ok := tryConvertToString(spanValue) - if ok { - return strings.HasPrefix(s, conditionValue) - } - return false + return strings.HasPrefix(convertToString(spanValue), conditionValue) } case Contains: r.Matches = func(spanValue any, exists bool) bool { - s, ok := tryConvertToString(spanValue) - if ok { - return strings.Contains(s, conditionValue) - } - return false + return strings.Contains(convertToString(spanValue), conditionValue) } case DoesNotContain: r.Matches = func(spanValue any, exists bool) bool { - s, ok := tryConvertToString(spanValue) - if ok { - return !strings.Contains(s, conditionValue) + return !strings.Contains(convertToString(spanValue), conditionValue) + } + } + + return nil +} + +func setInBasedOperators(r *RulesBasedSamplerCondition, condition string) error { + var matches func(spanValue any, exists bool) bool + + // we'll support having r.Value be either a single scalar or a list of scalars + // so to avoid having to check the type of r.Value every time, we'll just convert + // it to a list of scalars and then check the type of each scalar as we iterate + var value []any + switch v := r.Value.(type) { + case []any: + value = v + case string, int, float64: + value = []any{v} + default: + return fmt.Errorf("value must be a list of scalars") + } + + switch r.Datatype { + // if datatype is not specified, we'll always convert the values to strings + case "string", "": + values := generics.NewSet[string]() + for _, v := range value { + value := convertToString(v) + values.Add(value) + } + matches = func(spanValue any, exists bool) bool { + s := convertToString(spanValue) + return values.Contains(s) + } + case "int": + values := generics.NewSet[int]() + for _, v := range value { + value, ok := tryConvertToInt(v) + if !ok { + // validation should have caught this, so we'll just skip it + continue } - return false + values.Add(value) + } + matches = func(spanValue any, exists bool) bool { + i, ok := tryConvertToInt(spanValue) + return ok && values.Contains(i) + } + case "float": + values := generics.NewSet[float64]() + for _, v := range value { + value, ok := tryConvertToFloat(v) + if !ok { + // validation should have caught this, so we'll just skip it + continue + } + values.Add(value) + } + matches = func(spanValue any, exists bool) bool { + f, ok := tryConvertToFloat(spanValue) + return ok && values.Contains(f) + } + case "bool": + return fmt.Errorf("cannot use %s operator with boolean datatype", condition) + } + + switch condition { + case In: + r.Matches = matches + case NotIn: + r.Matches = func(spanValue any, exists bool) bool { + return !matches(spanValue, exists) } } @@ -570,10 +640,7 @@ func setMatchStringBasedOperators(r *RulesBasedSamplerCondition, condition strin } func setRegexStringMatchOperator(r *RulesBasedSamplerCondition) error { - conditionValue, ok := tryConvertToString(r.Value) - if !ok { - return fmt.Errorf("regex value must be a string, but was '%s'", r.Value) - } + conditionValue := convertToString(r.Value) regex, err := 
regexp.Compile(conditionValue) if err != nil { @@ -581,11 +648,8 @@ func setRegexStringMatchOperator(r *RulesBasedSamplerCondition) error { } r.Matches = func(spanValue any, exists bool) bool { - s, ok := tryConvertToString(spanValue) - if ok { - return regex.MatchString(s) - } - return false + s := convertToString(spanValue) + return regex.MatchString(s) } return nil diff --git a/config/sampler_config_test.go b/config/sampler_config_test.go index 09bde697e3..19cd776b36 100644 --- a/config/sampler_config_test.go +++ b/config/sampler_config_test.go @@ -1,6 +1,8 @@ package config -import "testing" +import ( + "testing" +) func Test_setCompareOperators(t *testing.T) { tests := []struct { @@ -97,3 +99,80 @@ func Test_setCompareOperators(t *testing.T) { }) } } + +func anys(a ...any) []any { + return a +} + +func Test_setInBasedOperators(t *testing.T) { + tests := []struct { + name string + datatype string + testvalue any + value any + shouldContain bool + wantErr bool + }{ + // we want to test many different combinations of datatypes and conditions + // datatypes: string, int, float, bool, for all 3 of datatype, value, testvalue + // conditions, plus 4 different boolean states. That's a lot of cases, so + // we'll try some representative ones to limit the scope. + // In and NotIn are true opposites, so we can test both with the same test cases. + {"s1", "string", "foo", "bar", false, false}, + {"s2", "string", "bar", "foo", false, false}, + {"s3", "string", "bar", "bar", true, false}, + {"s4", "string", "bar", anys("foo", "bar"), true, false}, + {"s5", "string", "10", 10, true, false}, + {"s6", "string", "1", 10, false, false}, + {"i1", "int", "1", 1, true, false}, + {"i2", "int", "10", 1, false, false}, + {"f1", "float", "1", 1, true, false}, + {"f2", "float", "10", 1, false, false}, + {"b", "bool", "true", true, false, true}, + {"s7", "string", "a", anys("a", "b", "c", "d"), true, false}, + {"s8", "string", "d", anys("a", "b", "c", "d"), true, false}, + {"s9", "string", "h", anys("a", "b", "c", "d"), false, false}, + {"i3", "int", "1", anys(1, 2, 3, 4), true, false}, + {"i4", "int", "5", anys(1, 2, 3, 4), false, false}, + {"i5", "int", 5, anys(1, 2, 3, 4, "5"), true, false}, + {"f3", "float", "1.5", anys(1.5, 2.5, 1.6, 4), true, false}, + {"f4", "float", 5.0, anys(1, 2, 3, 4), false, false}, + {"f5", "float", 5.0, anys(1, 2, 3, 4, "5.0"), true, false}, + {"s10", "string", "1.5", anys(1.5, 2.5, 1.6, 4), true, false}, + {"s11", "string", 5.0, anys(1, 2, 3, 4), false, false}, + {"s12", "string", 5.0, anys(1, 2, 3, 4, "5"), true, false}, + {"n1", "", "1", anys(1, 2, 3, 4), true, false}, + {"n2", "", "5", anys(1, 2, 3, 4), false, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rbsc := &RulesBasedSamplerCondition{ + Datatype: tt.datatype, + Value: tt.value, + } + // test In + err := setInBasedOperators(rbsc, In) + if (err != nil) != tt.wantErr { + t.Errorf("setCompareOperators() error = %v, wantErr %v", err, tt.wantErr) + } + if err == nil { + result := rbsc.Matches(tt.testvalue, true) + if result != tt.shouldContain { + t.Errorf("setCompareOperators() result = %v, shouldContain %v", result, tt.shouldContain) + } + } + // test NotIn + err = setInBasedOperators(rbsc, NotIn) + if (err != nil) != tt.wantErr { + t.Errorf("setCompareOperators() error = %v, wantErr %v", err, tt.wantErr) + } + if err == nil { + result := rbsc.Matches(tt.testvalue, true) + // opposite result + if result != !tt.shouldContain { + t.Errorf("setCompareOperators() result = %v, should not Contain 
%v", result, !tt.shouldContain) + } + } + }) + } +} diff --git a/config/validate.go b/config/validate.go index 855b5c11f7..019068438b 100644 --- a/config/validate.go +++ b/config/validate.go @@ -99,6 +99,24 @@ func validateDatatype(k string, v any, typ string) string { default: return fmt.Sprintf("field %s must be a string, int, float, or bool", k) } + case "sliceorscalar": + switch vt := v.(type) { + case string, int, int64, float64, bool: + // we're good + case []any: + // we need to check that the slice is all the same type + // if it's empty or 1 element, it's fine + if len(v.([]any)) > 1 { + firstType := fmt.Sprintf("%T", vt[0]) + for i, a := range vt { + if fmt.Sprintf("%T", a) != firstType { + return fmt.Sprintf("field %s must be a slice of all the same type, but element %d is %T", k, i, a) + } + } + } + default: + return fmt.Sprintf("field %s must be a list of string, int, float, or bool", k) + } case "string": if !isString(v) { return fmt.Sprintf("field %s must be a string but %v is %T", k, v, v) @@ -184,6 +202,19 @@ func validateDatatype(k string, v any, typ string) string { if u.Scheme != "http" && u.Scheme != "https" { return fmt.Sprintf("field %s (%v) must use an http or https scheme", k, v) } + case "defaulttrue": + switch val := v.(type) { + case bool: + return "" + case string: + switch strings.ToLower(val) { + case "t", "true", "f", "false": + default: + return fmt.Sprintf("field %s (%v) must be 'true', 'false', 't', or 'f'", k, v) + } + default: + return fmt.Sprintf("field %s (%v) must be a bool or string with value true/false or 'true'/'false'/'t'/'f'", k, v) + } default: panic("unknown data type " + typ) } @@ -256,6 +287,7 @@ func (m *Metadata) Validate(data map[string]any) []string { } } for _, validation := range field.Validations { + nextValidation: switch validation.Type { case "choice": if !(isString(v) && slices.Contains(field.Choices, v.(string))) { @@ -266,8 +298,18 @@ func (m *Metadata) Validate(data map[string]any) []string { var format string mask := false switch validation.Arg.(string) { + case "apikeyOrBlank": + // allow an empty string as well as a valid API key + if v.(string) == "" { + break nextValidation + } + fallthrough // fallthrough to the apikey case case "apikey": - pat = regexp.MustCompile(`^[a-f0-9]{32}|[a-zA-Z0-9]{20,23}$`) + // valid API key formats are: + // 1. 32 hex characters ("classic" Honeycomb API key) + // 2. 20-23 alphanumeric characters (new-style Honeycomb API key) + // 3. 
hc<1 letter region><2 letter keytype>_<58 alphanumeric characters> (ingest key) + pat = regexp.MustCompile(`^([a-f0-9]{32}|[a-zA-Z0-9]{20,23}|hc[a-z][a-z]{2}_[a-z0-9]{58})$`) format = "field %s (%v) must be a valid Honeycomb API key" mask = true case "version": diff --git a/config/validate_test.go b/config/validate_test.go index db50b50de7..dc9ccff957 100644 --- a/config/validate_test.go +++ b/config/validate_test.go @@ -143,6 +143,14 @@ groups: validations: - type: notempty + - name: PeerManagement + fields: + - name: Peers + type: stringarray + validations: + - type: elementType + arg: url + - name: RequireTest fields: - name: FieldA @@ -268,8 +276,11 @@ func Test_validate(t *testing.T) { `field Network.APIKey (****) must be a valid Honeycomb API key`}, {"bad format apikey long", mm("Network.APIKey", "abc123abc123whee"), `field Network.APIKey (****whee) must be a valid Honeycomb API key`}, + {"bad format ingest key long", mm("Network.APIKey", "xxxxx-abcdefgh12345678abcdefgh12345678abcdefgh12345678aabbccddee"), + `field Network.APIKey (****ddee) must be a valid Honeycomb API key`}, {"good format apikey", mm("Network.APIKey", "abc123abc123abc123abc123abc123ab"), ""}, {"good format apikey", mm("Network.APIKey", "NewStyleKeyWith22chars"), ""}, + {"good format ingest key", mm("Network.APIKey", "hcaik_01hshz0tyh2fqa9wznx5a1jf4exbmsd3jj4p89k8c02eb7tx4mwgs7tf99"), ""}, {"good format version", mm("General.Version", "v2.0"), ""}, {"bad format version1", mm("General.Version", "2.0"), "field General.Version (2.0) must be a valid major.minor version number, like v2.0"}, {"bad format version2", mm("General.Version", "v2.0.0"), "field General.Version (v2.0.0) must be a valid major.minor version number, like v2.0"}, @@ -305,6 +316,8 @@ func Test_validate(t *testing.T) { {"bad slice elementType", mm("Traces.AStringArray", []any{"0.0.0.0"}), "field Traces.AStringArray[0] (0.0.0.0) must be a hostport: address 0.0.0.0: missing port in address"}, {"good map elementType", mm("Traces.AStringMap", map[string]any{"k": "v"}), ""}, {"bad map elementType", mm("Traces.AStringMap", map[string]any{"k": 1}), "field Traces.AStringMap[k] must be a string"}, + {"bad peer url", mm("PeerManagement.Peers", []any{"0.0.0.0:8082", "http://192.168.1.1:8088"}), "must be a valid UR"}, + {"good peer url", mm("PeerManagement.Peers", []any{"http://0.0.0.0:8082", "http://192.168.1.1:8088"}), ""}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/config_complete.yaml b/config_complete.yaml index 32118ccefd..69f13602ef 100644 --- a/config_complete.yaml +++ b/config_complete.yaml @@ -2,7 +2,7 @@ ## Honeycomb Refinery Configuration ## ###################################### # -# created on 2023-12-04 at 18:06:30 UTC from ../../config.yaml using a template generated on 2023-12-04 at 18:06:10 UTC +# created on 2024-09-05 at 17:40:32 UTC from ../../config.yaml using a template generated on 2024-09-05 at 17:40:29 UTC # This file contains a configuration for the Honeycomb Refinery. It is in YAML # format, organized into named groups, each of which contains a set of @@ -63,8 +63,11 @@ General: ## up to 10% to avoid all instances refreshing together. In installations ## where configuration changes are handled by restarting Refinery, which ## is often the case when using Kubernetes, disable this feature with a - ## value of `0s`. If the config file is being loaded from a URL, it may - ## be wise to increase this value to avoid overloading the file server. + ## value of `0s`.
As of Refinery v2.7, news of a configuration change is + ## immediately propagated to all peers, and they will attempt to reload + ## their configurations. Note that external factors (for example, + ## Kubernetes ConfigMaps) may cause delays in propagating configuration + ## changes. ## ## Accepts a duration string with units, like "15s". ## default: 15s @@ -152,10 +155,43 @@ AccessKeys: ## accepted. Events arriving with API keys not in the `ReceiveKeys` list ## will be rejected with an HTTP `401` error. ## If `false`, then all traffic is accepted and `ReceiveKeys` is ignored. + ## This setting is applied **before** the `SendKey` and `SendKeyMode` + ## settings. ## ## Eligible for live reload. # AcceptOnlyListedKeys: false + ## SendKey is an optional Honeycomb API key that Refinery can use to send + ## data to Honeycomb, depending on configuration. + ## + ## If `SendKey` is set to a valid Honeycomb key, then Refinery can use + ## the listed key to send data. The exact behavior depends on the value + ## of `SendKeyMode`. + ## + ## Eligible for live reload. + # SendKey: "" + + ## SendKeyMode controls how SendKey is used to replace or augment API + ## keys used in incoming telemetry. + ## + ## Controls how SendKey is used to replace or supply API keys used in + ## incoming telemetry. If `AcceptOnlyListedKeys` is `true`, then + ## `SendKey` will only be used for events with keys listed in + ## `ReceiveKeys`. + ## `none` uses the incoming key for all telemetry (default). `all` + ## overwrites all keys, even missing ones, with `SendKey`. `nonblank` + ## overwrites all supplied keys but will not inject `SendKey` if the + ## incoming key is blank. `listedonly` overwrites only the keys listed in + ## `ReceiveKeys`. `unlisted` uses the `SendKey` for all events *except* + ## those with keys listed in `ReceiveKeys`, which use their original + ## keys. `missingonly` uses the `SendKey` only to inject keys into events + ## with blank keys. All other events use their original keys. + ## + ## default: none + ## Eligible for live reload. + ## Options: none all nonblank listedonly unlisted missingonly + # SendKeyMode: none + ######################## ## Refinery Telemetry ## ######################## @@ -173,11 +209,11 @@ RefineryTelemetry: ## This setting also includes the field `meta.refinery.send_reason`, ## which contains the reason that the trace was sent. Possible values of ## this field are `trace_send_got_root`, which means that the root span - ## arrived; `trace_send_expired`, which means that TraceTimeout was + ## arrived; `trace_send_expired`, which means that `TraceTimeout` was ## reached; `trace_send_ejected_full`, which means that the trace cache - ## was full; and `trace_send_ejected_memsize`, which means that refinery + ## was full; and `trace_send_ejected_memsize`, which means that Refinery ## was out of memory. - ## These names are also the names of metrics that refinery tracks. + ## These names are also the names of metrics that Refinery tracks. ## We recommend enabling this setting whenever a rules-based sampler is ## in use, as it is useful for debugging and understanding the behavior ## of your Refinery installation.
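To make the new `SendKey`/`SendKeyMode` settings above concrete, here is a sketch of one way they might be combined in this file (the keys shown are placeholders, not real credentials):

AccessKeys:
  ReceiveKeys:
    - abc123abc123abc123abc123abc123ab
  AcceptOnlyListedKeys: true
  # rewrite accepted incoming keys to a single outbound key when forwarding
  SendKey: fedcba9876543210fedcba9876543210
  SendKeyMode: listedonly

With `listedonly`, only events that arrived with a key in `ReceiveKeys` are rewritten to use `SendKey`; all other events keep their original keys.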
@@ -206,11 +242,11 @@ RefineryTelemetry: ## ## If `true`, then Refinery will ignore the `AddSpanCountToRoot` setting ## and add the following fields to the root span based on the values at - ## the time the sampling decision was made: - `meta.span_count`: the - ## number of child spans on the trace - `meta.span_event_count`: the - ## number of span events on the trace - `meta.span_link_count`: the - ## number of span links on the trace - `meta.event_count`: the number of - ## honeycomb events on the trace + ## the time the sampling decision was made: + ## - `meta.span_count`: the number of child spans on the trace + ## - `meta.span_event_count`: the number of span events on the trace + ## - `meta.span_link_count`: the number of span links on the trace + ## - `meta.event_count`: the number of honeycomb events on the trace ## ## Eligible for live reload. # AddCountsToRoot: false @@ -231,13 +267,15 @@ RefineryTelemetry: Traces: ## Traces contains configuration for how traces are managed. #### - ## SendDelay is the duration to wait before sending a trace. + ## SendDelay is the duration to wait after the root span arrives before + ## sending a trace. ## - ## This setting is a short timer that is triggered when a trace is - ## complete. Refinery waits for this duration before sending the trace. - ## The reason for this setting is to allow for small network delays or - ## clock jitters to elapse and any final spans to arrive before sending - ## the trace. Set to "0" for immediate sending. + ## This setting is a short timer that is triggered when a trace is marked + ## complete by the arrival of the root span. Refinery waits for this + ## duration before sending the trace. This setting exists to allow for + ## asynchronous spans and small network delays to elapse before sending + ## the trace. `SendDelay` is not applied if the `TraceTimeout` expires or + ## the `SpanLimit` is reached. ## ## Accepts a duration string with units, like "2s". ## default: 2s @@ -260,8 +298,13 @@ Traces: ## before making the trace decision about an incomplete trace. ## Normally trace decisions (send or drop) are made when the root span ## arrives. Sometimes the root span never arrives (for example, due to - ## crashes) and this timer ensures sending a trace even without having - ## received the root span. + ## crashes). Once this timer fires, Refinery will make a trace decision + ## based on the spans that have arrived so far. This ensures sending a + ## trace even when the root span never arrives. + ## After the trace decision has been made, Refinery retains a record of + ## that decision for a period of time. When additional spans (including + ## the root span) arrive, they will be kept or dropped based on the + ## original decision. ## If particularly long-lived traces are present in your data, then you ## should increase this timer. Note that this increase will also increase ## the memory requirements for Refinery. @@ -271,6 +314,19 @@ Traces: ## Eligible for live reload. # TraceTimeout: 60s + ## SpanLimit is the number of spans after which a trace becomes eligible + ## for a trace decision. + ## + ## This setting helps to keep memory usage under control. If a trace has + ## more than this set number of spans, then it becomes eligible for a + ## trace decision. + ## It's most helpful in a situation where a sudden burst of many spans in + ## a large trace hits Refinery all at once, causing memory usage to spike + ## and possibly crashing Refinery. + ## + ## Eligible for live reload. 
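+ ## As an illustration only (the value here is hypothetical, not a
+ ## recommendation): a trace that accumulates more than 1000 spans would
+ ## become eligible for an early trace decision with `SpanLimit: 1000`.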
+ # SpanLimit: 0 + ## MaxBatchSize is the maximum number of events to be included in each ## batch for sending. ## @@ -316,7 +372,9 @@ Debugging: # DebugServiceAddr: "localhost:6060" ## QueryAuthToken is the token that must be specified to access the - ## `/query` endpoint. + ## `/query` endpoint. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. ## ## This token must be specified with the header ## "X-Honeycomb-Refinery-Query" in order for a `/query` request to @@ -403,6 +461,9 @@ HoneycombLogger: # APIHost: "https://api.honeycomb.io" ## APIKey is the API key used to send Refinery's logs to Honeycomb. + ## Setting this value via a command line flag may expose credentials - it + ## is recommended to use the environment variable or a configuration + ## file. ## ## It is recommended that you create a separate team and key for Refinery ## logs. @@ -457,7 +518,7 @@ StdoutLogger: # Structured: false ## SamplerEnabled controls whether logs are sampled before sending to - ## stdout. + ## `stdout`. ## ## The sample rate is controlled by the `SamplerThroughput` setting. ## @@ -469,7 +530,7 @@ StdoutLogger: ## ## The sampling algorithm attempts to make sure that the average ## throughput approximates this value, while also ensuring that all - ## unique logs arrive at stdout at least once per sampling period. + ## unique logs arrive at `stdout` at least once per sampling period. ## ## default: 10 ## Not eligible for live reload. @@ -533,7 +594,9 @@ LegacyMetrics: # APIHost: "https://api.honeycomb.io" ## APIKey is the API key used by Refinery to send its metrics to - ## Honeycomb. + ## Honeycomb. Setting this value via a command line flag may expose + ## credentials - it is recommended to use the environment variable or a + ## configuration file. ## ## It is recommended that you create a separate team and key for Refinery ## metrics. @@ -587,7 +650,9 @@ OTelMetrics: # APIHost: "https://api.honeycomb.io" ## APIKey is the API key used to send Honeycomb metrics via - ## OpenTelemetry. + ## OpenTelemetry. Setting this value via a command line flag may expose + ## credentials - it is recommended to use the environment variable or a + ## configuration file. ## ## It is recommended that you create a separate team and key for Refinery ## metrics. @@ -629,6 +694,62 @@ OTelMetrics: ## Options: none gzip # Compression: gzip +########################### +## OpenTelemetry Tracing ## +########################### +OTelTracing: + ## OTelTracing contains configuration for Refinery's own tracing. + #### + ## Enabled controls whether to send Refinery's own OpenTelemetry traces. + ## + ## The setting specifies if Refinery sends traces. + ## + ## Not eligible for live reload. + # Enabled: false + + ## APIHost is the URL of the OpenTelemetry API to which traces will be + ## sent. + ## + ## Refinery's internal traces will be sent to the `/v1/traces` endpoint + ## on this host. + ## + ## default: https://api.honeycomb.io + ## Not eligible for live reload. + # APIHost: "https://api.honeycomb.io" + + ## APIKey is the API key used to send Refinery's traces to Honeycomb. + ## Setting this value via a command line flag may expose credentials - it + ## is recommended to use the environment variable or a configuration + ## file. + ## + ## It is recommended that you create a separate team and key for Refinery + ## telemetry. 
+ ## If this value is blank, then Refinery will not set the + ## Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must + ## be set to a valid OpenTelemetry endpoint. + ## + ## Not eligible for live reload. + # APIKey: "" + + ## Dataset is the Honeycomb dataset to which Refinery sends its + ## OpenTelemetry metrics. + ## + ## Only used if `APIKey` is specified. + ## + ## default: Refinery Traces + ## Not eligible for live reload. + # Dataset: "Refinery Traces" + + ## SampleRate is the rate at which Refinery samples its own traces. + ## + ## This is the Honeycomb sample rate used to sample traces sent by + ## Refinery. Since each incoming span generates multiple outgoing spans, + ## a minimum sample rate of `100` is strongly advised. + ## + ## default: 100 + ## Eligible for live reload. + # SampleRate: 100 + ##################### ## Peer Management ## ##################### @@ -640,9 +761,16 @@ PeerManagement: ## ## Peer management is the mechanism by which Refinery locates its peers. ## `file` means that Refinery gets its peer list from the Peers list in - ## this config file. - ## `redis` means that Refinery self-registers with a Redis instance and - ## gets its peer list from there. + ## this config file. It also prevents Refinery from using a + ## publish/subscribe mechanism to propagate peer lists, stress levels, + ## and configuration changes. + ## `redis` means that Refinery uses a Publish/Subscribe mechanism, + ## implemented on Redis, to propagate peer lists, stress levels, and + ## notification of configuration changes much more quickly than the + ## legacy mechanism. + ## The recommended setting is `redis`, especially for new installations. + ## If `redis` is specified, fields in `RedisPeerManagement` must also be + ## set. ## ## default: file ## Not eligible for live reload. @@ -690,20 +818,20 @@ PeerManagement: ## Peers is the list of peers to use when Type is "file", excluding self. ## ## This list is ignored when Type is "redis". The format is a list of - ## strings of the form "host:port". + ## strings of the form "scheme://host:port". ## ## Not eligible for live reload. # Peers: - # - 192.168.1.11:8081 - # - 192.168.1.12:8081 + # - http://192.168.1.11:8081 + # - http://192.168.1.12:8081 ########################### ## Redis Peer Management ## ########################### RedisPeerManagement: ## RedisPeerManagement controls how the Refinery cluster communicates - ## between peers when using Redis. Only applies when - ## `PeerManagement.Type` is "redis". + ## between peers when using Redis. Does not apply when + ## `PeerManagement.Type` is "file". ## #### ## Host is the host and port of the Redis instance to use for peer @@ -715,8 +843,22 @@ RedisPeerManagement: ## Not eligible for live reload. # Host: "localhost:6379" + ## ClusterHosts is a list of host and port pairs for the instances in a + ## Redis Cluster, and used for managing peer cluster membership. + ## + ## This configuration enables Refinery to connect to a Redis deployment + ## setup in Cluster Mode. Each entry in the list should follow the format + ## `host:port`. If `ClusterHosts` is specified, the `Host` setting will + ## be ignored. + ## + ## Not eligible for live reload. + # ClusterHosts: + # - - localhost:6379 + ## Username is the username used to connect to Redis for peer cluster - ## membership management. + ## membership management. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. 
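+ ## For example, rather than passing the credential on the command line,
+ ## it could be supplied via the environment before starting Refinery
+ ## (the variable name below follows the REFINERY_ prefix convention and
+ ## is an assumption here):
+ ##
+ ##   export REFINERY_REDIS_USERNAME=refinery-user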
## ## Many Redis installations do not use this field. ## @@ -724,7 +866,9 @@ RedisPeerManagement: # Username: "" ## Password is the password used to connect to Redis for peer cluster - ## membership management. + ## membership management. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. ## ## Many Redis installations do not use this field. ## @@ -732,35 +876,15 @@ RedisPeerManagement: # Password: "" ## AuthCode is the string used to connect to Redis for peer cluster - ## membership management using an explicit AUTH command. + ## membership management using an explicit AUTH command. Setting this + ## value via a command line flag may expose credentials - it is + ## recommended to use the environment variable or a configuration file. ## ## Many Redis installations do not use this field. ## ## Not eligible for live reload. # AuthCode: "" - ## Prefix is a string used as a prefix for the keys in Redis while - ## storing the peer membership. - ## - ## It might be useful to override this in any situation where multiple - ## Refinery clusters or multiple applications want to share a single - ## Redis instance. It may not be blank. - ## - ## default: refinery - ## Not eligible for live reload. - # Prefix: refinery - - ## Database is the database number to use for the Redis instance storing - ## the peer membership. - ## - ## An integer from 0-15 indicating the database number to use for the - ## Redis instance storing the peer membership. It might be useful to set - ## this in any situation where multiple Refinery clusters or multiple - ## applications want to share a single Redis instance. - ## - ## Not eligible for live reload. - # Database: 0 - ## UseTLS enables TLS when connecting to Redis for peer cluster ## membership management. ## @@ -802,12 +926,13 @@ Collection: ## CacheCapacity is the number of traces to keep in the cache's circular ## buffer. ## - ## The collection cache is used to collect all spans into a trace as well - ## as remember the sampling decision for any spans that might come in - ## after the trace has been marked "complete" (either by timing out or - ## seeing the root span). The number of traces in the cache should be - ## many multiples (100x to 1000x) of the total number of concurrently - ## active traces (trace throughput * trace duration). + ## The collection cache is used to collect all active spans into traces. + ## It is organized as a circular buffer. When the buffer wraps around, + ## Refinery will try a few times to find an empty slot; if it fails, it + ## starts ejecting traces from the cache earlier than would otherwise be + ## necessary. Ideally, the size of the cache should be many multiples + ## (100x to 1000x) of the total number of concurrently active traces + ## (average trace throughput * average trace duration). ## ## default: 10000 ## Eligible for live reload. @@ -822,7 +947,7 @@ Collection: ## queue is contingent upon the number of peers within the cluster. ## Specifically, with N peers, the queue's span capacity is determined by ## (N-1)/N of the total number of spans. Its minimum value should be at - ## least three times the CacheCapacity. + ## least three times the `CacheCapacity`. ## ## default: 30000 ## Not eligible for live reload. @@ -834,7 +959,7 @@ Collection: ## The incoming span queue is used to buffer spans before they are ## processed. If this queue fills up, then subsequent spans will be ## dropped. 
Its minimum value should be at least three times the - ## CacheCapacity. + ## `CacheCapacity`. ## ## default: 30000 ## Not eligible for live reload. @@ -847,10 +972,10 @@ Collection: ## controlled by the container or deploy script. If this value is zero or ## not set, then `MaxMemoryPercentage` cannot be used to calculate the ## maximum allocation and `MaxAlloc` will be used instead. If set, then - ## this must be a memory size. Sizes with standard unit suffixes (`MB`, - ## `GiB`, etc.) and Kubernetes units (`M`, `Gi`, etc.) are supported. - ## Fractional values with a suffix are supported. If `AvailableMemory` is - ## set, `Collections.MaxAlloc` must not be defined. + ## this must be a memory size. Sizes with standard unit suffixes (such as + ## `MB` and `GiB`) and Kubernetes units (such as `M` and `Gi`) are + ## supported. Fractional values with a suffix are supported. If + ## `AvailableMemory` is set, `Collections.MaxAlloc` must not be defined. ## ## Eligible for live reload. # AvailableMemory: "4.5Gb" @@ -874,14 +999,41 @@ Collection: ## the Collector. ## ## If set, then this must be a memory size. Sizes with standard unit - ## suffixes (`MB`, `GiB`, etc.) and Kubernetes units (`M`, `Gi`, etc.) - ## are supported. Fractional values with a suffix are supported. See - ## `MaxMemoryPercentage` for more details. If set, + ## suffixes (such as `MB` and `GiB`) and Kubernetes units (such as `M` + ## and `Gi`) are supported. Fractional values with a suffix are + ## supported. See `MaxMemoryPercentage` for more details. If set, ## `Collections.AvailableMemory` must not be defined. ## ## Eligible for live reload. # MaxAlloc: "" + ## DisableRedistribution controls whether to transmit traces in cache to + ## remaining peers during cluster scaling event. + ## + ## If `true`, Refinery will NOT forward live traces in its cache to the + ## rest of the peers when peers join or leave the cluster. By disabling + ## this behavior, it can help to prevent disruptive bursts of network + ## traffic when large traces with long `TraceTimeout` are redistributed. + ## + ## Eligible for live reload. + # DisableRedistribution: false + + ## ShutdownDelay controls the maximum time Refinery can use while + ## draining traces at shutdown. + ## + ## This setting controls the duration that Refinery expects to have to + ## drain in-process traces before shutting down an instance. When asked + ## to shut down gracefully, Refinery stops accepting new spans + ## immediately and drains the remaining traces by sending them to + ## remaining peers. This value should be set to a bit less than the + ## normal timeout period for shutting down without forcibly terminating + ## the process. + ## + ## Accepts a duration string with units, like "15s". + ## default: 15s + ## Eligible for live reload. + # ShutdownDelay: 15s + ################## ## Buffer Sizes ## ################## @@ -1004,6 +1156,7 @@ GRPCServerParameters: ## If `false`, then the gRPC server is not started and no gRPC traffic is ## accepted. ## + ## default: true ## Not eligible for live reload. # Enabled: false @@ -1084,7 +1237,7 @@ GRPCServerParameters: ## memory available to the process by a single request. The size is ## expressed in bytes. ## - ## default: 5MB + ## default: 15MB ## Not eligible for live reload. # MaxSendMsgSize: "" @@ -1094,7 +1247,7 @@ GRPCServerParameters: ## memory available to the process by a single request. The size is ## expressed in bytes. ## - ## default: 5MB + ## default: 15MB ## Not eligible for live reload. 
# MaxRecvMsgSize: "" @@ -1235,31 +1388,16 @@ StressRelief: ## Eligible for live reload. # MinimumActivationDuration: 10s - ## MinimumStartupDuration is the minimum time that Stress Relief will - ## stay enabled. - ## - ## This setting is used when switching into Monitor mode. - ## When Stress Relief is enabled, it will start up in stressed mode for - ## at least this set duration of time to try to make sure that Refinery - ## can handle the load before it begins processing it in earnest. This is - ## to help address the problem of trying to bring a new node into an - ## already-overloaded cluster. - ## If this duration is `0`, then Refinery will not start in stressed - ## mode, which will provide faster startup at the possible cost of - ## startup instability. - ## - ## Accepts a duration string with units, like "3s". - ## default: 3s - ## Eligible for live reload. - # MinimumStartupDuration: 3s - ################################################### ## Config values removed by the config converter ## ################################################### ## The following configuration options are obsolete and are not included ## in the new configuration: ## + ## - PeerManagement.Prefix + ## - PeerManagement.Database ## - PeerManagement.Strategy ## - Collector ## - InMemCollector.CacheOverrunStrategy ## - SampleCacheConfig/SampleCache.Type + ## - StressRelief.MinimumStartupDuration diff --git a/generics/fanout.go b/generics/fanout.go new file mode 100644 index 0000000000..d0c552e8fc --- /dev/null +++ b/generics/fanout.go @@ -0,0 +1,237 @@ +package generics + +import "sync" + +// Fanout takes a slice of input, a parallelism factor, and a worker factory. It +// calls the generated worker on every element of the input, and returns a +// (possibly filtered) slice of the outputs in no particular order. Only the +// outputs that pass the predicate (if it is not nil) will be added to the +// slice. +// +// The factory takes an integer (the worker number) and constructs a function of +// type func(T) U that processes a single input and produces a single output. It +// also constructs a cleanup function, which may be nil. The cleanup function is +// called once for each worker, after the worker has completed processing all of +// its inputs. It is given the same index as the corresponding worker factory. +// +// If predicate is not nil, it will only add the output to the result slice if +// the predicate returns true. It will fan out the input to the worker function +// in parallel, and fan in the results to the output slice. 
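+// A minimal usage sketch (hypothetical values; the workers double their
+// input, and no cleanup function or predicate is needed):
+//
+//	factory := func(workerNum int) (func(int) int, func(int)) {
+//		return func(n int) int { return n * 2 }, nil
+//	}
+//	doubled := Fanout([]int{1, 2, 3, 4}, 2, factory, nil)
+//	// doubled holds 2, 4, 6, 8 in no particular order.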
+func Fanout[T, U any](input []T, parallelism int, workerFactory func(int) (worker func(T) U, cleanup func(int)), predicate func(U) bool) []U { + result := make([]U, 0) + + fanoutChan := make(chan T, parallelism) + faninChan := make(chan U, parallelism) + + // send all the trace IDs to the fanout channel + wgFans := sync.WaitGroup{} + wgFans.Add(1) + go func() { + defer wgFans.Done() + defer close(fanoutChan) + for i := range input { + fanoutChan <- input[i] + } + }() + + wgFans.Add(1) + go func() { + defer wgFans.Done() + for r := range faninChan { + result = append(result, r) + } + }() + + wgWorkers := sync.WaitGroup{} + for i := 0; i < parallelism; i++ { + wgWorkers.Add(1) + worker, cleanup := workerFactory(i) + go func(i int) { + defer wgWorkers.Done() + if cleanup != nil { + defer cleanup(i) + } + for u := range fanoutChan { + product := worker(u) + if predicate == nil || predicate(product) { + faninChan <- product + } + } + }(i) + } + + // wait for the workers to finish + wgWorkers.Wait() + // now we can close the fanin channel and wait for the fanin goroutine to finish + // fanout should already be done but this makes sure we don't lose track of it + close(faninChan) + wgFans.Wait() + + return result +} + +// EasyFanout is a convenience function for when you don't need all the +// features. It takes a slice of input, a parallelism factor, and a worker +// function. It calls the worker on every element of the input with the +// specified parallelism, and returns a slice of the outputs in no particular +// order. +func EasyFanout[T, U any](input []T, parallelism int, worker func(T) U) []U { + return Fanout(input, parallelism, func(int) (func(T) U, func(int)) { + return worker, nil + }, nil) +} + +// FanoutToMap takes a slice of input, a parallelism factor, and a worker +// factory. It calls the generated worker on every element of the input, and +// returns a (possibly filtered) map of the inputs to the outputs. Only the +// outputs that pass the predicate (if it is not nil) will be added to the map. +// +// The factory takes an integer (the worker number) and constructs a function of +// type func(T) U that processes a single input and produces a single output. It +// also constructs a cleanup function, which may be nil. The cleanup function is +// called once for each worker, after the worker has completed processing all of +// its inputs. It is given the same index as the corresponding worker factory. +// +// If predicate is not nil, it will only add the output to the result slice if +// the predicate returns true. It will fan out the input to the worker function +// in parallel, and fan in the results to the output slice. 
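+// A brief sketch (hypothetical values; reusing the doubling factory shown
+// in the Fanout example above):
+//
+//	m := FanoutToMap([]int{1, 2, 3}, 2, factory, nil)
+//	// m is map[int]int{1: 2, 2: 4, 3: 6}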
+func FanoutToMap[T comparable, U any](input []T, parallelism int, workerFactory func(int) (worker func(T) U, cleanup func(int)), predicate func(U) bool) map[T]U { + result := make(map[T]U) + type resultPair struct { + key T + val U + } + + fanoutChan := make(chan T, parallelism) + faninChan := make(chan resultPair, parallelism) + + // send all the trace IDs to the fanout channel + wgFans := sync.WaitGroup{} + wgFans.Add(1) + go func() { + defer wgFans.Done() + defer close(fanoutChan) + for i := range input { + fanoutChan <- input[i] + } + }() + + wgFans.Add(1) + go func() { + defer wgFans.Done() + for r := range faninChan { + result[r.key] = r.val + } + }() + + wgWorkers := sync.WaitGroup{} + for i := 0; i < parallelism; i++ { + wgWorkers.Add(1) + worker, cleanup := workerFactory(i) + go func(i int) { + defer wgWorkers.Done() + if cleanup != nil { + defer cleanup(i) + } + for t := range fanoutChan { + product := worker(t) + if predicate == nil || predicate(product) { + faninChan <- resultPair{t, product} + } + } + }(i) + } + + // wait for the workers to finish + wgWorkers.Wait() + // now we can close the fanin channel and wait for the fanin goroutine to finish + // fanout should already be done but this makes sure we don't lose track of it + close(faninChan) + wgFans.Wait() + + return result +} + +// EasyFanoutToMap is a convenience function for when you don't need all the +// features. It takes a slice of input, a parallelism factor, and a worker +// function. It calls the worker on every element of the input with the +// specified parallelism, and returns a map of the inputs to the outputs. +func EasyFanoutToMap[T comparable, U any](input []T, parallelism int, worker func(T) U) map[T]U { + return FanoutToMap(input, parallelism, func(int) (func(T) U, func(int)) { + return worker, nil + }, nil) +} + +// FanoutChunksToMap takes a slice of input, a chunk size, a maximum parallelism +// factor, and a worker factory. It calls the generated worker on every chunk of +// the input, and returns a (possibly filtered) map of the inputs to the +// outputs. Only the outputs that pass the predicate (if it is not nil) will be +// added to the map. +// +// The maximum parallelism factor is the maximum number of workers that will be +// run in parallel. The actual number of workers will be the minimum of the +// maximum parallelism factor and the number of chunks in the input. 
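+// A usage sketch (hypothetical values; each worker handles a whole chunk
+// at a time and returns a map of input to output, here squaring the inputs):
+//
+//	factory := func(workerNum int) (func([]int) map[int]int, func(int)) {
+//		return func(chunk []int) map[int]int {
+//			out := make(map[int]int, len(chunk))
+//			for _, n := range chunk {
+//				out[n] = n * n
+//			}
+//			return out
+//		}, nil
+//	}
+//	squares := FanoutChunksToMap([]int{1, 2, 3, 4, 5}, 2, 3, factory, nil)
+//	// squares is map[int]int{1: 1, 2: 4, 3: 9, 4: 16, 5: 25}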
+func FanoutChunksToMap[T comparable, U any](input []T, chunkSize int, maxParallelism int, workerFactory func(int) (worker func([]T) map[T]U, cleanup func(int)), predicate func(U) bool) map[T]U { + result := make(map[T]U, 0) + + if chunkSize <= 0 { + chunkSize = 1 + } + + type resultPair struct { + key T + val U + } + parallelism := min(maxParallelism, max(len(input)/chunkSize, 1)) + fanoutChan := make(chan []T, parallelism) + faninChan := make(chan resultPair, parallelism) + + // send all the trace IDs to the fanout channel + wgFans := sync.WaitGroup{} + wgFans.Add(1) + go func() { + defer wgFans.Done() + defer close(fanoutChan) + for i := 0; i < len(input); i += chunkSize { + end := min(i+chunkSize, len(input)) + fanoutChan <- input[i:end] + } + }() + + wgFans.Add(1) + go func() { + defer wgFans.Done() + for r := range faninChan { + result[r.key] = r.val + } + }() + + wgWorkers := sync.WaitGroup{} + for i := 0; i < parallelism; i++ { + wgWorkers.Add(1) + worker, cleanup := workerFactory(i) + go func(i int) { + defer wgWorkers.Done() + if cleanup != nil { + defer cleanup(i) + } + for u := range fanoutChan { + products := worker(u) + for key, product := range products { + if predicate == nil || predicate(product) { + faninChan <- resultPair{key: key, val: product} + } + } + } + }(i) + } + + // wait for the workers to finish + wgWorkers.Wait() + // now we can close the fanin channel and wait for the fanin goroutine to finish + // fanout should already be done but this makes sure we don't lose track of it + close(faninChan) + wgFans.Wait() + + return result +} diff --git a/generics/fanout_test.go b/generics/fanout_test.go new file mode 100644 index 0000000000..988acba831 --- /dev/null +++ b/generics/fanout_test.go @@ -0,0 +1,196 @@ +package generics + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestFanout(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 3 + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + return worker, nil + } + + result := Fanout(input, parallelism, workerFactory, nil) + assert.ElementsMatch(t, []int{2, 4, 6, 8, 10}, result) +} + +func TestFanoutWithPredicate(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 3 + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + return worker, nil + } + predicate := func(i int) bool { + return i%4 == 0 + } + + result := Fanout(input, parallelism, workerFactory, predicate) + assert.ElementsMatch(t, []int{4, 8}, result) +} + +func TestFanoutWithCleanup(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 4 + cleanups := []int{} + mut := sync.Mutex{} + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + cleanup := func(i int) { + mut.Lock() + cleanups = append(cleanups, i) + mut.Unlock() + } + return worker, cleanup + } + + result := Fanout(input, parallelism, workerFactory, nil) + assert.ElementsMatch(t, []int{2, 4, 6, 8, 10}, result) + assert.ElementsMatch(t, []int{0, 1, 2, 3}, cleanups) +} + +var expected = map[int]int{ + 1: 2, + 2: 4, + 3: 6, + 4: 8, + 5: 10, +} + +func TestFanoutMap(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 3 + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + return worker, nil + } + + result := FanoutToMap(input, parallelism, 
workerFactory, nil) + assert.EqualValues(t, expected, result) +} + +func TestFanoutMapWithPredicate(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 3 + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + return worker, nil + } + predicate := func(i int) bool { + return i%4 == 0 + } + + result := FanoutToMap(input, parallelism, workerFactory, predicate) + assert.EqualValues(t, map[int]int{2: 4, 4: 8}, result) +} + +func TestFanoutMapWithCleanup(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + parallelism := 4 + cleanups := []int{} + mut := sync.Mutex{} + workerFactory := func(i int) (func(int) int, func(int)) { + worker := func(i int) int { + return i * 2 + } + cleanup := func(i int) { + mut.Lock() + cleanups = append(cleanups, i) + mut.Unlock() + } + return worker, cleanup + } + + result := FanoutToMap(input, parallelism, workerFactory, nil) + assert.EqualValues(t, expected, result) + assert.ElementsMatch(t, []int{0, 1, 2, 3}, cleanups) +} + +func TestEasyFanout(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + worker := func(i int) int { + return i * 2 + } + + result := EasyFanout(input, 3, worker) + assert.ElementsMatch(t, []int{2, 4, 6, 8, 10}, result) +} + +func TestEasyFanoutToMap(t *testing.T) { + input := []int{1, 2, 3, 4, 5} + worker := func(i int) int { + return i * 2 + } + + result := EasyFanoutToMap(input, 3, worker) + assert.EqualValues(t, expected, result) +} + +func BenchmarkFanoutParallelism(b *testing.B) { + parallelisms := []int{1, 3, 6, 10, 25, 100} + for _, parallelism := range parallelisms { + b.Run(fmt.Sprintf("parallelism%02d", parallelism), func(b *testing.B) { + + input := make([]int, b.N) + for i := range input { + input[i] = i + } + + workerFactory := func(i int) (func(int) string, func(int)) { + worker := func(i int) string { + h := sha256.Sum256(([]byte(fmt.Sprintf("%d", i)))) + time.Sleep(1 * time.Millisecond) + return hex.EncodeToString(h[:]) + } + cleanup := func(i int) {} + return worker, cleanup + } + b.ResetTimer() + _ = Fanout(input, parallelism, workerFactory, nil) + }) + } +} + +func BenchmarkFanoutMapParallelism(b *testing.B) { + parallelisms := []int{1, 3, 6, 10, 25, 100} + for _, parallelism := range parallelisms { + b.Run(fmt.Sprintf("parallelism%02d", parallelism), func(b *testing.B) { + + input := make([]int, b.N) + for i := range input { + input[i] = i + } + + workerFactory := func(i int) (func(int) string, func(int)) { + worker := func(i int) string { + h := sha256.Sum256(([]byte(fmt.Sprintf("%d", i)))) + time.Sleep(1 * time.Millisecond) + return hex.EncodeToString(h[:]) + } + cleanup := func(i int) {} + return worker, cleanup + } + b.ResetTimer() + _ = FanoutToMap(input, parallelism, workerFactory, nil) + }) + } +} diff --git a/generics/set.go b/generics/set.go new file mode 100644 index 0000000000..92ae99eee3 --- /dev/null +++ b/generics/set.go @@ -0,0 +1,79 @@ +package generics + +import "golang.org/x/exp/maps" + +// Set is a map[T]struct{}-backed unique set of items. +type Set[T comparable] map[T]struct{} + +// NewSet returns a new Set with elements `es`. +func NewSet[T comparable](es ...T) Set[T] { + s := make(Set[T], len(es)) + s.Add(es...) + return s +} + +func NewSetWithCapacity[T comparable](c int) Set[T] { + return make(Set[T], c) +} + +// Add adds elements `es` to the Set. 
+func (s Set[T]) Add(es ...T) { + for _, e := range es { + s[e] = struct{}{} + } + +} +func (s Set[T]) Remove(es ...T) { + for _, e := range es { + delete(s, e) + } +} + +// Add adds members of `b` to the Set. +func (s Set[T]) AddMembers(b Set[T]) { + for v := range b { + s[v] = struct{}{} + } +} + +// Contains returns true if the Set contains `e`. +func (s Set[T]) Contains(e T) bool { + _, ok := s[e] + return ok +} + +// Members returns the unique elements of the Set in indeterminate order. +func (s Set[T]) Members() []T { + return maps.Keys(s) +} + +// Intersect returns the common elements of the Set and `b`. +func (s Set[T]) Intersect(b Set[T]) Set[T] { + c := NewSet[T]() + for v := range s { + if b.Contains(v) { + c.Add(v) + } + } + return c +} + +// Difference returns the elements from the Set that do not exist in `b`. +func (s Set[T]) Difference(b Set[T]) Set[T] { + c := NewSet[T]() + for v := range s { + if !b.Contains(v) { + c.Add(v) + } + } + return c +} + +// Union returns the a new Set containing the combination of all unique elements +// from the Set and `b`. +func (s Set[T]) Union(b Set[T]) Set[T] { + c := make(Set[T], len(s)+len(b)) + c.AddMembers(s) + c.AddMembers(b) + return c +} diff --git a/generics/set_test.go b/generics/set_test.go new file mode 100644 index 0000000000..7dee5c8970 --- /dev/null +++ b/generics/set_test.go @@ -0,0 +1,68 @@ +package generics + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSet(t *testing.T) { + s := NewSet(1, 2, 2) + s.Add(3, 3) + + assert.True(t, s.Contains(1)) + assert.True(t, s.Contains(2)) + assert.True(t, s.Contains(3)) + assert.ElementsMatch(t, []int{1, 2, 3}, s.Members()) + + s.AddMembers(NewSet(4)) + assert.True(t, s.Contains(4)) + assert.ElementsMatch(t, []int{1, 2, 3, 4}, s.Members()) + +} + +func TestUnion(t *testing.T) { + a := NewSet(1, 1, 2, 3, 3, 4) + b := NewSet(3, 3, 4, 5, 6, 6) + c := a.Union(b) + assert.ElementsMatch(t, []int{1, 2, 3, 4, 5, 6}, c.Members()) + + d := a.Union(NewSet[int]()) + assert.ElementsMatch(t, []int{1, 2, 3, 4}, d.Members()) +} + +func TestIntersect(t *testing.T) { + a := NewSet(1, 1, 2, 3, 3, 4) + b := NewSet(3, 3, 4, 5, 6, 6) + c := a.Intersect(b) + assert.ElementsMatch(t, []int{3, 4}, c.Members()) + + assert.Empty(t, a.Intersect(NewSet(9)).Members()) +} + +func TestDifference(t *testing.T) { + a := NewSet(1, 1, 2, 3, 3, 4) + b := NewSet(3, 3, 4, 5, 6, 6) + c := a.Difference(b) + assert.ElementsMatch(t, []int{1, 2}, c.Members()) + + d := b.Difference(a) + assert.ElementsMatch(t, []int{5, 6}, d.Members()) + +} + +var res Set[int] + +func BenchmarkUnion(b *testing.B) { + x := NewSet(0, 1, 2, 3, 4, 5) + y := NewSet(1, 3, 5, 7, 9) + for i := 0; i < 100; i++ { + x.Add(i) + y.Add(i) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + res = x.Union(y) + } +} diff --git a/generics/setttl.go b/generics/setttl.go new file mode 100644 index 0000000000..eef285ec23 --- /dev/null +++ b/generics/setttl.go @@ -0,0 +1,89 @@ +package generics + +import ( + "cmp" + "sort" + "sync" + "time" + + "github.com/jonboulle/clockwork" + "golang.org/x/exp/maps" +) + +// SetWithTTL is a unique set of items with a TTL (time to live) for each item. +// After the TTL expires, the item is automatically removed from the set when either Members or Length is called. +// It is safe for concurrent use. 
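+// A usage sketch (hypothetical values): entries silently age out once
+// their TTL elapses, so the set can act as a self-expiring "seen" list.
+//
+//	seen := NewSetWithTTL(5*time.Minute, "trace-a", "trace-b")
+//	seen.Add("trace-c")
+//	recent := seen.Contains("trace-a") // true within the last 5 minutes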
+type SetWithTTL[T cmp.Ordered] struct { + Items map[T]time.Time + TTL time.Duration + Clock clockwork.Clock + mut sync.RWMutex +} + +// NewSetWithTTL returns a new SetWithTTL with elements `es` and a TTL of `ttl`. +func NewSetWithTTL[T cmp.Ordered](ttl time.Duration, es ...T) *SetWithTTL[T] { + s := &SetWithTTL[T]{ + Items: make(map[T]time.Time, len(es)), + TTL: ttl, + Clock: clockwork.NewRealClock(), + } + s.Add(es...) + return s +} + +// Add adds elements `es` to the SetWithTTL. +func (s *SetWithTTL[T]) Add(es ...T) { + s.mut.Lock() + defer s.mut.Unlock() + for _, e := range es { + s.Items[e] = s.Clock.Now().Add(s.TTL) + } +} + +// Remove removes elements `es` from the SetWithTTL. +func (s *SetWithTTL[T]) Remove(es ...T) { + s.mut.Lock() + defer s.mut.Unlock() + for _, e := range es { + delete(s.Items, e) + } +} + +// Contains returns true if the SetWithTTL contains `e`. +// We don't have to clean up first because the test checks the TTL. +func (s *SetWithTTL[T]) Contains(e T) bool { + s.mut.RLock() + item, ok := s.Items[e] + s.mut.RUnlock() + if !ok { + return false + } + return item.After(s.Clock.Now()) +} + +func (s *SetWithTTL[T]) cleanup() int { + s.mut.Lock() + defer s.mut.Unlock() + maps.DeleteFunc(s.Items, func(k T, exp time.Time) bool { + return exp.Before(s.Clock.Now()) + }) + return len(s.Items) +} + +// Members returns the unique elements of the SetWithTTL in sorted order. +// It also removes any items that have expired. +func (s *SetWithTTL[T]) Members() []T { + s.cleanup() + s.mut.RLock() + members := maps.Keys(s.Items) + s.mut.RUnlock() + sort.Slice(members, func(i, j int) bool { + return cmp.Less(members[i], members[j]) + }) + return members +} + +// Length returns the number of items in the SetWithTTL after removing any expired items. 
+func (s *SetWithTTL[T]) Length() int { + return s.cleanup() +} diff --git a/generics/setttl_test.go b/generics/setttl_test.go new file mode 100644 index 0000000000..d86f805b40 --- /dev/null +++ b/generics/setttl_test.go @@ -0,0 +1,90 @@ +package generics + +import ( + "testing" + "time" + + "github.com/dgryski/go-wyhash" + "github.com/jonboulle/clockwork" + "github.com/stretchr/testify/assert" +) + +var seed = 3565269841805 +var rng = wyhash.Rng(seed) + +const charset = "abcdef0123456789" + +func genID(numChars int) string { + + id := make([]byte, numChars) + for i := 0; i < numChars; i++ { + id[i] = charset[int(rng.Next()%uint64(len(charset)))] + } + return string(id) +} + +func TestSetTTLBasics(t *testing.T) { + s := NewSetWithTTL(100*time.Millisecond, "a", "b", "b") + fakeclock := clockwork.NewFakeClock() + s.Clock = fakeclock + assert.Equal(t, 2, s.Length()) + fakeclock.Advance(50 * time.Millisecond) + s.Add("c") + assert.Equal(t, 3, s.Length()) + assert.Equal(t, s.Members(), []string{"a", "b", "c"}) + fakeclock.Advance(60 * time.Millisecond) + assert.Equal(t, 1, s.Length()) + assert.Equal(t, s.Members(), []string{"c"}) + fakeclock.Advance(100 * time.Millisecond) + assert.Equal(t, 0, s.Length()) + assert.Equal(t, s.Members(), []string{}) +} + +func BenchmarkSetWithTTLContains(b *testing.B) { + s := NewSetWithTTL[string](10 * time.Second) + fc := clockwork.NewFakeClock() + s.Clock = fc + + n := 10000 + traceIDs := make([]string, n) + for i := 0; i < n; i++ { + traceIDs[i] = genID(32) + if i%2 == 0 { + s.Add(traceIDs[i]) + } + fc.Advance(1 * time.Microsecond) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + s.Contains(traceIDs[i%n]) + } +} + +func BenchmarkSetWithTTLExpire(b *testing.B) { + s := NewSetWithTTL[string](1 * time.Second) + fc := clockwork.NewFakeClock() + s.Clock = fc + + // 1K ids created at 1ms intervals + // we'll check them over the course of 1 second as well, so they should all expire by the end + n := 1000 + traceIDs := make([]string, n) + for i := 0; i < n; i++ { + traceIDs[i] = genID(32) + s.Add(traceIDs[i]) + fc.Advance(1 * time.Millisecond) + } + // make sure we have 1000 ids now + assert.Equal(b, n, s.Length()) + b.ResetTimer() + advanceTime := 100 * time.Second / time.Duration(b.N) + for i := 0; i < b.N; i++ { + s.Contains(traceIDs[i%n]) + if i%100 == 0 { + fc.Advance(advanceTime) + } + } + b.StopTimer() + // make sure all ids have expired by now (there might be 1 or 2 that haven't) + assert.GreaterOrEqual(b, 2, s.Length()) +} diff --git a/go.mod b/go.mod index 455fae9084..6525708883 100644 --- a/go.mod +++ b/go.mod @@ -1,81 +1,83 @@ module github.com/honeycombio/refinery -go 1.20 +go 1.22.5 require ( github.com/agnivade/levenshtein v1.1.1 - github.com/creasty/defaults v1.7.0 + github.com/creasty/defaults v1.8.0 github.com/davecgh/go-spew v1.1.1 github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 github.com/facebookgo/inject v0.0.0-20180706035515-f23751cae28b github.com/facebookgo/startstop v0.0.0-20161013234910-bc158412526d - github.com/gomodule/redigo v1.8.9 + github.com/gomodule/redigo v1.9.2 github.com/gorilla/mux v1.8.1 + github.com/grafana/pyroscope-go/godeltaprof v0.1.8 github.com/hashicorp/golang-lru/v2 v2.0.7 - github.com/honeycombio/dynsampler-go v0.5.1 - github.com/honeycombio/husky v0.22.4 - github.com/honeycombio/libhoney-go v1.20.0 - github.com/jessevdk/go-flags v1.5.0 + 
github.com/honeycombio/dynsampler-go v0.6.0 + github.com/honeycombio/husky v0.30.0 + github.com/honeycombio/libhoney-go v1.23.1 + github.com/jessevdk/go-flags v1.6.1 + github.com/jonboulle/clockwork v0.4.0 github.com/json-iterator/go v1.1.12 - github.com/klauspost/compress v1.17.8 - github.com/panmari/cuckoofilter v1.0.3 - github.com/pelletier/go-toml/v2 v2.1.0 + github.com/klauspost/compress v1.17.9 + github.com/panmari/cuckoofilter v1.0.6 + github.com/pelletier/go-toml/v2 v2.2.3 github.com/pkg/errors v0.9.1 - github.com/prometheus/client_golang v1.17.0 + github.com/prometheus/client_golang v1.20.2 github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 + github.com/redis/go-redis/v9 v9.6.1 github.com/sirupsen/logrus v1.9.3 github.com/sourcegraph/conc v0.3.0 - github.com/stretchr/testify v1.8.4 - github.com/tidwall/gjson v1.17.0 + github.com/stretchr/testify v1.9.0 + github.com/tidwall/gjson v1.17.3 github.com/vmihailenco/msgpack/v5 v5.4.1 - go.opentelemetry.io/otel v1.21.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.44.0 - go.opentelemetry.io/otel/metric v1.21.0 - go.opentelemetry.io/otel/sdk v1.21.0 - go.opentelemetry.io/otel/sdk/metric v1.21.0 - go.opentelemetry.io/proto/otlp v1.0.0 + go.opentelemetry.io/otel v1.29.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 + go.opentelemetry.io/otel/metric v1.29.0 + go.opentelemetry.io/otel/sdk v1.29.0 + go.opentelemetry.io/otel/sdk/metric v1.29.0 + go.opentelemetry.io/otel/trace v1.29.0 + go.opentelemetry.io/proto/otlp v1.3.1 go.uber.org/automaxprocs v1.5.3 golang.org/x/exp v0.0.0-20231127185646-65229373498e - google.golang.org/grpc v1.59.0 - google.golang.org/protobuf v1.31.0 + google.golang.org/grpc v1.66.0 + google.golang.org/protobuf v1.34.2 gopkg.in/alexcesaro/statsd.v2 v2.0.0 gopkg.in/yaml.v3 v3.0.1 ) require ( github.com/beorn7/perks v1.0.1 // indirect - github.com/cenkalti/backoff/v4 v4.2.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect github.com/facebookgo/limitgroup v0.0.0-20150612190941-6abd8d71ec01 // indirect github.com/facebookgo/muster v0.0.0-20150708232844-fd3d7953fd52 // indirect github.com/facebookgo/structtag v0.0.0-20150214074306-217e25fb9691 // indirect - github.com/go-logr/logr v1.3.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/golang/protobuf v1.5.3 // indirect - github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect 
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect - github.com/prometheus/common v0.44.0 // indirect - github.com/prometheus/procfs v0.11.1 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.0 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect - go.opentelemetry.io/otel/trace v1.21.0 // indirect - go.uber.org/atomic v1.7.0 // indirect - go.uber.org/multierr v1.9.0 // indirect - golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.14.0 // indirect - golang.org/x/text v0.13.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/net v0.28.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/text v0.17.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd // indirect ) -replace go.opentelemetry.io/proto/otlp => github.com/honeycombio/opentelemetry-proto-go/otlp v0.19.0-compat - -replace github.com/panmari/cuckoofilter => github.com/honeycombio/cuckoofilter v0.0.0-20230630225016-cf48793fb7c1 +replace go.opentelemetry.io/proto/otlp => github.com/honeycombio/opentelemetry-proto-go/otlp v1.3.1-compat diff --git a/go.sum b/go.sum index 36e371487c..d06c6f99cd 100644 --- a/go.sum +++ b/go.sum @@ -6,17 +6,23 @@ github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= -github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/creasty/defaults v1.7.0 h1:eNdqZvc5B509z18lD8yc212CAqJNvfT1Jq6L8WowdBA= -github.com/creasty/defaults v1.7.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= 
+github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/creasty/defaults v1.8.0 h1:z27FJxCAa0JKt3utc0sCImAEb+spPucmKoOdLHvHYKk= +github.com/creasty/defaults v1.8.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E= github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 h1:bz5ApY1kzFBvw3yckuyRBCtqGvprWrKswYK468nm+Gs= github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371/go.mod h1:/ENMIO1SQeJ5YQeUWWpbX8f+bS8INHrrhFjXgEqi4LA= github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= @@ -40,147 +46,140 @@ github.com/facebookgo/structtag v0.0.0-20150214074306-217e25fb9691/go.mod h1:sKL github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 h1:7HZCaLC5+BZpmbhCOZJ293Lz68O7PYrF2EzeiFMwCLk= github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= -github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/golang/glog v1.1.2 h1:DVjP2PbBOzHyzA+dn3WhHIq4NdVu3Q+pvivFICf/7fo= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/gomodule/redigo v1.8.9 h1:Sl3u+2BI/kk+VEatbj0scLdrFhjPmbxOc1myhDP41ws= -github.com/gomodule/redigo v1.8.9/go.mod h1:7ArFNvsTjH8GMMzB4uy1snslv2BwmginuMs06a1uzZE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/gomodule/redigo v1.9.2 h1:HrutZBLhSIU8abiSfW8pj8mPhOyMYjZT/wcA4/L9L9s= +github.com/gomodule/redigo v1.9.2/go.mod 
h1:KsU3hiK/Ay8U42qpaJk+kuNa3C+spxapWpM+ywhcgtw= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/grafana/pyroscope-go/godeltaprof v0.1.5 h1:gkFVqihFRL1Nro2FCC0u6mW47jclef96Zu8I/ykq+4E= -github.com/grafana/pyroscope-go/godeltaprof v0.1.5/go.mod h1:1HSPtjU8vLG0jE9JrTdzjgFqdJ/VgN7fvxBNq3luJko= github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg= github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/honeycombio/cuckoofilter v0.0.0-20230630225016-cf48793fb7c1 h1:MXRxSUQ+8B1wPV5eqSkLjlCOhH1hYx876Uc/DJQRxYE= -github.com/honeycombio/cuckoofilter v0.0.0-20230630225016-cf48793fb7c1/go.mod h1:VvArVnXCHeB+tpxMOLWvaoItNHWYEgOx8Lgs1JoS+cI= -github.com/honeycombio/dynsampler-go v0.5.1 h1:rwOsxLaSlE8RiriiCgBo/LoZjiLEe24CuXPABOGXV+k= -github.com/honeycombio/dynsampler-go v0.5.1/go.mod h1:pJqWFeoMN3syX74PEvlusieyGBbtIBjmTVjLc3thmK4= -github.com/honeycombio/husky v0.22.4 h1:2ZEenP3y1KD8xrP2nGvjeB4K/6F3Zfhgu6an2e7cSsI= -github.com/honeycombio/husky v0.22.4/go.mod h1:oIb9sbjnC3SdlmLd84JHF7MCaitRsvTnrNyO+yxWO0M= -github.com/honeycombio/libhoney-go v1.20.0 h1:PL54R0P9vxIyb28H3twbLb+DCqQlJdMQM55VZg1abKA= -github.com/honeycombio/libhoney-go v1.20.0/go.mod h1:RIaurCpfg5NDWSEV8t3QLcda9dUAiVNyWeHRAaSpN90= -github.com/honeycombio/opentelemetry-proto-go/otlp v0.19.0-compat h1:fMpIzVAl5C260HisnRWV//vfckZIC4qvn656M3VLLOY= -github.com/honeycombio/opentelemetry-proto-go/otlp v0.19.0-compat/go.mod h1:mC2aK20Z/exugKpqCgcpwEadiS0im8K6mZsD4Is/hCY= -github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc= -github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= +github.com/honeycombio/dynsampler-go v0.6.0 h1:fs4mrfeFGU5V+ClwpblFzbWqn4Apb+lKlE7Ja5zL22I= +github.com/honeycombio/dynsampler-go v0.6.0/go.mod h1:pJqWFeoMN3syX74PEvlusieyGBbtIBjmTVjLc3thmK4= +github.com/honeycombio/husky v0.30.0 h1:eCISdKgFq2zwmB0d5miJnBgUV6As5QCKNtEXW94MP2E= +github.com/honeycombio/husky v0.30.0/go.mod 
h1:amJNyAKYHWGWrgz+hrLl2OCodbOD2bjR5arceKyh3qw= +github.com/honeycombio/libhoney-go v1.23.1 h1:dsZrY7wfnKyBnpQJeW9B+eawDYCZBGtmP06QEcE+YDM= +github.com/honeycombio/libhoney-go v1.23.1/go.mod h1:mbaBmUkuGwrVa9NdsskMaOzvkYMRbknsfIvavWq+5kA= +github.com/honeycombio/opentelemetry-proto-go/otlp v1.3.1-compat h1:i9CAIguM5tMQC9xSRihqdFBoh40OBOhuhfR8OrXsZ9o= +github.com/honeycombio/opentelemetry-proto-go/otlp v1.3.1-compat/go.mod h1:ZyEcAltAA7tCBVo5o+5klmG2l+43E1fjpxGxvOIskic= +github.com/jessevdk/go-flags v1.6.1 h1:Cvu5U8UGrLay1rZfv/zP7iLpSHGUZ/Ou68T0iX1bBK4= +github.com/jessevdk/go-flags v1.6.1/go.mod h1:Mk8T1hIAWpOiJiHa9rJASDK2UGWji0EuPGBnNLMooyc= +github.com/jonboulle/clockwork v0.4.0 h1:p4Cf1aMWXnXAUh8lVfewRBx1zaTSYKrKMF2g3ST4RZ4= +github.com/jonboulle/clockwork v0.4.0/go.mod h1:xgRqUGwRcjKCO1vbZUEtSLrqKoPSsUpK7fnezOII0kc= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.16.6/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= -github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= -github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= -github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= -github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= -github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod 
h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/panmari/cuckoofilter v1.0.6 h1:WKb1aSj16h22x0CKVtTCaRkJiCnVGPLEMGbNY8xwXf8= +github.com/panmari/cuckoofilter v1.0.6/go.mod h1:bKADbQPGbN6TxUvo/IbMEIUbKuASnpsOvrLTgpSX0aU= +github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= +github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= -github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= -github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= -github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 h1:v7DLqVdK4VrYkVD5diGdl4sxJurKJEMnODWRJlxV9oM= -github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= -github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= -github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= -github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= -github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= +github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= +github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ= github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/redis/go-redis/v9 v9.6.1 h1:HHDteefn6ZkTtY5fGUE8tj8uy85AHk6zP7CpzIAM0y4= +github.com/redis/go-redis/v9 v9.6.1/go.mod h1:0C0c6ycQsdpVNQpxb1njEQIqkx5UcsM8FJCQLgE9+RA= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod 
h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/tidwall/gjson v1.17.0 h1:/Jocvlh98kcTfpN2+JzGQWQcqrPQwDrVEMApx/M5ZwM= -github.com/tidwall/gjson v1.17.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tidwall/gjson v1.17.3 h1:bwWLZU7icoKRG+C+0PNwIKC6FCJO/Q3p2pZvuP0jN94= +github.com/tidwall/gjson v1.17.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= -github.com/vmihailenco/msgpack/v5 v5.3.5/go.mod h1:7xyJ9e+0+9SaZT0Wt1RGleJXzli6Q/V5KbhBonMG9jc= github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8= github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok= github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= -go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= -go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.44.0 h1:bflGWrfYyuulcdxf14V6n9+CoQcu5SAAdHmDPAJnlps= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.44.0/go.mod h1:qcTO4xHAxZLaLxPd60TdE88rxtItPHgHWqOhOGRr0as= -go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= -go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= -go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8= -go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E= -go.opentelemetry.io/otel/sdk/metric v1.21.0 h1:smhI5oD714d6jHE6Tie36fPx4WDFIg+Y6RfAY4ICcR0= 
-go.opentelemetry.io/otel/sdk/metric v1.21.0/go.mod h1:FJ8RAsoPGv/wYMgBdUJXOm+6pzFY3YdljnXtv1SBE8Q= -go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= -go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= -go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= -go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= +go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0 h1:xvhQxJ/C9+RTnAj5DpTg7LSM1vbbMTiXt7e9hsfqHNw= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.29.0/go.mod h1:Fcvs2Bz1jkDM+Wf5/ozBGmi3tQ/c9zPKLnsipnfhGAo= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 h1:dIIDULZJpgdiHz5tXrTgKIMLkus6jEFa7x5SOKcyR7E= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0/go.mod h1:jlRVBe7+Z1wyxFSUs48L6OBQZ5JwH2Hg/Vbl+t9rAgI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0 h1:JAv0Jwtl01UFiyWZEMiJZBiTlv5A50zNs8lsthXqIio= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.29.0/go.mod h1:QNKLmUEAq2QUbPQUfvw4fmv0bgbK7UlOSFCnXyfvSNc= +go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= +go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= +go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo= +go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok= +go.opentelemetry.io/otel/sdk/metric v1.29.0 h1:K2CfmJohnRgvZ9UAj2/FhIf/okdWcNdBwe1m8xFXiSY= +go.opentelemetry.io/otel/sdk/metric v1.29.0/go.mod h1:6zZLdCl2fkauYoZIOn/soQIDSWFmNSRcICarHfuhNJQ= +go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= +go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= go.uber.org/automaxprocs v1.5.3 h1:kWazyxZUrS3Gs4qUpbwo5kEIMGe/DAvi5Z4tl2NW4j8= go.uber.org/automaxprocs v1.5.3/go.mod h1:eRbA25aqJrxAbsLO0xy5jVwPt7FQnRgjW+efnwa1WM0= -go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= -go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/exp v0.0.0-20231127185646-65229373498e h1:Gvh4YaCaXNs6dKTlfgismwWZKyjVZXwOPfIyUaqU3No= golang.org/x/exp v0.0.0-20231127185646-65229373498e/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= -golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= -golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= -golang.org/x/sys v0.14.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto v0.0.0-20230822172742-b8732ec3820d h1:VBu5YqKPv6XiJ199exd8Br+Aetz+o08F+PLMnwJQHAY= -google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d h1:DoPTO70H+bcDXcd39vOqb2viZxgqeBeSGtZ55yZU4/Q= -google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d/go.mod h1:KjSP20unUpOx5kyQUFa7k4OJg0qeJ7DEZflGDu2p6Bk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d h1:uvYuEyMHKNt+lT4K3bN6fGswmK8qSvcreM3BwjDh+y4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= -google.golang.org/grpc v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk= -google.golang.org/grpc v1.59.0/go.mod h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd h1:BBOTEWLuuEGQy9n1y9MhVJ9Qt0BDu21X8qZs71/uPZo= +google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:fO8wJzT2zbQbAjbIoos1285VfEIYKDDY+Dt+WpTkh6g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd h1:6TEm2ZxXoQmFWFlt1vNxvVOa1Q0dXFQD1m/rYjXmS0E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.66.0 h1:DibZuoBznOxbDQxRINckZcUvnCEvrW9pcWIE2yF9r1c= +google.golang.org/grpc v1.66.0/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/alexcesaro/statsd.v2 v2.0.0 h1:FXkZSCZIH17vLCO5sO2UucTHsH9pc+17F6pl3JVCwMc= gopkg.in/alexcesaro/statsd.v2 v2.0.0/go.mod h1:i0ubccKGzBVNBpdGV5MocxyA/XlLUJzA7SLonnE4drU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/configwatcher/watcher.go b/internal/configwatcher/watcher.go new file mode 100644 index 0000000000..aa6f5df156 --- /dev/null +++ 
b/internal/configwatcher/watcher.go @@ -0,0 +1,118 @@ +package configwatcher + +import ( + "context" + "math/rand" + "sync" + "time" + + "github.com/facebookgo/startstop" + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/otelutil" + "github.com/honeycombio/refinery/pubsub" + "github.com/jonboulle/clockwork" + "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" +) + +const ConfigPubsubTopic = "cfg_update" + +// This exists in internal because it depends on both config and pubsub. +// So we have to create it after creating pubsub and let dependency injection work. + +// ConfigWatcher listens for configuration changes and publishes notice of them. +// It avoids publish storms by suppressing its own publishes for a short window +// after receiving a message; the config layer only invokes its callback when +// the config hashes actually change. +type ConfigWatcher struct { + Config config.Config `inject:""` + PubSub pubsub.PubSub `inject:""` + Tracer trace.Tracer `inject:"tracer"` + Clock clockwork.Clock `inject:""` + subscr pubsub.Subscription + msgTime time.Time + done chan struct{} + mut sync.RWMutex + startstop.Starter + startstop.Stopper +} + +// ReloadCallback is used to tell others that the config has changed. +// This gets called whenever the config has actually changed, but it might have +// changed because we were told about it in pubsub, so we don't re-publish +// a notice if we received a message recently. +func (cw *ConfigWatcher) ReloadCallback(cfgHash, rulesHash string) { + ctx := context.Background() + ctx, span := otelutil.StartSpanMulti(ctx, cw.Tracer, "ConfigWatcher.ReloadCallback", map[string]any{ + "new_config_hash": cfgHash, + "new_rules_hash": rulesHash, + }) + defer span.End() + + // don't publish if we have recently received a message (this avoids storms) + now := time.Now() + cw.mut.RLock() + msgTime := cw.msgTime + cw.mut.RUnlock() + if now.Sub(msgTime) < time.Duration(cw.Config.GetGeneralConfig().ConfigReloadInterval) { + otelutil.AddSpanField(span, "sending", false) + return + } + + message := now.Format(time.RFC3339) + otelutil.AddSpanFields(span, map[string]any{"sending": true, "message": message}) + cw.PubSub.Publish(ctx, ConfigPubsubTopic, message) +} + +// SubscriptionListener listens for messages on the config pubsub topic and reloads the config +// if a new set of hashes is received. +func (cw *ConfigWatcher) SubscriptionListener(ctx context.Context, msg string) { + _, span := otelutil.StartSpanWith(ctx, cw.Tracer, "ConfigWatcher.SubscriptionListener", "message", msg) + defer span.End() + + // parse message as a time in RFC3339 format + msgTime, err := time.Parse(time.RFC3339, msg) + if err != nil { + return + } + cw.mut.Lock() + cw.msgTime = msgTime + cw.mut.Unlock() + // maybe reload the config (it will only reload if the hashes are different, + // and if they were, it will call the ReloadCallback) + cw.Config.Reload() +} + +// monitor periodically wakes up and tells the config to reload itself. +// If it changed, it will publish a message to the pubsub through the ReloadCallback.
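+// For example (a sketch of the jitter below, not new behavior): with +// ConfigReloadInterval = 1m, the ticker period is drawn uniformly from +// [54s, 66s), i.e. cfgReload * (0.9 + 0.2*rand.Float64()).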
+func (cw *ConfigWatcher) monitor() { + cfgReload := cw.Config.GetGeneralConfig().ConfigReloadInterval + // adjust the requested time by +/- 10% to avoid everyone reloading at the same time + reload := time.Duration(float64(cfgReload) * (0.9 + 0.2*rand.Float64())) + ticker := time.NewTicker(reload) + defer ticker.Stop() + for { + select { + case <-cw.done: + return + case <-ticker.C: + cw.Config.Reload() + } + } +} + +func (cw *ConfigWatcher) Start() error { + if cw.Tracer == nil { + cw.Tracer = noop.NewTracerProvider().Tracer("test") + } + // create done here rather than in monitor so that Stop can close it safely + // even when the monitor goroutine was never started + cw.done = make(chan struct{}) + if cw.Config.GetGeneralConfig().ConfigReloadInterval != 0 { + go cw.monitor() + } + cw.subscr = cw.PubSub.Subscribe(context.Background(), ConfigPubsubTopic, cw.SubscriptionListener) + cw.Config.RegisterReloadCallback(cw.ReloadCallback) + return nil +} + +func (cw *ConfigWatcher) Stop() error { + close(cw.done) + cw.subscr.Close() + return nil +} diff --git a/internal/health/health.go b/internal/health/health.go new file mode 100644 index 0000000000..96794388b5 --- /dev/null +++ b/internal/health/health.go @@ -0,0 +1,246 @@ +package health + +import ( + "sync" + "time" + + "github.com/facebookgo/startstop" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/jonboulle/clockwork" +) + +// We need a Health object that can be used by: +// - internal subsystems to tell it their readiness to receive traffic +// - the router to read back that data for reporting when it receives a health or readiness request +// either on grpc or on http +// We want that object in its own package so we don't have import cycles + +// We register a subsystem with an expected interval for reporting and if it +// doesn't report for a time exceeding the duration of that interval, we will +// mark it (and the whole application) as unhealthy (not alive). A subsystem can +// also report that it is alive but not ready; when this happens, we will mark +// it as not ready and the system as a whole as not ready but still alive. This +// is useful during shutdown. + +// Subsystems will typically Register during their startup, and then call Ready +// frequently once they are ready to receive traffic. Note that Registration +// does not start the ticker -- it only starts once Ready is called for the +// first time. + +// Recorder is the interface used by objects that want to record their own health +// status and make it available to the system. +type Recorder interface { + Register(subsystem string, timeout time.Duration) + Unregister(subsystem string) + Ready(subsystem string, ready bool) +} + +// Reporter is the interface that is used to read back the health status of the system. +type Reporter interface { + IsAlive() bool + IsReady() bool +} + +// TickerTime is the interval at which we will survey health of all of the +// subsystems. We will decrement the counters for each subsystem that has +// registered. If a counter reaches 0, we will mark the subsystem as dead. This +// value should generally be less than the duration of any reporting timeout in +// the system. +var TickerTime = 500 * time.Millisecond + +// The Health object is the main object that subsystems will interact with. When +// subsystems are registered, they will be expected to report in at least once +// every timeout interval. If they don't, they will be marked as not alive.
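+// A minimal usage sketch (hypothetical subsystem name, not part of this change): +// +// h.Register("collector", 3*time.Second) +// h.Ready("collector", true) // repeat at least every 3s to stay alive +// h.Unregister("collector") // during shutdown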
+type Health struct { + Clock clockwork.Clock `inject:""` + Metrics metrics.Metrics `inject:"genericMetrics"` + Logger logger.Logger `inject:""` + timeouts map[string]time.Duration + timeLeft map[string]time.Duration + readies map[string]bool + alives map[string]bool + mut sync.RWMutex + done chan struct{} + startstop.Starter + startstop.Stopper + Recorder + Reporter +} + +func (h *Health) Start() error { + // if we don't have a logger or metrics object, we'll use the null ones (makes testing easier) + if h.Logger == nil { + h.Logger = &logger.NullLogger{} + } + if h.Metrics == nil { + h.Metrics = &metrics.NullMetrics{} + } + h.timeouts = make(map[string]time.Duration) + h.timeLeft = make(map[string]time.Duration) + h.readies = make(map[string]bool) + h.alives = make(map[string]bool) + h.done = make(chan struct{}) + go h.ticker() + return nil +} + +func (h *Health) Stop() error { + close(h.done) + return nil +} + +func (h *Health) ticker() { + tick := h.Clock.NewTicker(TickerTime) + defer tick.Stop() + for { + select { + case <-tick.Chan(): + h.mut.Lock() + for subsystem, timeLeft := range h.timeLeft { + // only decrement positive counters since 0 means we're dead + if timeLeft > 0 { + h.timeLeft[subsystem] -= TickerTime + if h.timeLeft[subsystem] < 0 { + h.timeLeft[subsystem] = 0 + } + } + } + h.mut.Unlock() + case <-h.done: + return + } + } +} + +// Register a subsystem with the health system. The timeout is the maximum +// expected interval between subsystem reports. If Ready is not called within +// that interval (beginning from the time of calling Ready for the first time), +// it (and the entire server) will be marked as not alive. +func (h *Health) Register(subsystem string, timeout time.Duration) { + h.mut.Lock() + defer h.mut.Unlock() + h.timeouts[subsystem] = timeout + h.readies[subsystem] = false + // we use a negative value to indicate that we haven't seen a report yet so + // we don't return "dead" immediately + h.timeLeft[subsystem] = -1 + fields := map[string]any{ + "source": subsystem, + "timeout": timeout, + } + h.Logger.Debug().WithFields(fields).Logf("registered Health subsystem %s with timeout %v", subsystem, timeout) + if timeout < TickerTime { + h.Logger.Error().WithFields(fields).Logf("Registering a timeout less than the ticker time") + } +} + +// Unregister a subsystem with the health system. This marks the subsystem as not +// ready and removes it from the alive tracking. It also means that it no longer +// needs to report in. If it does report in, the report will be ignored. +func (h *Health) Unregister(subsystem string) { + h.mut.Lock() + defer h.mut.Unlock() + delete(h.timeouts, subsystem) + delete(h.timeLeft, subsystem) + delete(h.alives, subsystem) + + // we don't remove it from readies, but we mark it as not ready; + // an unregistered subsystem can never be ready. + h.readies[subsystem] = false +} + +// Ready is called by subsystems with a flag to indicate their readiness to +// receive traffic. If any subsystem is not ready, the system as a whole is not +// ready. Even unready subsystems will be marked as alive as long as they report +// in. +func (h *Health) Ready(subsystem string, ready bool) { + h.mut.Lock() + defer h.mut.Unlock() + if _, ok := h.timeouts[subsystem]; !ok { + // if a subsystem has an entry in readies but not in timeouts, it means + // it had called Unregister but is still reporting in. This is not an error.
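+ // (For example, a subsystem whose worker goroutine sends one final Ready + // report after the subsystem's Stop hook has already called Unregister.)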
+ if _, ok := h.readies[subsystem]; !ok { + // but if it was never registered, it IS an error + h.Logger.Error().WithField("subsystem", subsystem).Logf("Health.Ready called for unregistered subsystem") + } + return + } + if h.readies[subsystem] != ready { + h.Logger.Info().WithFields(map[string]any{ + "subsystem": subsystem, + "ready": ready, + }).Logf("Health.Ready reporting subsystem changing state") + } + h.readies[subsystem] = ready + h.timeLeft[subsystem] = h.timeouts[subsystem] + if !h.alives[subsystem] { + h.alives[subsystem] = true + h.Logger.Info().WithField("subsystem", subsystem).Logf("Health.Ready reporting subsystem alive") + } + h.Metrics.Gauge("is_ready", h.checkReady()) + h.Metrics.Gauge("is_alive", h.checkAlive()) +} + +// IsAlive returns true if all registered subsystems are alive +func (h *Health) IsAlive() bool { + h.mut.Lock() + defer h.mut.Unlock() + return h.checkAlive() +} + +// checkAlive returns true if all registered subsystems are alive +// only call with a write lock held +func (h *Health) checkAlive() bool { + // if any counter is 0, we're dead + for subsystem, a := range h.timeLeft { + if a == 0 { + if h.alives[subsystem] { + h.Logger.Error().WithField("subsystem", subsystem).Logf("IsAlive: subsystem dead due to timeout") + h.alives[subsystem] = false + } + return false + } + } + return true +} + +// IsReady returns true if all registered subsystems are ready +func (h *Health) IsReady() bool { + h.mut.RLock() + defer h.mut.RUnlock() + return h.checkReady() +} + +// checkReady returns true if all registered subsystems are ready +// only call with the lock held +func (h *Health) checkReady() bool { + // if no one has registered yet, we're not ready + if len(h.readies) == 0 { + h.Logger.Debug().Logf("IsReady: no one has registered yet") + return false + } + + // if any counter is not positive, we're not ready + for subsystem, counter := range h.timeLeft { + if counter <= 0 { + h.Logger.Info().WithFields(map[string]any{ + "subsystem": subsystem, + "counter": counter, + }).Logf("Health.IsReady failed due to counter <= 0") + return false + } + } + + // if any registered subsystem is not ready, we're not ready + ready := true + for subsystem, r := range h.readies { + if !r { + h.Logger.Info().WithFields(map[string]any{ + "subsystem": subsystem, + "ready": r, + }).Logf("Health.IsReady reporting subsystem not ready") + } + ready = ready && r + } + return ready +} diff --git a/internal/health/health_test.go b/internal/health/health_test.go new file mode 100644 index 0000000000..1b9f783f56 --- /dev/null +++ b/internal/health/health_test.go @@ -0,0 +1,145 @@ +package health + +import ( + "testing" + "time" + + "github.com/jonboulle/clockwork" + "github.com/stretchr/testify/assert" +) + +func TestHealthStartup(t *testing.T) { + // Create a new Health object + cl := clockwork.NewFakeClock() + h := &Health{ + Clock: cl, + } + // Start the Health object + h.Start() + + // at time 0 with no registrations, it should be alive and not ready + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + // Stop the Health object + h.Stop() +} + +func TestHealthRegistrationNotReady(t *testing.T) { + // Create a new Health object + cl := clockwork.NewFakeClock() + h := &Health{ + Clock: cl, + } + // Start the Health object + h.Start() + // at time 0 with no registrations, it should be alive and not ready + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + + // register a service that will never report in + h.Register("foo", 1500*time.Millisecond) + //
now it should also be alive and not ready + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + + // and even after the timeout, it should still be alive and not ready + for i := 0; i < 10; i++ { + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + } + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + // Stop the Health object + h.Stop() +} + +func TestHealthRegistrationAndReady(t *testing.T) { + // Create a new Health object + cl := clockwork.NewFakeClock() + h := &Health{ + Clock: cl, + } + // Start the Health object + h.Start() + // register a service + h.Register("foo", 1500*time.Millisecond) + cl.Advance(500 * time.Millisecond) + // Tell h we're ready + h.Ready("foo", true) + // now h should also be alive and ready + assert.True(t, h.IsAlive()) + assert.True(t, h.IsReady()) + + // make some periodic ready calls, it should stay alive and ready + for i := 0; i < 10; i++ { + h.Ready("foo", true) + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + assert.True(t, h.IsAlive()) + assert.True(t, h.IsReady()) + } + + // now run for a bit with no ready calls, it should be dead and not ready + for i := 0; i < 10; i++ { + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + } + assert.False(t, h.IsAlive()) + assert.False(t, h.IsReady()) + // Stop the Health object + h.Stop() +} + +func TestHealthReadyFalse(t *testing.T) { + // Create a new Health object + cl := clockwork.NewFakeClock() + h := &Health{ + Clock: cl, + } + // Start the Health object + h.Start() + // register a service + h.Register("foo", 1500*time.Millisecond) + h.Ready("foo", true) + + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + assert.True(t, h.IsAlive()) + assert.True(t, h.IsReady()) + + // tell it we're not ready + h.Ready("foo", false) + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + // Stop the Health object + h.Stop() +} + +func TestNotReadyFromOneService(t *testing.T) { + // Create a new Health object + cl := clockwork.NewFakeClock() + h := &Health{ + Clock: cl, + } + // Start the Health object + h.Start() + h.Register("foo", 1500*time.Millisecond) + h.Register("bar", 1500*time.Millisecond) + h.Register("baz", 1500*time.Millisecond) + h.Ready("foo", true) + h.Ready("bar", true) + h.Ready("baz", true) + assert.True(t, h.IsAlive()) + assert.True(t, h.IsReady()) + + // make bar not ready + h.Ready("bar", false) + cl.Advance(500 * time.Millisecond) + time.Sleep(1 * time.Millisecond) // give goroutines time to run + assert.True(t, h.IsAlive()) + assert.False(t, h.IsReady()) + // Stop the Health object + h.Stop() +} diff --git a/internal/otelutil/otel_tracing.go b/internal/otelutil/otel_tracing.go new file mode 100644 index 0000000000..c48747e88f --- /dev/null +++ b/internal/otelutil/otel_tracing.go @@ -0,0 +1,142 @@ +package otelutil + +import ( + "context" + "crypto/tls" + "fmt" + "log" + "net/url" + "strings" + + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/types" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/sdk/resource" + samplers 
"go.opentelemetry.io/otel/sdk/trace" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" +) + +// telemetry helpers + +func AddException(span trace.Span, err error) { + span.AddEvent("exception", trace.WithAttributes( + attribute.KeyValue{Key: "exception.type", Value: attribute.StringValue("error")}, + attribute.KeyValue{Key: "exception.message", Value: attribute.StringValue(err.Error())}, + attribute.KeyValue{Key: "exception.stacktrace", Value: attribute.StringValue("stacktrace")}, + attribute.KeyValue{Key: "exception.escaped", Value: attribute.BoolValue(false)}, + )) +} + +// addSpanField adds a field to a span, using the appropriate method for the type of the value. +func AddSpanField(span trace.Span, key string, value interface{}) { + span.SetAttributes(Attributes(map[string]interface{}{key: value})...) +} + +// AddSpanFields adds multiple fields to a span, using the appropriate method for the type of each value. +func AddSpanFields(span trace.Span, fields map[string]interface{}) { + span.SetAttributes(Attributes(fields)...) +} + +// Attributes converts a map of fields to a slice of attribute.KeyValue, setting types appropriately. +func Attributes(fields map[string]interface{}) []attribute.KeyValue { + attrs := make([]attribute.KeyValue, 0, len(fields)) + for k, v := range fields { + kv := attribute.KeyValue{Key: attribute.Key(k)} + switch val := v.(type) { + case string: + kv.Value = attribute.StringValue(val) + case int: + kv.Value = attribute.IntValue(val) + case int64: + kv.Value = attribute.Int64Value(val) + case float64: + kv.Value = attribute.Float64Value(val) + case bool: + kv.Value = attribute.BoolValue(val) + default: + kv.Value = attribute.StringValue(fmt.Sprintf("%v", val)) + } + attrs = append(attrs, kv) + } + return attrs +} + +// Starts a span with no extra fields. +func StartSpan(ctx context.Context, tracer trace.Tracer, name string) (context.Context, trace.Span) { + return tracer.Start(ctx, name) +} + +// Starts a span with a single field. +func StartSpanWith(ctx context.Context, tracer trace.Tracer, name string, field string, value interface{}) (context.Context, trace.Span) { + return tracer.Start(ctx, name, trace.WithAttributes(Attributes(map[string]interface{}{field: value})...)) +} + +// Starts a span with multiple fields. 
+func StartSpanMulti(ctx context.Context, tracer trace.Tracer, name string, fields map[string]interface{}) (context.Context, trace.Span) { + return tracer.Start(ctx, name, trace.WithAttributes(Attributes(fields)...)) +} + +func SetupTracing(cfg config.OTelTracingConfig, resourceLibrary string, resourceVersion string) (tracer trace.Tracer, shutdown func()) { + if !cfg.Enabled { + pr := noop.NewTracerProvider() + return pr.Tracer(resourceLibrary, trace.WithInstrumentationVersion(resourceVersion)), func() {} + } + + cfg.APIHost = strings.TrimSuffix(cfg.APIHost, "/") + apihost, err := url.Parse(fmt.Sprintf("%s:443", cfg.APIHost)) + if err != nil { + log.Fatalf("failed to parse otel API host: %v", err) + } + + sampleRate := cfg.SampleRate + if sampleRate < 1 { + sampleRate = 1 + } + + var sampleRatio float64 = 1.0 / float64(sampleRate) + + // set up honeycomb specific headers if an API key is provided + headers := make(map[string]string) + if cfg.APIKey != "" { + headers = map[string]string{ + types.APIKeyHeader: cfg.APIKey, + } + + if types.IsLegacyAPIKey(cfg.APIKey) { + headers[types.DatasetHeader] = cfg.Dataset + } + } + + tlsconfig := &tls.Config{} + secureOption := otlptracehttp.WithTLSClientConfig(tlsconfig) + exporter, err := otlptrace.New( + context.Background(), + otlptracehttp.NewClient( + secureOption, + otlptracehttp.WithEndpoint(apihost.Host), + otlptracehttp.WithHeaders(headers), + otlptracehttp.WithCompression(otlptracehttp.GzipCompression), + ), + ) + if err != nil { + log.Fatalf("failure configuring otel trace exporter: %v", err) + } + + bsp := sdktrace.NewBatchSpanProcessor(exporter) + otel.SetTracerProvider(sdktrace.NewTracerProvider( + sdktrace.WithSpanProcessor(bsp), + sdktrace.WithSampler(samplers.TraceIDRatioBased(sampleRatio)), + sdktrace.WithResource(resource.NewWithAttributes(semconv.SchemaURL, semconv.ServiceNameKey.String(cfg.Dataset))), + )) + + return otel.Tracer(resourceLibrary, trace.WithInstrumentationVersion(resourceVersion)), func() { + bsp.Shutdown(context.Background()) + exporter.Shutdown(context.Background()) + } +} diff --git a/internal/peer/dns.go b/internal/peer/dns.go index f9998b1ace..72bb6c9fb4 100644 --- a/internal/peer/dns.go +++ b/internal/peer/dns.go @@ -19,15 +19,15 @@ var ( peerPort int = 8193 ) -type dnsPeers struct { +type DnsPeers struct { c config.Config peers []string peerLock sync.Mutex callbacks []func() } -func newDnsPeers(c config.Config, done chan struct{}) (Peers, error) { - peers := &dnsPeers{ +func NewDnsPeers(c config.Config, done chan struct{}) (Peers, error) { + peers := &DnsPeers{ c: c, } peerList, err := peers.getFromDns() @@ -44,7 +44,7 @@ func newDnsPeers(c config.Config, done chan struct{}) (Peers, error) { return peers, nil } -func (p *dnsPeers) getFromDns() ([]string, error) { +func (p *DnsPeers) getFromDns() ([]string, error) { ips, err := net.LookupIP(internalAddr) if err != nil { return nil, err @@ -62,7 +62,7 @@ func (p *dnsPeers) getFromDns() ([]string, error) { return addrs, nil } -func (p *dnsPeers) GetPeers() ([]string, error) { +func (p *DnsPeers) GetPeers() ([]string, error) { p.peerLock.Lock() defer p.peerLock.Unlock() retList := make([]string, len(p.peers)) @@ -70,7 +70,40 @@ func (p *dnsPeers) GetPeers() ([]string, error) { return retList, nil } -func (p *dnsPeers) watchPeers(done chan struct{}) { +func (p *DnsPeers) GetInstanceID() (string, error) { + p.peerLock.Lock() + defer p.peerLock.Unlock() + machineID, _ := os.Hostname() + + appName := os.Getenv("FLY_APP_NAME") + sixpnAddr := 
fmt.Sprintf("%s.vm.%s.internal", machineID, appName) + + ips, err := net.LookupIP(sixpnAddr) + if err != nil { + return "", err + } + + if len(ips) < 0 { + return "", fmt.Errorf("dns result empty") + } + + addr := url.URL{ + Scheme: "http", + Host: net.JoinHostPort(ips[0].String(), strconv.Itoa(peerPort)), + } + + return addr.String(), nil +} + +func (p *DnsPeers) Start() (err error) { + return nil +} + +func (p *DnsPeers) Ready() error { + return nil +} + +func (p *DnsPeers) watchPeers(done chan struct{}) { oldPeerList := p.peers sort.Strings(oldPeerList) tk := time.NewTicker(refreshCacheInterval) @@ -110,6 +143,21 @@ func (p *dnsPeers) watchPeers(done chan struct{}) { } } -func (p *dnsPeers) RegisterUpdatedPeersCallback(callback func()) { +func (p *DnsPeers) RegisterUpdatedPeersCallback(callback func()) { p.callbacks = append(p.callbacks, callback) } + +// equal tells whether a and b contain the same elements. +// A nil argument is equivalent to an empty slice. +// lifted from https://yourbasic.org/golang/compare-slices/ +func equal(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i, v := range a { + if v != b[i] { + return false + } + } + return true +} diff --git a/internal/peer/file.go b/internal/peer/file.go index b47228a585..b995183152 100644 --- a/internal/peer/file.go +++ b/internal/peer/file.go @@ -1,31 +1,71 @@ package peer -import "github.com/honeycombio/refinery/config" +import ( + "fmt" + "net" -type filePeers struct { - c config.Config -} + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/metrics" +) -// NewFilePeers returns a peers collection backed by the config file -func newFilePeers(c config.Config) Peers { - return &filePeers{ - c: c, - } +var _ Peers = (*FilePeers)(nil) + +type FilePeers struct { + Cfg config.Config `inject:""` + Metrics metrics.Metrics `inject:"metrics"` + Done chan struct{} + + id string } -func (p *filePeers) GetPeers() ([]string, error) { +func (p *FilePeers) GetPeers() ([]string, error) { // we never want to return an empty list of peers, so if the config // returns an empty list, return a single peer. This keeps the sharding // logic happy. 
- peers, err := p.c.GetPeers() + peers := p.Cfg.GetPeers() if len(peers) == 0 { - peers = []string{"http://127.0.0.1:8081"} + addr, err := p.publicAddr() + if err != nil { + return nil, err + } + peers = []string{addr} } - return peers, err + p.Metrics.Gauge("num_file_peers", float64(len(peers))) + return peers, nil } -func (p *filePeers) RegisterUpdatedPeersCallback(callback func()) { +func (p *FilePeers) GetInstanceID() (string, error) { + return p.id, nil +} + +func (p *FilePeers) RegisterUpdatedPeersCallback(callback func()) { // whenever registered, call the callback immediately // otherwise do nothing since they never change callback() } + +func (p *FilePeers) Start() (err error) { + p.Metrics.Register("num_file_peers", "gauge") + + p.id, err = p.publicAddr() + if err != nil { + return err + } + + return nil +} + +func (p *FilePeers) Ready() error { + return nil +} + +func (p *FilePeers) publicAddr() (string, error) { + addr := p.Cfg.GetPeerListenAddr() + host, port, err := net.SplitHostPort(addr) + if err != nil { + return "", err + } + + return fmt.Sprintf("http://%s:%s", host, port), nil +} diff --git a/internal/peer/file_test.go b/internal/peer/file_test.go index f913cbb385..adb666d256 100644 --- a/internal/peer/file_test.go +++ b/internal/peer/file_test.go @@ -10,9 +10,14 @@ func TestFilePeers(t *testing.T) { peers := []string{"peer"} c := &config.MockConfig{ - GetPeersVal: peers, + PeerManagementType: "file", + GetPeersVal: peers, + GetPeerListenAddrVal: "10.244.0.114:8081", + } + p, err := newPeers(c) + if err != nil { + t.Fatal(err) } - p := newFilePeers(c) if d, _ := p.GetPeers(); !(len(d) == 1 && d[0] == "peer") { t.Error("received", d, "expected", "[peer]") diff --git a/internal/peer/mock.go b/internal/peer/mock.go index 2bca0c5c10..2a774e4152 100644 --- a/internal/peer/mock.go +++ b/internal/peer/mock.go @@ -1,12 +1,31 @@ package peer +var _ Peers = (*MockPeers)(nil) + type MockPeers struct { Peers []string + ID string } func (p *MockPeers) GetPeers() ([]string, error) { return p.Peers, nil } + +func (p *MockPeers) GetInstanceID() (string, error) { + return p.ID, nil +} + func (p *MockPeers) RegisterUpdatedPeersCallback(callback func()) { callback() } + +func (p *MockPeers) Start() error { + if len(p.ID) == 0 && len(p.Peers) > 0 { + p.ID = p.Peers[0] + } + return nil +} + +func (p *MockPeers) Ready() error { + return nil +} diff --git a/internal/peer/peers.go b/internal/peer/peers.go index fe16bc1ce9..988d81892e 100644 --- a/internal/peer/peers.go +++ b/internal/peer/peers.go @@ -4,30 +4,31 @@ import ( "context" "errors" + "github.com/facebookgo/startstop" "github.com/honeycombio/refinery/config" ) // Peers holds the collection of peers for the cluster type Peers interface { GetPeers() ([]string, error) - + GetInstanceID() (string, error) RegisterUpdatedPeersCallback(callback func()) + + Ready() error + // make it injectable + startstop.Starter } func NewPeers(ctx context.Context, c config.Config, done chan struct{}) (Peers, error) { - t, err := c.GetPeerManagementType() - - if err != nil { - return nil, err - } + t := c.GetPeerManagementType() switch t { case "file": - return newFilePeers(c), nil + return &FilePeers{Done: done}, nil case "redis": - return newRedisPeers(ctx, c, done) + return &RedisPubsubPeers{Done: done}, nil case "fly-dns": - return newDnsPeers(c, done) + return NewDnsPeers(c, done) default: return nil, errors.New("invalid config option 'PeerManagement.Type'") } diff --git a/internal/peer/peers_test.go b/internal/peer/peers_test.go
index e481d0796c..390468ab1b 100644 --- a/internal/peer/peers_test.go +++ b/internal/peer/peers_test.go @@ -1,51 +1,85 @@ package peer import ( - "context" + "errors" + "fmt" + "os" "strings" "testing" "time" + "github.com/facebookgo/inject" + "github.com/facebookgo/startstop" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" + "github.com/jonboulle/clockwork" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" ) -func TestNewPeers(t *testing.T) { - c := &config.MockConfig{ - PeerManagementType: "file", - PeerTimeout: 5 * time.Second, - TraceIdFieldNames: []string{"trace.trace_id"}, - ParentIdFieldNames: []string{"trace.parent_id"}, - } - - done := make(chan struct{}) - defer close(done) - p, err := NewPeers(context.Background(), c, done) - assert.NoError(t, err) - require.NotNil(t, p) - - switch i := p.(type) { - case *filePeers: +func newPeers(c config.Config) (Peers, error) { + var peers Peers + var pubsubber pubsub.PubSub + ptype := c.GetPeerManagementType() + switch ptype { + case "file": + peers = &FilePeers{ + Cfg: c, + Metrics: &metrics.NullMetrics{}, + } + // we know FilePeers doesn't need to be Started, so as long as we gave it a Cfg above, + // we can ask it how many peers we have. + // if we only have one, we can use the local pubsub implementation. + peerList, err := peers.GetPeers() + if err != nil { + return nil, err + } + if len(peerList) == 1 { + pubsubber = &pubsub.LocalPubSub{} + } else { + pubsubber = &pubsub.GoRedisPubSub{} + } + case "redis": + pubsubber = &pubsub.GoRedisPubSub{ + Metrics: &metrics.NullMetrics{}, + Tracer: noop.NewTracerProvider().Tracer("test"), + } + peers = &RedisPubsubPeers{} default: - t.Errorf("received %T expected %T", i, &filePeers{}) + // this should have been caught by validation + return nil, errors.New("invalid config option 'PeerManagement.Type'") } - c = &config.MockConfig{ - GetPeerListenAddrVal: "0.0.0.0:8081", - PeerManagementType: "redis", - PeerTimeout: 5 * time.Second, + // we need to include all the metrics types so we can inject them in case they're needed + var g inject.Graph + objects := []*inject.Object{ + {Value: c}, + {Value: peers}, + {Value: pubsubber}, + {Value: &metrics.NullMetrics{}, Name: "metrics"}, + {Value: &logger.NullLogger{}}, + {Value: clockwork.NewFakeClock()}, + } + err := g.Provide(objects...) + if err != nil { + return nil, fmt.Errorf("failed to provide injection graph. error: %+v\n", err) } - p, err = NewPeers(context.Background(), c, done) - assert.NoError(t, err) - require.NotNil(t, p) + if err := g.Populate(); err != nil { + return nil, fmt.Errorf("failed to populate injection graph. error: %+v\n", err) + } - switch i := p.(type) { - case *redisPeers: - default: - t.Errorf("received %T expected %T", i, &redisPeers{}) + ststLogger := logrus.New() + ststLogger.SetLevel(logrus.InfoLevel) + if err := startstop.Start(g.Objects(), ststLogger); err != nil { + fmt.Printf("failed to start injected dependencies. 
error: %+v\n", err) + os.Exit(1) + } + return peers, nil } func TestPeerShutdown(t *testing.T) { @@ -55,12 +89,13 @@ func TestPeerShutdown(t *testing.T) { PeerTimeout: 5 * time.Second, } + p, err := newPeers(c) + require.NoError(t, err) + done := make(chan struct{}) - p, err := NewPeers(context.Background(), c, done) - assert.NoError(t, err) require.NotNil(t, p) - peer, ok := p.(*redisPeers) + peer, ok := p.(*RedisPubsubPeers) assert.True(t, ok) peers, err := peer.GetPeers() @@ -74,6 +109,6 @@ func TestPeerShutdown(t *testing.T) { assert.Eventually(t, func() bool { peers, err = peer.GetPeers() assert.NoError(t, err) - return len(peers) == 0 + return len(peers) == 1 }, 5*time.Second, 200*time.Millisecond) } diff --git a/internal/peer/pubsub_redis.go b/internal/peer/pubsub_redis.go new file mode 100644 index 0000000000..99d656b40b --- /dev/null +++ b/internal/peer/pubsub_redis.go @@ -0,0 +1,324 @@ +package peer + +import ( + "context" + "errors" + "fmt" + "math/rand" + "net" + "os" + "strings" + "time" + + "github.com/dgryski/go-wyhash" + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/generics" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" + "github.com/jonboulle/clockwork" +) + +const ( + // PeerEntryTimeout is how long we will wait before expiring a peer that + // doesn't check in. The ratio of refresh to peer timeout should be about + // 1/3; we overshoot because we add a jitter to the refresh interval. + PeerEntryTimeout = 10 * time.Second + + // refreshCacheInterval is how frequently this host will re-register itself + // by publishing its address. This should happen about 3x during each + // timeout phase in order to allow multiple timeouts to fail and yet still + // keep the host in the mix. + refreshCacheInterval = 3 * time.Second +) + +type peerAction string + +const ( + Register peerAction = "R" + Unregister peerAction = "U" +) + +type peerCommand struct { + action peerAction + peer string +} + +func newPeerCommand(action peerAction, peer string) *peerCommand { + return &peerCommand{ + action: action, + peer: peer, + } +} + +func (p *peerCommand) unmarshal(msg string) bool { + if len(msg) < 2 { + return false + } + p.action = peerAction(msg[:1]) + p.peer = msg[1:] + switch p.action { + case Register, Unregister: + return true + default: + return false + } +} + +func (p *peerCommand) marshal() string { + return string(p.action) + p.peer +} + +var _ Peers = (*RedisPubsubPeers)(nil) + +type RedisPubsubPeers struct { + Config config.Config `inject:""` + Metrics metrics.Metrics `inject:"metrics"` + Logger logger.Logger `inject:""` + PubSub pubsub.PubSub `inject:""` + Clock clockwork.Clock `inject:""` + + // Done is a channel that will be closed when the service should stop. + // After it is closed, the peers service should signal the rest of the cluster + // that it is no longer available. + // However, any messages sent on the peers channel will still be processed + // since the pubsub subscription is still active.
+ Done chan struct{} + + peers *generics.SetWithTTL[string] + hash uint64 + callbacks []func() + sub pubsub.Subscription +} + +// checkHash checks the hash of the current list of peers and calls any registered callbacks +func (p *RedisPubsubPeers) checkHash() { + peers := p.peers.Members() + newhash := hashList(peers) + if newhash != p.hash { + p.hash = newhash + for _, cb := range p.callbacks { + go cb() + } + } + p.Metrics.Gauge("num_peers", float64(len(peers))) + p.Metrics.Gauge("peer_hash", float64(p.hash)) +} + +func (p *RedisPubsubPeers) listen(ctx context.Context, msg string) { + cmd := &peerCommand{} + if !cmd.unmarshal(msg) { + return + } + p.Metrics.Count("peer_messages", 1) + switch cmd.action { + case Unregister: + p.peers.Remove(cmd.peer) + case Register: + p.peers.Add(cmd.peer) + } + p.checkHash() +} + +func (p *RedisPubsubPeers) Start() error { + if p.PubSub == nil { + return errors.New("injected pubsub is nil") + } + // if we didn't get an injected logger or metrics, use the null ones (for tests) + if p.Metrics == nil { + p.Metrics = &metrics.NullMetrics{} + } + if p.Logger == nil { + p.Logger = &logger.NullLogger{} + } + + p.peers = generics.NewSetWithTTL[string](PeerEntryTimeout) + p.callbacks = make([]func(), 0) + p.Logger.Info().Logf("subscribing to pubsub peers channel") + p.sub = p.PubSub.Subscribe(context.Background(), "peers", p.listen) + + p.Metrics.Register("num_peers", "gauge") + p.Metrics.Register("peer_hash", "gauge") + p.Metrics.Register("peer_messages", "counter") + + myaddr, err := p.publicAddr() + if err != nil { + return err + } + p.peers.Add(myaddr) + return nil +} + +func (p *RedisPubsubPeers) Ready() error { + myaddr, err := p.publicAddr() + if err != nil { + return err + } + // periodically refresh our presence in the list of peers, and update peers as they come in + go func() { + // we want our refresh cache interval to vary from peer to peer so they + // don't always hit redis at the same time, so we add a random jitter of up + // to 20% of the interval + interval := refreshCacheInterval + time.Duration(rand.Int63n(int64(refreshCacheInterval/5))) + ticker := p.Clock.NewTicker(interval) + defer ticker.Stop() + + // every 25-35 seconds, log the current state of the peers + // (we could make this configurable if we wanted but it's not that important) + logTicker := p.Clock.NewTicker((time.Duration(rand.Intn(10000))*time.Millisecond + (25 * time.Second))) + defer logTicker.Stop() + for { + select { + case <-p.Done: + p.stop() + return + case <-ticker.Chan(): + + // publish our presence periodically + ctx, cancel := context.WithTimeout(context.Background(), p.Config.GetPeerTimeout()) + err := p.PubSub.Publish(ctx, "peers", newPeerCommand(Register, myaddr).marshal()) + if err != nil { + p.Logger.Error().WithFields(map[string]interface{}{ + "error": err, + "hostaddress": myaddr, + }).Logf("failed to publish peer address") + } + cancel() + case <-logTicker.Chan(): + p.Logger.Debug().WithFields(map[string]any{ + "peers": p.peers.Members(), + "hash": p.hash, + "num_peers": len(p.peers.Members()), + "self": myaddr, + }).Logf("peer report") + } + } + }() + + return nil +} + +// stop sends a message to the pubsub channel to unregister this peer, +// but it does not close the subscription.
+func (p *RedisPubsubPeers) stop() { + // unregister ourselves + myaddr, err := p.publicAddr() + if err != nil { + p.Logger.Error().Logf("failed to get public address") + return + } + + err = p.PubSub.Publish(context.Background(), "peers", newPeerCommand(Unregister, myaddr).marshal()) + if err != nil { + p.Logger.Error().WithFields(map[string]interface{}{ + "error": err, + "hostaddress": myaddr, + }).Logf("failed to publish peer address") + } +} + +func (p *RedisPubsubPeers) GetPeers() ([]string, error) { + // we never want to return an empty list of peers, so if the system returns + // an empty list, return a single peer (its name doesn't really matter). + // This keeps the sharding logic happy. + peers := p.peers.Members() + if len(peers) == 0 { + peers = []string{"http://127.0.0.1:8081"} + } + return peers, nil +} + +func (p *RedisPubsubPeers) GetInstanceID() (string, error) { + return p.publicAddr() +} + +func (p *RedisPubsubPeers) RegisterUpdatedPeersCallback(callback func()) { + p.callbacks = append(p.callbacks, callback) +} + +func (p *RedisPubsubPeers) publicAddr() (string, error) { + // compute the public version of my peer listen address + listenAddr := p.Config.GetPeerListenAddr() + // first, extract the port + _, port, err := net.SplitHostPort(listenAddr) + + if err != nil { + return "", err + } + + var myIdentifier string + + // If RedisIdentifier is set, use as identifier. + if redisIdentifier := p.Config.GetRedisIdentifier(); redisIdentifier != "" { + myIdentifier = redisIdentifier + p.Logger.Info().WithField("identifier", myIdentifier).Logf("using specified RedisIdentifier from config") + } else { + // Otherwise, determine identifier from network interface. + myIdentifier, err = p.getIdentifierFromInterface() + if err != nil { + return "", err + } + } + + publicListenAddr := fmt.Sprintf("http://%s:%s", myIdentifier, port) + + return publicListenAddr, nil +} + +// getIdentifierFromInterface returns a string that uniquely identifies this +// host in the network. If an interface is specified, it will scan it to +// determine an identifier from the first IP address on that interface. +// Otherwise, it will use the hostname. +func (p *RedisPubsubPeers) getIdentifierFromInterface() (string, error) { + myIdentifier, _ := os.Hostname() + identifierInterfaceName := p.Config.GetIdentifierInterfaceName() + + if identifierInterfaceName != "" { + ifc, err := net.InterfaceByName(identifierInterfaceName) + if err != nil { + p.Logger.Error().WithField("interface", identifierInterfaceName). + Logf("IdentifierInterfaceName set but couldn't find interface by that name") + return "", err + } + addrs, err := ifc.Addrs() + if err != nil { + p.Logger.Error().WithField("interface", identifierInterfaceName). + Logf("IdentifierInterfaceName set but couldn't list addresses") + return "", err + } + var ipStr string + for _, addr := range addrs { + // ParseIP doesn't know what to do with the suffix + ip := net.ParseIP(strings.Split(addr.String(), "/")[0]) + ipv6 := p.Config.GetUseIPV6Identifier() + if ipv6 && ip.To16() != nil { + ipStr = fmt.Sprintf("[%s]", ip.String()) + break + } + if !ipv6 && ip.To4() != nil { + ipStr = ip.String() + break + } + } + if ipStr == "" { + err = errors.New("could not find a valid IP to use from interface") + p.Logger.Error().WithField("interface", ifc.Name). 
+ Logf("IdentifierInterfaceName set but couldn't find a valid IP to use from interface") + return "", err + } + myIdentifier = ipStr + p.Logger.Info().WithField("identifier", myIdentifier).WithField("interface", ifc.Name). + Logf("using identifier from interface") + } + + return myIdentifier, nil +} + +// hashList hashes a list of strings into a single uint64 +func hashList(list []string) uint64 { + var h uint64 = 255798297204 // arbitrary seed + for _, s := range list { + h = wyhash.Hash([]byte(s), h) + } + return h +} diff --git a/internal/peer/pubsub_test.go b/internal/peer/pubsub_test.go new file mode 100644 index 0000000000..d0b02cb0b0 --- /dev/null +++ b/internal/peer/pubsub_test.go @@ -0,0 +1,56 @@ +package peer + +import ( + "testing" + + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/logger" + "github.com/stretchr/testify/assert" +) + +func Test_publicAddr(t *testing.T) { + cfg := &config.MockConfig{ + GetPeerListenAddrVal: "127.0.0.1:3443", + RedisIdentifier: "somehostname", + IdentifierInterfaceName: "en0", + } + tests := []struct { + name string + c config.Config + want string + wantErr bool + }{ + {"basic", cfg, "http://somehostname:3443", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + peers := &RedisPubsubPeers{ + Config: tt.c, + Logger: &logger.NullLogger{}, + } + got, err := peers.publicAddr() + if (err != nil) != tt.wantErr { + t.Errorf("publicAddr() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("publicAddr() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPeerActions(t *testing.T) { + cmd := newPeerCommand(Register, "foo") + assert.Equal(t, "Rfoo", cmd.marshal()) + assert.Equal(t, "foo", cmd.peer) + assert.Equal(t, Register, cmd.action) + cmd2 := peerCommand{} + b := cmd2.unmarshal("Ubar") + assert.True(t, b) + assert.Equal(t, "bar", cmd2.peer) + assert.Equal(t, Unregister, cmd2.action) + + b = cmd2.unmarshal("invalid") + assert.False(t, b) +} diff --git a/internal/peer/redis.go b/internal/peer/redis.go deleted file mode 100644 index a8c0cba9b5..0000000000 --- a/internal/peer/redis.go +++ /dev/null @@ -1,359 +0,0 @@ -package peer - -import ( - "context" - "crypto/tls" - "errors" - "fmt" - "net" - "os" - "sort" - "strings" - "sync" - "time" - - "github.com/gomodule/redigo/redis" - "github.com/honeycombio/refinery/config" - "github.com/honeycombio/refinery/internal/redimem" - "github.com/sirupsen/logrus" -) - -const ( - // refreshCacheInterval is how frequently this host will re-register itself - // with Redis. This should happen about 3x during each timeout phase in order - // to allow multiple timeouts to fail and yet still keep the host in the mix. - // Falling out of Redis will result in re-hashing the host-trace affinity and - // will cause broken traces for those that fall on both sides of the rehashing. - // This is why it's important to ensure hosts stay in the pool. - refreshCacheInterval = 3 * time.Second - - // peerEntryTimeout is how long redis will wait before expiring a peer that - // doesn't check in. The ratio of refresh to peer timeout should be 1/3. Redis - // timeouts are in seconds and entries can last up to 2 seconds longer than - // their expected timeout (in my load testing), so the lower bound for this - // timer should be ... 5sec? 
- peerEntryTimeout = 10 * time.Second -) - -type redisPeers struct { - store *redimem.RedisMembership - peers []string - peerLock sync.Mutex - c config.Config - callbacks []func() - publicAddr string -} - -// NewRedisPeers returns a peers collection backed by redis -func newRedisPeers(ctx context.Context, c config.Config, done chan struct{}) (Peers, error) { - redisHost, _ := c.GetRedisHost() - - if redisHost == "" { - redisHost = "localhost:6379" - } - - options := buildOptions(c) - pool := &redis.Pool{ - MaxIdle: 3, - MaxActive: 30, - IdleTimeout: 5 * time.Minute, - Wait: true, - Dial: func() (redis.Conn, error) { - // if redis is started at the same time as refinery, connecting to redis can - // fail and cause refinery to error out. - // Instead, we will try to connect to redis for up to 10 seconds with - // a 1 second delay between attempts to allow the redis process to init - var ( - conn redis.Conn - err error - ) - for timeout := time.After(10 * time.Second); ; { - select { - case <-timeout: - return nil, err - default: - if authCode, _ := c.GetRedisAuthCode(); authCode != "" { - conn, err = redis.Dial("tcp", redisHost, options...) - if err != nil { - return nil, err - } - if _, err := conn.Do("AUTH", authCode); err != nil { - conn.Close() - return nil, err - } - if err == nil { - return conn, nil - } - } else { - conn, err = redis.Dial("tcp", redisHost, options...) - if err == nil { - return conn, nil - } - } - time.Sleep(time.Second) - } - } - }, - } - - // deal with this error - address, err := publicAddr(c) - - if err != nil { - return nil, err - } - - peers := &redisPeers{ - store: &redimem.RedisMembership{ - Prefix: c.GetRedisPrefix(), - Pool: pool, - }, - peers: make([]string, 1), - c: c, - callbacks: make([]func(), 0), - publicAddr: address, - } - - // register myself once - err = peers.store.Register(ctx, address, peerEntryTimeout) - if err != nil { - logrus.WithError(err).Errorf("failed to register self with redis peer store") - return nil, err - } - - // go establish a regular registration heartbeat to ensure I stay alive in redis - go peers.registerSelf(done) - - // get our peer list once to seed ourselves - peers.updatePeerListOnce() - - // go watch the list of peers and trigger callbacks whenever it changes. - // populate my local list of peers so each request can hit memory and only hit - // redis on a ticker - go peers.watchPeers(done) - - return peers, nil -} - -func (p *redisPeers) GetPeers() ([]string, error) { - p.peerLock.Lock() - defer p.peerLock.Unlock() - retList := make([]string, len(p.peers)) - copy(retList, p.peers) - return retList, nil -} - -func (p *redisPeers) RegisterUpdatedPeersCallback(cb func()) { - p.callbacks = append(p.callbacks, cb) -} - -// registerSelf inserts self into the peer list and updates self's entry on a -// regular basis so it doesn't time out and get removed from the list of peers. -// When this function stops, it tries to remove the registered key. -func (p *redisPeers) registerSelf(done chan struct{}) { - tk := time.NewTicker(refreshCacheInterval) - for { - select { - case <-tk.C: - ctx, cancel := context.WithTimeout(context.Background(), p.c.GetPeerTimeout()) - // every interval, insert a timeout record. we ignore the error - // here since Register() logs the error for us. 
- p.store.Register(ctx, p.publicAddr, peerEntryTimeout) - cancel() - case <-done: - // unregister ourselves - ctx, cancel := context.WithTimeout(context.Background(), p.c.GetPeerTimeout()) - p.store.Unregister(ctx, p.publicAddr) - cancel() - return - } - } -} - -func (p *redisPeers) updatePeerListOnce() { - ctx, cancel := context.WithTimeout(context.Background(), p.c.GetPeerTimeout()) - defer cancel() - - currentPeers, err := p.store.GetMembers(ctx) - if err != nil { - logrus.WithError(err). - WithFields(logrus.Fields{ - "name": p.publicAddr, - "timeout": p.c.GetPeerTimeout().String(), - }). - Error("get members failed") - return - } - sort.Strings(currentPeers) - // update peer list and trigger callbacks saying the peer list has changed - p.peerLock.Lock() - p.peers = currentPeers - p.peerLock.Unlock() -} - -func (p *redisPeers) watchPeers(done chan struct{}) { - oldPeerList := p.peers - sort.Strings(oldPeerList) - tk := time.NewTicker(refreshCacheInterval) - - for { - select { - case <-tk.C: - ctx, cancel := context.WithTimeout(context.Background(), p.c.GetPeerTimeout()) - currentPeers, err := p.store.GetMembers(ctx) - cancel() - - if err != nil { - logrus.WithError(err). - WithFields(logrus.Fields{ - "name": p.publicAddr, - "timeout": p.c.GetPeerTimeout().String(), - "oldPeers": oldPeerList, - }). - Error("get members failed during watch") - continue - } - - sort.Strings(currentPeers) - if !equal(oldPeerList, currentPeers) { - // update peer list and trigger callbacks saying the peer list has changed - p.peerLock.Lock() - p.peers = currentPeers - oldPeerList = currentPeers - p.peerLock.Unlock() - for _, callback := range p.callbacks { - // don't block on any of the callbacks. - go callback() - } - } - case <-done: - p.peerLock.Lock() - p.peers = []string{} - p.peerLock.Unlock() - return - } - } -} - -func buildOptions(c config.Config) []redis.DialOption { - options := []redis.DialOption{ - redis.DialReadTimeout(1 * time.Second), - redis.DialConnectTimeout(1 * time.Second), - redis.DialDatabase(c.GetRedisDatabase()), - } - - username, _ := c.GetRedisUsername() - if username != "" { - options = append(options, redis.DialUsername(username)) - } - - password, _ := c.GetRedisPassword() - if password != "" { - options = append(options, redis.DialPassword(password)) - } - - useTLS, _ := c.GetUseTLS() - tlsInsecure, _ := c.GetUseTLSInsecure() - if useTLS { - tlsConfig := &tls.Config{ - MinVersion: tls.VersionTLS12, - } - - if tlsInsecure { - tlsConfig.InsecureSkipVerify = true - } - - options = append(options, - redis.DialTLSConfig(tlsConfig), - redis.DialUseTLS(true)) - } - - return options -} - -func publicAddr(c config.Config) (string, error) { - // compute the public version of my peer listen address - listenAddr, _ := c.GetPeerListenAddr() - _, port, err := net.SplitHostPort(listenAddr) - - if err != nil { - return "", err - } - - var myIdentifier string - - // If RedisIdentifier is set, use as identifier. - if redisIdentifier, _ := c.GetRedisIdentifier(); redisIdentifier != "" { - myIdentifier = redisIdentifier - logrus.WithField("identifier", myIdentifier).Info("using specified RedisIdentifier from config") - } else { - // Otherwise, determine identifier from network interface. - myIdentifier, err = getIdentifierFromInterfaces(c) - if err != nil { - return "", err - } - } - - publicListenAddr := fmt.Sprintf("http://%s:%s", myIdentifier, port) - - return publicListenAddr, nil -} - -// Scan network interfaces to determine an identifier from either IP or hostname. 
-func getIdentifierFromInterfaces(c config.Config) (string, error) { - myIdentifier, _ := os.Hostname() - identifierInterfaceName, _ := c.GetIdentifierInterfaceName() - - if identifierInterfaceName != "" { - ifc, err := net.InterfaceByName(identifierInterfaceName) - if err != nil { - logrus.WithError(err).WithField("interface", identifierInterfaceName). - Error("IdentifierInterfaceName set but couldn't find interface by that name") - return "", err - } - addrs, err := ifc.Addrs() - if err != nil { - logrus.WithError(err).WithField("interface", identifierInterfaceName). - Error("IdentifierInterfaceName set but couldn't list addresses") - return "", err - } - var ipStr string - for _, addr := range addrs { - // ParseIP doesn't know what to do with the suffix - ip := net.ParseIP(strings.Split(addr.String(), "/")[0]) - ipv6, _ := c.GetUseIPV6Identifier() - if ipv6 && ip.To16() != nil { - ipStr = fmt.Sprintf("[%s]", ip.String()) - break - } - if !ipv6 && ip.To4() != nil { - ipStr = ip.String() - break - } - } - if ipStr == "" { - err = errors.New("could not find a valid IP to use from interface") - logrus.WithField("interface", ifc.Name).WithError(err) - return "", err - } - myIdentifier = ipStr - logrus.WithField("identifier", myIdentifier).WithField("interface", ifc.Name).Info("using identifier from interface") - } - - return myIdentifier, nil -} - -// equal tells whether a and b contain the same elements. -// A nil argument is equivalent to an empty slice. -// lifted from https://yourbasic.org/golang/compare-slices/ -func equal(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i, v := range a { - if v != b[i] { - return false - } - } - return true -} diff --git a/logger/honeycomb.go b/logger/honeycomb.go index 8dfd15bd20..fa77cd1217 100644 --- a/logger/honeycomb.go +++ b/logger/honeycomb.go @@ -40,10 +40,7 @@ func (h *HoneycombLogger) Start() error { // preserve it. // TODO: make LogLevel part of the HoneycombLogger/LogrusLogger sections? 
h.level = h.Config.GetLoggerLevel() - loggerConfig, err := h.Config.GetHoneycombLoggerConfig() - if err != nil { - return err - } + loggerConfig := h.Config.GetHoneycombLoggerConfig() h.loggerConfig = loggerConfig var loggerTx transmission.Sender if h.loggerConfig.APIKey == "" { @@ -59,7 +56,7 @@ func (h *HoneycombLogger) Start() error { } } - if loggerConfig.SamplerEnabled { + if loggerConfig.GetSamplerEnabled() { h.sampler = &dynsampler.PerKeyThroughput{ ClearFrequencyDuration: 10 * time.Second, PerKeyThroughputPerSec: loggerConfig.SamplerThroughput, @@ -122,18 +119,11 @@ func (h *HoneycombLogger) readResponses() { } } -func (h *HoneycombLogger) reloadBuilder() { +func (h *HoneycombLogger) reloadBuilder(cfgHash, ruleHash string) { h.Debug().Logf("reloading config for Honeycomb logger") // preserve log level h.level = h.Config.GetLoggerLevel() - loggerConfig, err := h.Config.GetHoneycombLoggerConfig() - if err != nil { - // complain about this both to STDOUT and to the previously configured - // honeycomb logger - fmt.Printf("failed to reload configs for Honeycomb logger: %+v\n", err) - h.Error().Logf("failed to reload configs for Honeycomb logger: %+v", err) - return - } + loggerConfig := h.Config.GetHoneycombLoggerConfig() h.loggerConfig = loggerConfig h.builder.APIHost = h.loggerConfig.APIHost h.builder.WriteKey = h.loggerConfig.APIKey diff --git a/logger/logger.go b/logger/logger.go index 21eb019c08..bde15a12c1 100644 --- a/logger/logger.go +++ b/logger/logger.go @@ -29,16 +29,14 @@ type Entry interface { func GetLoggerImplementation(c config.Config) Logger { var logger Logger - loggerType, err := c.GetLoggerType() - if err != nil { - fmt.Printf("unable to get logger type from config: %v\n", err) - os.Exit(1) - } + loggerType := c.GetLoggerType() switch loggerType { case "honeycomb": logger = &HoneycombLogger{} case "stdout": logger = &StdoutLogger{} + case "none": + logger = &NullLogger{} default: fmt.Printf("unknown logger type %s. 
Exiting.\n", loggerType) os.Exit(1) diff --git a/logger/logrus.go b/logger/logrus.go index e18c41a268..0bfe9d847c 100644 --- a/logger/logrus.go +++ b/logger/logrus.go @@ -25,18 +25,15 @@ type StdoutLogger struct { var _ = Logger((*StdoutLogger)(nil)) type LogrusEntry struct { - entry *logrus.Entry - level logrus.Level + entry *logrus.Entry + level logrus.Level sampler dynsampler.Sampler } func (l *StdoutLogger) Start() error { l.logger = logrus.New() l.logger.SetLevel(l.level) - cfg, err := l.Config.GetStdoutLoggerConfig() - if err != nil { - return err - } + cfg := l.Config.GetStdoutLoggerConfig() if cfg.Structured { l.logger.SetFormatter(&logrus.JSONFormatter{}) @@ -44,11 +41,11 @@ func (l *StdoutLogger) Start() error { if cfg.SamplerEnabled { l.sampler = &dynsampler.PerKeyThroughput{ - ClearFrequencyDuration: 10*time.Second, + ClearFrequencyDuration: 10 * time.Second, PerKeyThroughputPerSec: cfg.SamplerThroughput, - MaxKeys: 1000, + MaxKeys: 1000, } - err = l.sampler.Start() + err := l.sampler.Start() if err != nil { return err } @@ -63,8 +60,8 @@ func (l *StdoutLogger) Debug() Entry { } return &LogrusEntry{ - entry: logrus.NewEntry(l.logger), - level: logrus.DebugLevel, + entry: logrus.NewEntry(l.logger), + level: logrus.DebugLevel, sampler: l.sampler, } } @@ -75,8 +72,8 @@ func (l *StdoutLogger) Info() Entry { } return &LogrusEntry{ - entry: logrus.NewEntry(l.logger), - level: logrus.InfoLevel, + entry: logrus.NewEntry(l.logger), + level: logrus.InfoLevel, sampler: l.sampler, } } @@ -87,8 +84,8 @@ func (l *StdoutLogger) Warn() Entry { } return &LogrusEntry{ - entry: logrus.NewEntry(l.logger), - level: logrus.WarnLevel, + entry: logrus.NewEntry(l.logger), + level: logrus.WarnLevel, sampler: l.sampler, } } @@ -99,8 +96,8 @@ func (l *StdoutLogger) Error() Entry { } return &LogrusEntry{ - entry: logrus.NewEntry(l.logger), - level: logrus.ErrorLevel, + entry: logrus.NewEntry(l.logger), + level: logrus.ErrorLevel, sampler: l.sampler, } } @@ -120,24 +117,24 @@ func (l *StdoutLogger) SetLevel(level string) error { func (l *LogrusEntry) WithField(key string, value interface{}) Entry { return &LogrusEntry{ - entry: l.entry.WithField(key, value), - level: l.level, + entry: l.entry.WithField(key, value), + level: l.level, sampler: l.sampler, } } func (l *LogrusEntry) WithString(key string, value string) Entry { return &LogrusEntry{ - entry: l.entry.WithField(key, value), - level: l.level, + entry: l.entry.WithField(key, value), + level: l.level, sampler: l.sampler, } } func (l *LogrusEntry) WithFields(fields map[string]interface{}) Entry { return &LogrusEntry{ - entry: l.entry.WithFields(fields), - level: l.level, + entry: l.entry.WithFields(fields), + level: l.level, sampler: l.sampler, } } @@ -148,8 +145,8 @@ func (l *LogrusEntry) Logf(f string, args ...interface{}) { // this will give us a different sample rate for each level and format string // and avoid high cardinality args making the throughput sampler less effective rate := l.sampler.GetSampleRate(fmt.Sprintf("%s:%s", l.level, f)) - if shouldDrop(uint(rate)){ - return + if shouldDrop(uint(rate)) { + return } l.entry.WithField("SampleRate", rate) } @@ -172,4 +169,4 @@ func shouldDrop(rate uint) bool { } return rand.Intn(int(rate)) != 0 -} \ No newline at end of file +} diff --git a/metrics/legacy.go b/metrics/legacy.go index 6993bbc350..8df829d355 100644 --- a/metrics/legacy.go +++ b/metrics/legacy.go @@ -65,6 +65,9 @@ type updown struct { } func (h *LegacyMetrics) Start() error { + h.lock.Lock() + defer h.lock.Unlock() + 
h.Logger.Debug().Logf("Starting LegacyMetrics") defer func() { h.Logger.Debug().Logf("Finished starting LegacyMetrics") }() mc := h.Config.GetLegacyMetricsConfig() @@ -89,7 +92,7 @@ func (h *LegacyMetrics) Start() error { return nil } -func (h *LegacyMetrics) reloadBuilder() { +func (h *LegacyMetrics) reloadBuilder(cfgHash, ruleHash string) { h.Logger.Debug().Logf("reloading config for honeycomb metrics reporter") mc := h.Config.GetLegacyMetricsConfig() h.libhClient.Close() diff --git a/metrics/multi_metrics.go b/metrics/multi_metrics.go index cbfcfee5fe..d773f3f805 100644 --- a/metrics/multi_metrics.go +++ b/metrics/multi_metrics.go @@ -50,8 +50,9 @@ func (m *MultiMetrics) Start() error { return nil } -// This is not safe for concurrent use! func (m *MultiMetrics) AddChild(met Metrics) { + m.lock.Lock() + defer m.lock.Unlock() m.children = append(m.children, met) } diff --git a/metrics/otel_metrics.go b/metrics/otel_metrics.go index 231138e635..1fc12899ae 100644 --- a/metrics/otel_metrics.go +++ b/metrics/otel_metrics.go @@ -34,7 +34,7 @@ type OTelMetrics struct { counters map[string]metric.Int64Counter gauges map[string]metric.Float64ObservableGauge - histograms map[string]metric.Int64Histogram + histograms map[string]metric.Float64Histogram updowns map[string]metric.Int64UpDownCounter // values keeps a map of all the non-histogram metrics and their current value @@ -47,9 +47,12 @@ type OTelMetrics struct { func (o *OTelMetrics) Start() error { cfg := o.Config.GetOTelMetricsConfig() + o.lock.Lock() + defer o.lock.Unlock() + o.counters = make(map[string]metric.Int64Counter) o.gauges = make(map[string]metric.Float64ObservableGauge) - o.histograms = make(map[string]metric.Int64Histogram) + o.histograms = make(map[string]metric.Float64Histogram) o.updowns = make(map[string]metric.Int64UpDownCounter) o.values = make(map[string]float64) @@ -179,6 +182,9 @@ func (o *OTelMetrics) Start() error { } func (o *OTelMetrics) Register(name string, metricType string) { + o.lock.Lock() + defer o.lock.Unlock() + switch metricType { case "counter": ctr, err := o.meter.Int64Counter(name) @@ -189,9 +195,15 @@ func (o *OTelMetrics) Register(name string, metricType string) { o.counters[name] = ctr case "gauge": var f metric.Float64Callback = func(_ context.Context, result metric.Float64Observer) error { + // this callback is invoked from outside this function call, so we + // need to Rlock when we read the values map. We don't know how long + // Observe() takes, so we make a copy of the value and unlock before + // calling Observe. 
o.lock.RLock() - defer o.lock.RUnlock() - result.Observe(o.values[name]) + v := o.values[name] + o.lock.RUnlock() + + result.Observe(v) return nil } g, err := o.meter.Float64ObservableGauge(name, @@ -203,7 +215,7 @@ } o.gauges[name] = g case "histogram": - h, err := o.meter.Int64Histogram(name) + h, err := o.meter.Float64Histogram(name) if err != nil { o.Logger.Error().WithString("msg", "failed to create histogram").WithString("name", name) return @@ -256,7 +268,7 @@ func (o *OTelMetrics) Histogram(name string, val interface{}) { if h, ok := o.histograms[name]; ok { f := ConvertNumeric(val) - h.Record(context.Background(), int64(f)) + h.Record(context.Background(), f) o.values[name] += f } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 578e06418f..8b54a11d62 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -30,6 +30,9 @@ func (p *PromMetrics) Start() error { defer func() { p.Logger.Debug().Logf("Finished starting PromMetrics") }() pc := p.Config.GetPrometheusMetricsConfig() + p.lock.Lock() + defer p.lock.Unlock() + p.metrics = make(map[string]interface{}) p.values = make(map[string]float64) diff --git a/pubsub/pubsub.go b/pubsub/pubsub.go new file mode 100644 index 0000000000..cbf4d0c311 --- /dev/null +++ b/pubsub/pubsub.go @@ -0,0 +1,46 @@ +package pubsub + +import ( + "context" + + "github.com/facebookgo/startstop" +) + +// general usage: +// ps := pubsub.NewXXXPubSub() +// ps.Start() +// defer ps.Stop() +// ctx := context.Background() +// sub := ps.Subscribe(ctx, "topic", func(ctx context.Context, msg string) { +// fmt.Println(msg) +// }) +// ps.Publish(ctx, "topic", "message") +// sub.Close() // optional +// ps.Close() + +type PubSub interface { + // Publish sends a message to all subscribers of the specified topic. + Publish(ctx context.Context, topic, message string) error + // Subscribe returns a Subscription to the specified topic. + // The callback will be called for each message published to the topic. + // There is no unsubscribe method; close the subscription to stop receiving messages. + // The subscription only exists to provide a way to stop receiving messages; if you don't need to stop, + // you can ignore the return value. + Subscribe(ctx context.Context, topic string, callback SubscriptionCallback) Subscription + // Close shuts down all topics and the pubsub connection. + Close() + + // we want to embed startstop.Starter and startstop.Stopper so that we + // can participate in injection + startstop.Starter + startstop.Stopper +} + +// SubscriptionCallback is the function signature for a subscription callback. +type SubscriptionCallback func(context.Context, string) + +type Subscription interface { + // Close stops the subscription which means the callback will no longer be called. + // Optional; the topic will be closed when the pubsub connection is closed.
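+ // Closing a subscription more than once is safe; both implementations in this package tolerate repeated calls.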
+ Close() +} diff --git a/pubsub/pubsub_goredis.go b/pubsub/pubsub_goredis.go new file mode 100644 index 0000000000..33a3a471e0 --- /dev/null +++ b/pubsub/pubsub_goredis.go @@ -0,0 +1,188 @@ +package pubsub + +import ( + "context" + "crypto/tls" + "sync" + + "go.opentelemetry.io/otel/trace" + + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/otelutil" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/redis/go-redis/v9" +) + +// Notes for the future: we implemented a Redis-based PubSub system using 3 +// different libraries: go-redis, redigo, and rueidis. All three implementations +// perform similarly, but go-redis is definitely the easiest to use for PubSub. +// The rueidis library is probably the fastest for high-performance Redis use +// when you want Redis to be a database or cache, and it has some nice features +// like automatic pipelining, but it's pretty low-level and the documentation is +// poor. Redigo is feeling pretty old at this point. + +// GoRedisPubSub is a PubSub implementation that uses Redis as the message broker +// and the go-redis library to interact with Redis. +type GoRedisPubSub struct { + Config config.Config `inject:""` + Logger logger.Logger `inject:""` + Metrics metrics.Metrics `inject:"metrics"` + Tracer trace.Tracer `inject:"tracer"` + client redis.UniversalClient + subs []*GoRedisSubscription + mut sync.RWMutex +} + +// Ensure that GoRedisPubSub implements PubSub +var _ PubSub = (*GoRedisPubSub)(nil) + +type GoRedisSubscription struct { + topic string + pubsub *redis.PubSub + cb SubscriptionCallback + done chan struct{} + once sync.Once +} + +// Ensure that GoRedisSubscription implements Subscription +var _ Subscription = (*GoRedisSubscription)(nil) + +func (ps *GoRedisPubSub) Start() error { + options := new(redis.UniversalOptions) + var ( + authcode string + clusterModeEnabled bool + ) + + if ps.Config != nil { + redisCfg := ps.Config.GetRedisPeerManagement() + hosts := []string{redisCfg.Host} + // if we have a cluster host, use that instead of the regular host + if len(redisCfg.ClusterHosts) > 0 { + ps.Logger.Info().Logf("ClusterHosts was specified, setting up Redis Cluster") + hosts = redisCfg.ClusterHosts + clusterModeEnabled = true + } + + authcode = redisCfg.AuthCode + + options.Addrs = hosts + options.Username = redisCfg.Username + options.Password = redisCfg.Password + options.DB = redisCfg.Database + + if redisCfg.UseTLS { + ps.Logger.Info().WithField("TLSInsecure", redisCfg.UseTLSInsecure).Logf("Using TLS with Redis") + options.TLSConfig = &tls.Config{ + MinVersion: tls.VersionTLS12, + InsecureSkipVerify: redisCfg.UseTLSInsecure, + } + } + } + + var client redis.UniversalClient + if clusterModeEnabled { + ps.Logger.Info().WithField("hosts", options.Addrs).Logf("Using Redis Cluster Client") + client = redis.NewClusterClient(options.Cluster()) + } else { + ps.Logger.Info().WithField("hosts", options.Addrs).Logf("Using Redis Universal client") + client = redis.NewUniversalClient(options) + } + + // if an authcode was provided, use it to authenticate the connection + if authcode != "" { + ps.Logger.Info().Logf("Using Redis AuthCode to authenticate connection") + pipe := client.Pipeline() + pipe.Auth(context.Background(), authcode) + if _, err := pipe.Exec(context.Background()); err != nil { + return err + } + } + + ps.Metrics.Register("redis_pubsub_published", "counter") + ps.Metrics.Register("redis_pubsub_received", 
"counter") + + ps.client = client + ps.subs = make([]*GoRedisSubscription, 0) + return nil +} + +func (ps *GoRedisPubSub) Stop() error { + ps.Close() + return nil +} + +func (ps *GoRedisPubSub) Close() { + ps.mut.Lock() + for _, sub := range ps.subs { + sub.Close() + } + ps.subs = nil + ps.mut.Unlock() + ps.client.Close() +} + +func (ps *GoRedisPubSub) Publish(ctx context.Context, topic, message string) error { + ctx, span := otelutil.StartSpanMulti(ctx, ps.Tracer, "GoRedisPubSub.Publish", map[string]interface{}{ + "topic": topic, + "message": message, + }) + + defer span.End() + + ps.Metrics.Count("redis_pubsub_published", 1) + return ps.client.Publish(ctx, topic, message).Err() +} + +// Subscribe creates a new Subscription to the given topic, and calls the provided callback +// whenever a message is received on that topic. +// Note that the same topic is Subscribed to multiple times, this will incur a separate +// connection to Redis for each Subscription. +func (ps *GoRedisPubSub) Subscribe(ctx context.Context, topic string, callback SubscriptionCallback) Subscription { + ctx, span := otelutil.StartSpanWith(ctx, ps.Tracer, "GoRedisPubSub.Subscribe", "topic", topic) + defer span.End() + + sub := &GoRedisSubscription{ + topic: topic, + pubsub: ps.client.Subscribe(ctx, topic), + cb: callback, + done: make(chan struct{}), + } + ps.mut.Lock() + ps.subs = append(ps.subs, sub) + ps.mut.Unlock() + go func() { + receiveRootCtx := context.Background() + redisch := sub.pubsub.Channel() + for { + select { + case <-sub.done: + return + case msg := <-redisch: + if msg == nil { + continue + } + receiveCtx, span := otelutil.StartSpanMulti(receiveRootCtx, ps.Tracer, "GoRedisPubSub.Receive", map[string]interface{}{ + "topic": topic, + "message_queue_size": len(redisch), + "message": msg.Payload, + }) + ps.Metrics.Count("redis_pubsub_received", 1) + + go func(cbCtx context.Context, span trace.Span, payload string) { + defer span.End() + + sub.cb(cbCtx, payload) + }(receiveCtx, span, msg.Payload) + } + } + }() + return sub +} + +func (s *GoRedisSubscription) Close() { + s.once.Do(func() { + close(s.done) + }) +} diff --git a/pubsub/pubsub_local.go b/pubsub/pubsub_local.go new file mode 100644 index 0000000000..ec9012aabf --- /dev/null +++ b/pubsub/pubsub_local.go @@ -0,0 +1,109 @@ +package pubsub + +import ( + "context" + "sync" + + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/metrics" +) + +// LocalPubSub is a PubSub implementation that uses local channels to send messages; it does +// not communicate with any external processes. 
+// subs are individual channels for each subscription +type LocalPubSub struct { + Config config.Config `inject:""` + Metrics metrics.Metrics `inject:"metrics"` + topics map[string][]*LocalSubscription + mut sync.RWMutex +} + +// Ensure that LocalPubSub implements PubSub +var _ PubSub = (*LocalPubSub)(nil) + +type LocalSubscription struct { + ps *LocalPubSub + topic string + cb SubscriptionCallback + mut sync.RWMutex +} + +// Ensure that LocalSubscription implements Subscription +var _ Subscription = (*LocalSubscription)(nil) + +// Start initializes the LocalPubSub +func (ps *LocalPubSub) Start() error { + ps.topics = make(map[string][]*LocalSubscription) + if ps.Metrics == nil { + ps.Metrics = &metrics.NullMetrics{} + } + ps.Metrics.Register("local_pubsub_published", "counter") + ps.Metrics.Register("local_pubsub_received", "counter") + return nil +} + +// Stop shuts down the LocalPubSub +func (ps *LocalPubSub) Stop() error { + ps.Close() + return nil +} + +func (ps *LocalPubSub) Close() { + ps.mut.Lock() + defer ps.mut.Unlock() + for _, subs := range ps.topics { + for i := range subs { + subs[i].cb = nil + } + } + ps.topics = make(map[string][]*LocalSubscription, 0) +} + +func (ps *LocalPubSub) ensureTopic(topic string) { + if _, ok := ps.topics[topic]; !ok { + ps.topics[topic] = make([]*LocalSubscription, 0) + } +} + +func (ps *LocalPubSub) Publish(ctx context.Context, topic, message string) error { + ps.mut.Lock() + ps.ensureTopic(topic) + ps.Metrics.Count("local_pubsub_published", 1) + ps.Metrics.Count("local_pubsub_received", len(ps.topics[topic])) + // make a copy of our subs so we don't hold the lock while calling them + subs := make([]*LocalSubscription, 0, len(ps.topics[topic])) + subs = append(subs, ps.topics[topic]...) + ps.mut.Unlock() + for _, sub := range subs { + // don't wait around for slow consumers + if sub.cb != nil { + go sub.cb(ctx, message) + } + } + return nil +} + +func (ps *LocalPubSub) Subscribe(ctx context.Context, topic string, callback SubscriptionCallback) Subscription { + ps.mut.Lock() + ps.ensureTopic(topic) + sub := &LocalSubscription{ps: ps, topic: topic, cb: callback} + ps.topics[topic] = append(ps.topics[topic], sub) + ps.mut.Unlock() + return sub +} + +func (s *LocalSubscription) Close() { + s.ps.mut.RLock() + // snapshot this topic's subscribers so we don't hold the lock while searching for ourselves + subs := make([]*LocalSubscription, 0, len(s.ps.topics[s.topic])) + subs = append(subs, s.ps.topics[s.topic]...)
+ s.ps.mut.RUnlock() + for _, sub := range subs { + if sub == s { + sub.mut.Lock() + sub.cb = nil + sub.mut.Unlock() + return + } + } +} diff --git a/pubsub/pubsub_test.go b/pubsub/pubsub_test.go new file mode 100644 index 0000000000..4ad630ff96 --- /dev/null +++ b/pubsub/pubsub_test.go @@ -0,0 +1,266 @@ +package pubsub_test + +import ( + "context" + "fmt" + "strconv" + "sync" + "testing" + "time" + + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/pubsub" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" +) + +var types = []string{ + "goredis", + "local", +} + +func newPubSub(typ string) pubsub.PubSub { + var ps pubsub.PubSub + m := &metrics.NullMetrics{} + m.Start() + tracer := noop.NewTracerProvider().Tracer("test") + switch typ { + case "goredis": + ps = &pubsub.GoRedisPubSub{ + Metrics: m, + Tracer: tracer, + Logger: &logger.NullLogger{}, + } + case "local": + ps = &pubsub.LocalPubSub{ + Metrics: m, + } + default: + panic("unknown pubsub type") + } + ps.Start() + return ps +} + +type pubsubListener struct { + lock sync.Mutex + msgs []string +} + +func (l *pubsubListener) Listen(ctx context.Context, msg string) { + l.lock.Lock() + defer l.lock.Unlock() + l.msgs = append(l.msgs, msg) +} + +func (l *pubsubListener) Messages() []string { + l.lock.Lock() + defer l.lock.Unlock() + return l.msgs +} + +func TestPubSubBasics(t *testing.T) { + ctx := context.Background() + for _, typ := range types { + t.Run(typ, func(t *testing.T) { + ps := newPubSub(typ) + + l1 := &pubsubListener{} + ps.Subscribe(ctx, "topic", l1.Listen) + + wg := sync.WaitGroup{} + wg.Add(1) + go func() { + time.Sleep(100 * time.Millisecond) + for i := 0; i < 10; i++ { + err := ps.Publish(ctx, "topic", fmt.Sprintf("message %d", i)) + assert.NoError(t, err) + } + time.Sleep(100 * time.Millisecond) + wg.Done() + }() + wg.Wait() + ps.Close() + require.Len(t, l1.Messages(), 10) + }) + } +} + +func TestPubSubMultiSubscriber(t *testing.T) { + const messageCount = 10 + ctx := context.Background() + for _, typ := range types { + t.Run(typ, func(t *testing.T) { + ps := newPubSub(typ) + l1 := &pubsubListener{} + l2 := &pubsubListener{} + ps.Subscribe(ctx, "topic", l1.Listen) + ps.Subscribe(ctx, "topic", l2.Listen) + + wg := &sync.WaitGroup{} + wg.Add(1) + go func() { + time.Sleep(100 * time.Millisecond) + for i := 0; i < messageCount; i++ { + err := ps.Publish(ctx, "topic", fmt.Sprintf("message %d", i)) + require.NoError(t, err) + } + time.Sleep(100 * time.Millisecond) + wg.Done() + }() + wg.Wait() + ps.Close() + require.Len(t, l1.Messages(), messageCount) + require.Len(t, l2.Messages(), messageCount) + }) + } +} + +func TestPubSubMultiTopic(t *testing.T) { + const topicCount = 3 + const messageCount = 10 + const expectedTotal = 55 // sum of [1..messageCount] + ctx := context.Background() + for _, typ := range types { + t.Run(typ, func(t *testing.T) { + ps := newPubSub(typ) + time.Sleep(500 * time.Millisecond) + topics := make([]string, topicCount) + listeners := make([]*pubsubListener, topicCount) + for i := 0; i < topicCount; i++ { + topics[i] = fmt.Sprintf("topic%d", i) + listeners[i] = &pubsubListener{} + } + totals := make([]int, topicCount) + subs := make([]pubsub.Subscription, topicCount) + for ix := 0; ix < topicCount; ix++ { + subs[ix] = ps.Subscribe(ctx, topics[ix], listeners[ix].Listen) + } + + wg := sync.WaitGroup{} + wg.Add(1) + go func() { + 
time.Sleep(100 * time.Millisecond) + for j := 0; j < topicCount; j++ { + for i := 0; i < messageCount; i++ { + // we want a different sum for each topic + err := ps.Publish(ctx, topics[j], fmt.Sprintf("%d", (i+1)*(j+1))) + require.NoError(t, err) + } + } + time.Sleep(500 * time.Millisecond) + ps.Close() + wg.Done() + }() + wg.Wait() + for ix := 0; ix < topicCount; ix++ { + assert.Len(t, listeners[ix].Messages(), messageCount, "topic %d", ix) + for _, msg := range listeners[ix].Messages() { + n, _ := strconv.Atoi(msg) + totals[ix] += n + } + } + + // validate that all the topics each add up to the desired total + for i := 0; i < topicCount; i++ { + require.Equal(t, expectedTotal*(i+1), totals[i]) + } + }) + } +} + +func TestPubSubLatency(t *testing.T) { + const messageCount = 1000 + ctx := context.Background() + for _, typ := range types { + t.Run(typ, func(t *testing.T) { + ps := newPubSub(typ) + var count, total, tmin, tmax int64 + mut := sync.Mutex{} + + wg := sync.WaitGroup{} + wg.Add(2) + go func() { + time.Sleep(300 * time.Millisecond) + for i := 0; i < messageCount; i++ { + err := ps.Publish(ctx, "topic", fmt.Sprintf("%d", time.Now().UnixNano())) + require.NoError(t, err) + } + + // now wait for all messages to arrive + require.Eventually(t, func() bool { + mut.Lock() + done := count == messageCount + mut.Unlock() + return done + }, 5*time.Second, 100*time.Millisecond) + + ps.Close() + wg.Done() + }() + + ps.Subscribe(ctx, "topic", func(ctx context.Context, msg string) { + sent, err := strconv.Atoi(msg) + require.NoError(t, err) + rcvd := time.Now().UnixNano() + latency := rcvd - int64(sent) + require.True(t, latency >= 0) + mut.Lock() + total += latency + if tmin == 0 || latency < tmin { + tmin = latency + } + if latency > tmax { + tmax = latency + } + count++ + mut.Unlock() + }) + wg.Done() + + wg.Wait() + require.Equal(t, int64(messageCount), count) + require.True(t, total > 0) + average := total / int64(count) + t.Logf("average: %d ns, min: %d ns, max: %d ns", average, tmin, tmax) + // in general, we want low latency, so we put some ballpark numbers here + // to make sure we're not doing something crazy + require.Less(t, average, int64(100*time.Millisecond)) + require.Less(t, tmax, int64(500*time.Millisecond)) + }) + } +} + +func BenchmarkPubSub(b *testing.B) { + ctx := context.Background() + for _, typ := range types { + b.Run(typ, func(b *testing.B) { + ps := newPubSub(typ) + time.Sleep(100 * time.Millisecond) + + li := &pubsubListener{} + ps.Subscribe(ctx, "topic", li.Listen) + + wg := sync.WaitGroup{} + wg.Add(1) + b.ResetTimer() + go func() { + time.Sleep(100 * time.Millisecond) + for i := 0; i < b.N; i++ { + err := ps.Publish(ctx, "topic", fmt.Sprintf("message %d", i)) + require.NoError(b, err) + } + require.EventuallyWithT(b, func(collect *assert.CollectT) { + assert.Len(collect, li.Messages(), b.N) + }, 5*time.Second, 10*time.Millisecond) + ps.Close() + wg.Done() + }() + + wg.Wait() + require.Len(b, li.Messages(), b.N) + }) + } +} diff --git a/refinery_config.md b/refinery_config.md index 55ee65e95f..a209b471fb 100644 --- a/refinery_config.md +++ b/refinery_config.md @@ -66,7 +66,8 @@ When Refinery receives telemetry using an API key associated with a Honeycomb Cl Refinery will attempt to read its configuration and check for changes at approximately this interval. This time is varied by a random amount up to 10% to avoid all instances refreshing together. 
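To illustrate the jitter described above, here is a small sketch (not Refinery's actual implementation; the interval value is a placeholder):

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	interval := 15 * time.Second // placeholder for ConfigReloadInterval
	// vary the next reload by a random amount up to 10% of the interval
	jitter := time.Duration(rand.Int63n(int64(interval) / 10))
	fmt.Println("next config check in", interval+jitter)
}
```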
In installations where configuration changes are handled by restarting Refinery, which is often the case when using Kubernetes, disable this feature with a value of `0s`. -If the config file is being loaded from a URL, it may be wise to increase this value to avoid overloading the file server. +As of Refinery v2.7, news of a configuration change is immediately propagated to all peers, and they will attempt to reload their configurations. +Note that external factors (for example, Kubernetes ConfigMaps) may cause delays in propagating configuration changes. - Not eligible for live reload. - Type: `duration` @@ -146,10 +147,41 @@ This list only applies to span traffic - other Honeycomb API actions will be pro If `true`, then only traffic using the keys listed in `ReceiveKeys` is accepted. Events arriving with API keys not in the `ReceiveKeys` list will be rejected with an HTTP `401` error. If `false`, then all traffic is accepted and `ReceiveKeys` is ignored. +This setting is applied **before** the `SendKey` and `SendKeyMode` settings. - Eligible for live reload. - Type: `bool` +### `SendKey` + +`SendKey` is an optional Honeycomb API key that Refinery can use to send data to Honeycomb, depending on configuration. + +If `SendKey` is set to a valid Honeycomb key, then Refinery can use that key to send data. +The exact behavior depends on the value of `SendKeyMode`. + +- Eligible for live reload. +- Type: `string` +- Example: `SetThisToAHoneycombKey` + +### `SendKeyMode` + +`SendKeyMode` controls how `SendKey` is used to replace or augment API keys used in incoming telemetry. + +If `AcceptOnlyListedKeys` is `true`, then `SendKey` will only be used for events with keys listed in `ReceiveKeys`. +`none` uses the incoming key for all telemetry (default). +`all` overwrites all keys, even missing ones, with `SendKey`. +`nonblank` overwrites all supplied keys but will not inject `SendKey` if the incoming key is blank. +`listedonly` overwrites only the keys listed in `ReceiveKeys`. +`unlisted` uses the `SendKey` for all events *except* those with keys listed in `ReceiveKeys`, which use their original keys. +`missingonly` uses `SendKey` only to inject keys into events with blank keys. +All other events use their original keys. +A configuration sketch showing these settings together appears at the end of this document. + +- Eligible for live reload. +- Type: `string` +- Default: `none` +- Options: `none`, `all`, `nonblank`, `listedonly`, `unlisted`, `missingonly` + ## Refinery Telemetry `RefineryTelemetry` contains configuration information for the telemetry that Refinery uses to record its own operation. @@ -161,8 +193,8 @@ If `false`, then all traffic is accepted and `ReceiveKeys` is ignored. When enabled, this setting causes traces that are sent to Honeycomb to include the field `meta.refinery.reason`. This field contains text indicating which rule was evaluated that caused the trace to be included. This setting also includes the field `meta.refinery.send_reason`, which contains the reason that the trace was sent. -Possible values of this field are `trace_send_got_root`, which means that the root span arrived; `trace_send_expired`, which means that TraceTimeout was reached; `trace_send_ejected_full`, which means that the trace cache was full; and `trace_send_ejected_memsize`, which means that refinery was out of memory. -These names are also the names of metrics that refinery tracks.
+Possible values of this field are `trace_send_got_root`, which means that the root span arrived; `trace_send_expired`, which means that `TraceTimeout` was reached; `trace_send_ejected_full`, which means that the trace cache was full; and `trace_send_ejected_memsize`, which means that Refinery was out of memory. +These names are also the names of metrics that Refinery tracks. We recommend enabling this setting whenever a rules-based sampler is in use, as it is useful for debugging and understanding the behavior of your Refinery installation. - Eligible for live reload. @@ -178,7 +210,7 @@ This value is available to the rules-based sampler, making it possible to write If `true` and `AddCountsToRoot` is set to false, then Refinery will add `meta.span_count` to the root span. - Eligible for live reload. -- Type: `bool` +- Type: `defaulttrue` - Default: `true` ### `AddCountsToRoot` @@ -201,7 +233,7 @@ If `true`, then Refinery will ignore the `AddSpanCountToRoot` setting and add th If `true`, then Refinery will add the following tag to all traces: - `meta.refinery.local_hostname`: the hostname of the Refinery node - Eligible for live reload. -- Type: `bool` +- Type: `defaulttrue` - Default: `true` ## Traces @@ -210,12 +242,12 @@ If `true`, then Refinery will add the following tag to all traces: - `meta.refin ### `SendDelay` -`SendDelay` is the duration to wait before sending a trace. +`SendDelay` is the duration to wait after the root span arrives before sending a trace. -This setting is a short timer that is triggered when a trace is complete. +This setting is a short timer that is triggered when a trace is marked complete by the arrival of the root span. Refinery waits for this duration before sending the trace. -The reason for this setting is to allow for small network delays or clock jitters to elapse and any final spans to arrive before sending the trace. -Set to "0" for immediate sending. +This setting exists to allow for asynchronous spans and small network delays to elapse before sending the trace. +`SendDelay` is not applied if the `TraceTimeout` expires or the `SpanLimit` is reached. - Eligible for live reload. - Type: `duration` @@ -237,7 +269,11 @@ By default, this setting uses the `DefaultBatchTimeout` in `libhoney` as its val A long timer; it represents the outside boundary of how long to wait before making the trace decision about an incomplete trace. Normally trace decisions (send or drop) are made when the root span arrives. -Sometimes the root span never arrives (for example, due to crashes) and this timer ensures sending a trace even without having received the root span. +Sometimes the root span never arrives (for example, due to crashes). +Once this timer fires, Refinery will make a trace decision based on the spans that have arrived so far. +This ensures sending a trace even when the root span never arrives. +After the trace decision has been made, Refinery retains a record of that decision for a period of time. +When additional spans (including the root span) arrive, they will be kept or dropped based on the original decision. If particularly long-lived traces are present in your data, then you should increase this timer. Note that this increase will also increase the memory requirements for Refinery. @@ -245,6 +281,17 @@ Note that this increase will also increase the memory requirements for Refinery. - Type: `duration` - Default: `60s` +### `SpanLimit` + +`SpanLimit` is the number of spans after which a trace becomes eligible for a trace decision. 
+ +This setting helps to keep memory usage under control. +If a trace has more than this set number of spans, then it becomes eligible for a trace decision. +It's most helpful in a situation where a sudden burst of many spans in a large trace hits Refinery all at once, causing memory usage to spike and possibly crashing Refinery. + +- Eligible for live reload. +- Type: `int` + ### `MaxBatchSize` `MaxBatchSize` is the maximum number of events to be included in each batch for sending. @@ -286,7 +333,7 @@ If this value is not specified, then the debug service runs on the first open po ### `QueryAuthToken` -`QueryAuthToken` is the token that must be specified to access the `/query` endpoint. +`QueryAuthToken` is the token that must be specified to access the `/query` endpoint. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. This token must be specified with the header "X-Honeycomb-Refinery-Query" in order for a `/query` request to succeed. These `/query` requests are intended for debugging Refinery during setup and are not typically needed in normal operation. @@ -371,7 +418,7 @@ Refinery's internal logs will be sent to this host using the standard Honeycomb ### `APIKey` -`APIKey` is the API key used to send Refinery's logs to Honeycomb. +`APIKey` is the API key used to send Refinery's logs to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery logs. @@ -398,7 +445,7 @@ The sample rate is controlled by the `SamplerThroughput` setting. The sampler used throttles the rate of logs sent to Honeycomb from any given source within Refinery -- it should effectively limit the rate of redundant messages. - Not eligible for live reload. -- Type: `bool` +- Type: `defaulttrue` - Default: `true` ### `SamplerThroughput` @@ -502,7 +549,7 @@ Refinery's internal metrics will be sent to this host using the standard Honeyco ### `APIKey` -`APIKey` is the API key used by Refinery to send its metrics to Honeycomb. +`APIKey` is the API key used by Refinery to send its metrics to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery metrics. @@ -559,7 +606,7 @@ Refinery's internal metrics will be sent to the `/v1/metrics` endpoint on this h ### `APIKey` -`APIKey` is the API key used to send Honeycomb metrics via OpenTelemetry. +`APIKey` is the API key used to send Honeycomb metrics via OpenTelemetry. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. It is recommended that you create a separate team and key for Refinery metrics. If this is blank, then Refinery will not set the Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must be set to a valid OpenTelemetry endpoint. @@ -601,6 +648,62 @@ In rare circumstances, compression costs may outweigh the benefits, in which cas - Default: `gzip` - Options: `none`, `gzip` +## OpenTelemetry Tracing + +`OTelTracing` contains configuration for Refinery's own tracing. + +### `Enabled` + +`Enabled` controls whether to send Refinery's own OpenTelemetry traces. + +The setting specifies if Refinery sends traces. + +- Not eligible for live reload. 
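For illustration, a sketch of `SpanLimit` in context (the value is hypothetical; the setting has no default):

```yaml
Traces:
  # A trace accumulating more than this many spans becomes eligible
  # for an immediate trace decision; SendDelay is not applied.
  SpanLimit: 500
```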
+- Type: `bool` + +### `APIHost` + +`APIHost` is the URL of the OpenTelemetry API to which traces will be sent. + +Refinery's internal traces will be sent to the `/v1/traces` endpoint on this host. + +- Not eligible for live reload. +- Type: `url` +- Default: `https://api.honeycomb.io` + +### `APIKey` + +`APIKey` is the API key used to send Refinery's traces to Honeycomb. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. + +It is recommended that you create a separate team and key for Refinery telemetry. +If this value is blank, then Refinery will not set the Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must be set to a valid OpenTelemetry endpoint. + +- Not eligible for live reload. +- Type: `string` +- Example: `SetThisToAHoneycombKey` +- Environment variable: `REFINERY_HONEYCOMB_TRACES_API_KEY, REFINERY_HONEYCOMB_API_KEY` + +### `Dataset` + +`Dataset` is the Honeycomb dataset to which Refinery sends its own OpenTelemetry traces. + +Only used if `APIKey` is specified. + +- Not eligible for live reload. +- Type: `string` +- Default: `Refinery Traces` + +### `SampleRate` + +`SampleRate` is the rate at which Refinery samples its own traces. + +This is the Honeycomb sample rate used to sample traces sent by Refinery. +Since each incoming span generates multiple outgoing spans, a minimum sample rate of `100` is strongly advised. + +- Eligible for live reload. +- Type: `int` +- Default: `100` + ## Peer Management `PeerManagement` controls how the Refinery cluster communicates between peers. @@ -611,7 +714,10 @@ In rare circumstances, compression costs may outweigh the benefits, in which cas Peer management is the mechanism by which Refinery locates its peers. `file` means that Refinery gets its peer list from the Peers list in this config file. -`redis` means that Refinery self-registers with a Redis instance and gets its peer list from there. +It also prevents Refinery from using a publish/subscribe mechanism to propagate peer lists, stress levels, and configuration changes. +`redis` means that Refinery uses a Publish/Subscribe mechanism, implemented on Redis, to propagate peer lists, stress levels, and notification of configuration changes much more quickly than the legacy mechanism. +The recommended setting is `redis`, especially for new installations. +If `redis` is specified, fields in `RedisPeerManagement` must also be set. - Not eligible for live reload. - Type: `string` @@ -657,16 +763,16 @@ If this value is specified, then Refinery will use the first IPV6 unicast addres `Peers` is the list of peers to use when Type is "file", excluding self. This list is ignored when Type is "redis". -The format is a list of strings of the form "host:port". +The format is a list of strings of the form "scheme://host:port". - Not eligible for live reload. - Type: `stringarray` -- Example: `192.168.1.11:8081,192.168.1.12:8081` +- Example: `http://192.168.1.11:8081,http://192.168.1.12:8081` ## Redis Peer Management `RedisPeerManagement` controls how the Refinery cluster communicates between peers when using Redis. -Only applies when `PeerManagement.Type` is "redis". +Does not apply when `PeerManagement.Type` is "file". ### `Host` `Host` is the host and port of the Redis instance to use for peer cluster membership management. @@ -679,9 +785,21 @@ Must be in the form `host:port`.
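Pulling the `OTelTracing` section above together, a minimal sketch of a complete block (the API key is a placeholder; prefer the environment variables listed above for real credentials):

```yaml
OTelTracing:
  Enabled: true
  APIHost: https://api.honeycomb.io
  # Prefer REFINERY_HONEYCOMB_TRACES_API_KEY over a literal key here.
  APIKey: SetThisToAHoneycombKey
  Dataset: Refinery Traces
  # Each incoming span fans out into several internal spans,
  # so a high sample rate is advised.
  SampleRate: 100
```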
- Example: `localhost:6379` - Environment variable: `REFINERY_REDIS_HOST` +### `ClusterHosts` + +`ClusterHosts` is a list of host and port pairs for the instances in a Redis Cluster, and used for managing peer cluster membership. + +This configuration enables Refinery to connect to a Redis deployment setup in Cluster Mode. +Each entry in the list should follow the format `host:port`. +If `ClusterHosts` is specified, the `Host` setting will be ignored. + +- Not eligible for live reload. +- Type: `stringarray` +- Example: `- localhost:6379` + ### `Username` -`Username` is the username used to connect to Redis for peer cluster membership management. +`Username` is the username used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -691,7 +809,7 @@ Many Redis installations do not use this field. ### `Password` -`Password` is the password used to connect to Redis for peer cluster membership management. +`Password` is the password used to connect to Redis for peer cluster membership management. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -701,7 +819,7 @@ Many Redis installations do not use this field. ### `AuthCode` -`AuthCode` is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. +`AuthCode` is the string used to connect to Redis for peer cluster membership management using an explicit AUTH command. Setting this value via a command line flag may expose credentials - it is recommended to use the environment variable or a configuration file. Many Redis installations do not use this field. @@ -709,29 +827,6 @@ Many Redis installations do not use this field. - Type: `string` - Environment variable: `REFINERY_REDIS_AUTH_CODE` -### `Prefix` - -`Prefix` is a string used as a prefix for the keys in Redis while storing the peer membership. - -It might be useful to override this in any situation where multiple Refinery clusters or multiple applications want to share a single Redis instance. -It may not be blank. - -- Not eligible for live reload. -- Type: `string` -- Default: `refinery` -- Example: `customPrefix` - -### `Database` - -`Database` is the database number to use for the Redis instance storing the peer membership. - -An integer from 0-15 indicating the database number to use for the Redis instance storing the peer membership. -It might be useful to set this in any situation where multiple Refinery clusters or multiple applications want to share a single Redis instance. - -- Not eligible for live reload. -- Type: `int` -- Example: `1` - ### `UseTLS` `UseTLS` enables TLS when connecting to Redis for peer cluster membership management. @@ -770,8 +865,10 @@ This is not recommended for production use since a burst of traffic could cause `CacheCapacity` is the number of traces to keep in the cache's circular buffer. -The collection cache is used to collect all spans into a trace as well as remember the sampling decision for any spans that might come in after the trace has been marked "complete" (either by timing out or seeing the root span). 
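To show the two Redis connection styles from the section above side by side, a sketch with hypothetical hostnames (per the text above, `Host` is ignored when `ClusterHosts` is set):

```yaml
PeerManagement:
  Type: redis
RedisPeerManagement:
  # Single-instance Redis:
  Host: localhost:6379
  # Or, for Redis running in Cluster Mode, list the nodes instead:
  # ClusterHosts:
  #   - redis-0.redis:6379
  #   - redis-1.redis:6379
  UseTLS: true
  # Supply Username/Password/AuthCode via their environment variables
  # rather than writing them into this file.
```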
-The number of traces in the cache should be many multiples (100x to 1000x) of the total number of concurrently active traces (trace throughput * trace duration). +The collection cache is used to collect all active spans into traces. +It is organized as a circular buffer. +When the buffer wraps around, Refinery will try a few times to find an empty slot; if it fails, it starts ejecting traces from the cache earlier than would otherwise be necessary. +Ideally, the size of the cache should be many multiples (100x to 1000x) of the total number of concurrently active traces (average trace throughput * average trace duration). - Eligible for live reload. - Type: `int` @@ -847,6 +944,28 @@ If set, `Collections.AvailableMemory` must not be defined. - Eligible for live reload. - Type: `memorysize` +### `DisableRedistribution` + +`DisableRedistribution` controls whether to transmit traces in cache to remaining peers during a cluster scaling event. + +If `true`, Refinery will NOT forward live traces in its cache to the rest of the peers when peers join or leave the cluster. +Disabling this behavior can help prevent disruptive bursts of network traffic when large traces with long `TraceTimeout` are redistributed. + +- Eligible for live reload. +- Type: `bool` + +### `ShutdownDelay` + +`ShutdownDelay` controls the maximum time Refinery can use while draining traces at shutdown. + +This setting controls the duration that Refinery expects to have to drain in-process traces before shutting down an instance. +When asked to shut down gracefully, Refinery stops accepting new spans immediately and drains the remaining traces by sending them to remaining peers. +This value should be set to a bit less than the normal timeout period for shutting down without forcibly terminating the process. + +- Eligible for live reload. +- Type: `duration` +- Default: `15s` + ## Buffer Sizes `BufferSizes` contains the settings that are relevant to the sizes of communications buffers. @@ -899,7 +1018,7 @@ If it costs money to transmit data between Refinery instances (for example, when The option to disable it is provided as an escape hatch for deployments that value lower CPU utilization over data transfer costs. - Not eligible for live reload. -- Type: `bool` +- Type: `defaulttrue` - Default: `true` ### `AdditionalAttributes` @@ -951,7 +1070,8 @@ A trace without a `parent_id` is assumed to be a root span. If `false`, then the gRPC server is not started and no gRPC traffic is accepted. - Not eligible for live reload. -- Type: `bool` +- Type: `defaulttrue` +- Default: `true` ### `ListenAddr` @@ -1031,7 +1151,7 @@ The size is expressed in bytes. - Not eligible for live reload. - Type: `memorysize` -- Default: `5MB` +- Default: `15MB` ### `MaxRecvMsgSize` @@ -1042,7 +1162,7 @@ The size is expressed in bytes. - Not eligible for live reload. - Type: `memorysize` -- Default: `5MB` +- Default: `15MB` ## Sample Cache @@ -1154,16 +1274,3 @@ This setting helps to prevent oscillations. - Type: `duration` - Default: `10s` -### `MinimumStartupDuration` - -`MinimumStartupDuration` is the minimum time that Stress Relief will stay enabled. - -This setting is used when switching into Monitor mode. -When Stress Relief is enabled, it will start up in stressed mode for at least this set duration of time to try to make sure that Refinery can handle the load before it begins processing it in earnest. -This is to help address the problem of trying to bring a new node into an already-overloaded cluster.
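A sketch tying the cache and scaling settings above together (the capacity value is hypothetical):

```yaml
Collections:
  # Aim for 100x to 1000x the number of concurrently active traces.
  CacheCapacity: 100000
  # Leave redistribution on unless its traffic bursts prove disruptive.
  DisableRedistribution: false
  # Keep this slightly below the orchestrator's grace period
  # (for example, Kubernetes terminationGracePeriodSeconds).
  ShutdownDelay: 15s
```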
-If this duration is `0`, then Refinery will not start in stressed mode, which will provide faster startup at the possible cost of startup instability. - -- Eligible for live reload. -- Type: `duration` -- Default: `3s` - diff --git a/refinery_rules.md b/refinery_rules.md index d2b48142f7..b168a1842d 100644 --- a/refinery_rules.md +++ b/refinery_rules.md @@ -52,6 +52,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. - Type: `int` @@ -71,6 +73,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. - Type: `int` @@ -79,6 +83,7 @@ The sample rate is calculated from the trace ID, so all spans with the same trac The duration after which the Dynamic Sampler should reset its internal counters. It should be specified as a duration string. For example, "30s" or "1m". +Defaults to "30s". - Type: `duration` @@ -110,7 +115,7 @@ Defaults to `500`; Dynamic Samplers will rarely achieve their sampling goals wit ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. - Type: `bool` @@ -134,6 +139,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. - Type: `int` @@ -141,7 +148,8 @@ The sample rate is calculated from the trace ID, so all spans with the same trac The duration after which the EMA Dynamic Sampler should recalculate its internal counters. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `15s`. - Type: `duration` @@ -151,6 +159,7 @@ The weight to use when calculating the EMA. It should be a number between `0` and `1`. Larger values weight the average more toward recent observations. In other words, a larger weight will cause sample rates more quickly adapt to traffic patterns, while a smaller weight will result in sample rates that are less sensitive to bursts or drops in traffic and thus more consistent over time. +The default value is `0.5`. 
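For reference, a minimal rules file using the deterministic sampler described above, assuming the v2 rules layout (`RulesVersion` plus a `Samplers` map keyed by environment):

```yaml
RulesVersion: 2
Samplers:
  __default__:
    DeterministicSampler:
      # Keep roughly 1 in 30 traces; a value of 1 or less keeps everything.
      SampleRate: 30
```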
- Type: `float` @@ -159,7 +168,7 @@ In other words, a larger weight will cause sample rates more quickly adapt to tr Indicates the threshold for removing keys from the EMA. The EMA of any key will approach `0` if it is not repeatedly observed, but will never truly reach it, so this field determines what constitutes "zero". Keys with averages below this threshold will be removed from the EMA. -Default is the same as `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. +Default is the value of `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. This value should generally be less than (<=) `Weight`, unless you have very specific reasons to set it higher. - Type: `float` @@ -208,7 +217,7 @@ Defaults to `500`; Dynamic Samplers will rarely achieve their sampling goals wit ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. - Type: `bool` @@ -237,7 +246,7 @@ This value is calculated for the individual instance, not for the cluster; if yo Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the number of instances in the cluster. -If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`. +If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. - Type: `bool` @@ -245,6 +254,7 @@ If `false`, then the goal throughput will be the value specified in `GoalThrough `InitialSampleRate` is the sample rate to use during startup, before the sampler has accumulated enough data to calculate a reasonable throughput. This is mainly useful in situations where unsampled throughput is high enough to cause problems. +Default value is `10`. - Type: `int` @@ -252,7 +262,8 @@ This is mainly useful in situations where unsampled throughput is high enough to The duration after which the EMA Dynamic Sampler should recalculate its internal counters. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `15s`. - Type: `duration` @@ -262,6 +273,7 @@ The weight to use when calculating the EMA. It should be a number between `0` and `1`. Larger values weight the average more toward recent observations. In other words, a larger weight will cause sample rates more quickly adapt to traffic patterns, while a smaller weight will result in sample rates that are less sensitive to bursts or drops in traffic and thus more consistent over time. +The default value is `0.5`. - Type: `float` @@ -270,7 +282,7 @@ In other words, a larger weight will cause sample rates more quickly adapt to tr Indicates the threshold for removing keys from the EMA. The EMA of any key will approach `0` if it is not repeatedly observed, but will never truly reach it, so this field determines what constitutes "zero". Keys with averages below this threshold will be removed from the EMA. 
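A sketch of an EMA throughput sampler using the settings described above (the environment name, throughput target, and field list are hypothetical):

```yaml
Samplers:
  production:
    EMAThroughputSampler:
      # Target this many events per second; divided across the
      # cluster when UseClusterSize is true.
      GoalThroughputPerSec: 100
      UseClusterSize: true
      # Used at startup until enough data has accumulated.
      InitialSampleRate: 10
      AdjustmentInterval: 15s
      Weight: 0.5
      FieldList:
        - http.route
        - http.status_code
```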
-Default is the same as `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. +Default is the value of `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. This value should generally be less than (<=) `Weight`, unless you have very specific reasons to set it higher. - Type: `float` @@ -319,7 +331,7 @@ Defaults to `500`; Dynamic Samplers will rarely achieve their sampling goals wit ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. - Type: `bool` @@ -340,7 +352,7 @@ The Windowed Throughput Sampler resolves this by introducing two different, tuna recomputing sampling rate. A standard configuration would be to set `UpdateFrequency` to `1s` and `LookbackFrequency` to `30s`. In this configuration, for every second, we lookback at the last 30 seconds of data in order to compute the new sampling rate. -The actual sampling rate computation is nearly identical to the original Throughput Sampler, but this variant has better support for floating point numbers. +The actual sampling rate computation is nearly identical to the original Throughput Sampler, but this variant has better support for floating point numbers and does a better job with less-common keys. ### `GoalThroughputPerSec` @@ -355,7 +367,7 @@ This value is calculated for the individual instance, not for the cluster; if yo Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the number of instances in the cluster. -If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`. +If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. - Type: `bool` @@ -363,7 +375,8 @@ If `false`, then the goal throughput will be the value specified in `GoalThrough The duration between sampling rate computations. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `1s`. - Type: `duration` @@ -403,7 +416,7 @@ Defaults to `500`; Dynamic Samplers will rarely achieve their sampling goals wit ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. - Type: `bool` @@ -424,7 +437,7 @@ Rules-based samplers will usually be configured to have the last rule be a defau ### `CheckNestedFields` Indicates whether to expand nested JSON when evaluating rules. -If false, nested JSON will be treated as a string. +If false (the default), nested JSON will be treated as a string. 
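The "standard configuration" for the Windowed Throughput Sampler described above, expressed as a sketch (the throughput target and field list are hypothetical):

```yaml
Samplers:
  __default__:
    WindowedThroughputSampler:
      GoalThroughputPerSec: 100
      # Recompute the rate every second over the last 30 seconds of data.
      UpdateFrequency: 1s
      LookbackFrequency: 30s
      FieldList:
        - http.route
```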
If `true`, nested JSON will be expanded into a `map[string]interface{}` and the value of the field will be the value of the nested field. For example, if you have a field called `http.request.headers` and you want to check the value of the `User-Agent` header, then you would set this to `true` and use `http.request.headers.User-Agent` as the field name in your rule. This is a computationally expensive option and may cause performance problems if you have a large number of spans with nested JSON. @@ -478,8 +491,8 @@ A no-condition rule is typically used for the last rule to provide a default beh ### `Scope` Controls the scope of the rule evaluation. -If set to "trace" (the default), then each condition can apply to any span in the trace independently. -If set to "span", then all of the conditions in the rule will be evaluated against each span in the trace and the rule only succeeds if all of the conditions match on a single span together. +If set to `trace` (the default), then each condition can apply to any span in the trace independently. +If set to `span`, then all of the conditions in the rule will be evaluated against each span in the trace and the rule only succeeds if all of the conditions match on a single span together. - Type: `string` @@ -492,26 +505,52 @@ If there are no conditions, then the rule will always match. ### `Field` The field to check. -This can be any field in the trace. +This can name any field in the trace. If the field is not present, then the condition will not match. The comparison is case-sensitive. +The field can also include a prefix that changes the span used for evaluation of the field. +The only prefix currently supported is `root`, as in `root.http.status`. +Specifying `root.` causes the condition to be evaluated against the root span. +For example, if the `Field` is `root.url`, then the condition will be processed using the url field from the root span. +The setting `Scope: span` for a rule does not change the meaning of this prefix -- the condition is still evaluated on the root span and is treated as if it were part of the span being processed. +When using the `root.` prefix on a field with a `not-exists` operator, include the `has-root-span: true` condition in the rule. +The `not-exists` condition on a `root.`-prefixed field will evaluate to false if the existence of the root span is not checked and the root span does not exist. +The primary reason a root span is not present on a trace when a sampling decision is being made is when the root span takes longer to complete than the configured TraceTimeout. - Type: `string` +### `Fields` + +An array of field names to check. +These can name any field in the trace. +The fields are checked in the order defined here, and the first named field that contains a value will be used for the condition. +Only the first populated field will be used, even if the condition fails. +If a `root.` prefix is present on a field, but the root span is not on the trace, that field will be skipped. +If none of the fields are present, then the condition will not match. +The comparison is case-sensitive. +All fields are checked as individual fields before any of them are checked as nested fields (see `CheckNestedFields`). + +- Type: `stringarray` + ### `Operator` The comparison operator to use. String comparisons are case-sensitive. +For most cases, use negative operators (`!=`, `does-not-contain`, `not-exists`, and `not-in`) in a rule with a scope of "span". +WARNING: Rules can have `Scope: trace` or `Scope: span`. 
+Using a negative operator with `Scope: trace` will cause the condition to be true if **any** single span in the entire trace matches. +Use `Scope: span` with negative operators. - Type: `string` -- Options: `=`, `!=`, `>`, `<`, `>=`, `<=`, `starts-with`, `contains`, `does-not-contain`, `exists`, `not-exists`, `has-root-span` +- Options: `=`, `!=`, `>`, `<`, `>=`, `<=`, `starts-with`, `contains`, `does-not-contain`, `exists`, `not-exists`, `has-root-span`, `matches`, `in`, `not-in` ### `Value` The value to compare against. If `Datatype` is not specified, then the value and the field will be compared based on the type of the field. +The `in` and `not-in` operators can accept a list of values, which should all be of the same datatype. -- Type: `anyscalar` +- Type: `sliceorscalar` ### `Datatype` @@ -519,6 +558,7 @@ The datatype to use when comparing the value and the field. If `Datatype` is specified, then both values will be converted (best-effort) to that type and then compared. Errors in conversion will result in the comparison evaluating to `false`. This is especially useful when a field like `http status code` may be rendered as strings by some environments and as numbers or booleans by others. +The best practice is to always specify `Datatype`; this avoids ambiguity, allows for more accurate comparisons, and offers a minor performance improvement. - Type: `string` @@ -545,7 +585,7 @@ This is not the same as the Sample Rate. Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the number of instances in the cluster. -If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`. +If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. - Type: `bool` @@ -554,6 +594,7 @@ If `false`, then the goal throughput will be the value specified in `GoalThrough The duration after which the Dynamic Sampler should reset its internal counters. It should be specified as a duration string. For example, "30s" or "1m". +Defaults to "30s". - Type: `duration` @@ -585,7 +626,7 @@ If `false`, then the goal throughput will be the value specified in `GoalThrough ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`.
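To illustrate several of the rule features above at once (the `root.` prefix guarded by `has-root-span`, the `Fields` fallback list, a negative operator confined to `Scope: span`, and an explicit `Datatype`), here is a hypothetical ruleset; all field names and values are examples only:

```yaml
RulesBasedSampler:
  Rules:
    - Name: drop health checks identified on the root span
      Drop: true
      Conditions:
        # Guard first, so a missing root span cannot skew the root. checks.
        - Operator: has-root-span
          Value: true
        - Fields:
            # The first of these fields that has a value is used.
            - root.http.route
            - root.url.path
          Operator: starts-with
          Value: /healthz
    - Name: keep traces containing an unusual status code
      SampleRate: 1
      # Negative operators should be evaluated per-span, per the warning above.
      Scope: span
      Conditions:
        - Field: http.status_code
          Operator: not-in
          Value: [200, 301, 302]
          # Pin the type so string-valued status codes still compare correctly.
          Datatype: int
    - Name: default
      SampleRate: 100
```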
- Type: `bool` diff --git a/route/errors.go b/route/errors.go index c901f8be32..f619d311fd 100644 --- a/route/errors.go +++ b/route/errors.go @@ -2,6 +2,7 @@ package route import ( "fmt" + "io" "net/http" "runtime/debug" @@ -73,3 +74,15 @@ func (r *Router) handlerReturnWithError(w http.ResponseWriter, he handlerError, w.Write(jsonErrMsg) } + +func (r *Router) handleOTLPFailureResponse(w http.ResponseWriter, req *http.Request, otlpErr husky.OTLPError) { + r.Logger.Error().Logf(otlpErr.Error()) + if err := husky.WriteOtlpHttpFailureResponse(w, req, otlpErr); err != nil { + // If we made it here we had a problem writing an OTLP HTTP response + resp := fmt.Sprintf("failed to write otlp http response, %v", err.Error()) + r.Logger.Error().Logf(resp) + w.Header().Set("Content-Type", "text/plain") + w.WriteHeader(http.StatusInternalServerError) + _, _ = io.WriteString(w, resp) + } +} diff --git a/route/middleware.go b/route/middleware.go index 3a9a67979b..0c8897e6c0 100644 --- a/route/middleware.go +++ b/route/middleware.go @@ -2,7 +2,6 @@ package route import ( "context" - "errors" "fmt" "math/rand" "net/http" @@ -38,23 +37,24 @@ func (r *Router) queryTokenChecker(next http.Handler) http.Handler { }) } -func (r *Router) apiKeyChecker(next http.Handler) http.Handler { +func (r *Router) apiKeyProcessor(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { apiKey := req.Header.Get(types.APIKeyHeader) if apiKey == "" { apiKey = req.Header.Get(types.APIKeyHeaderShort) } - if apiKey == "" { - err := errors.New("no " + types.APIKeyHeader + " header found from within authing middleware") + + keycfg := r.Config.GetAccessKeyConfig() + + overwriteWith, err := keycfg.CheckAndMaybeReplaceKey(apiKey) + if err != nil { r.handlerReturnWithError(w, ErrAuthNeeded, err) return } - if r.Config.IsAPIKeyValid(apiKey) { - next.ServeHTTP(w, req) - return + if overwriteWith != apiKey { + req.Header.Set(types.APIKeyHeader, overwriteWith) } - err := fmt.Errorf("api key %s not found in list of authorized keys", apiKey) - r.handlerReturnWithError(w, ErrAuthNeeded, err) + next.ServeHTTP(w, req) }) } diff --git a/route/otlp_logs.go b/route/otlp_logs.go new file mode 100644 index 0000000000..371e1f1af9 --- /dev/null +++ b/route/otlp_logs.go @@ -0,0 +1,82 @@ +package route + +import ( + "context" + "errors" + "net/http" + + huskyotlp "github.com/honeycombio/husky/otlp" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + collectorlogs "go.opentelemetry.io/proto/otlp/collector/logs/v1" +) + +func (r *Router) postOTLPLogs(w http.ResponseWriter, req *http.Request) { + ri := huskyotlp.GetRequestInfoFromHttpHeaders(req.Header) + + if err := ri.ValidateLogsHeaders(); err != nil { + if errors.Is(err, huskyotlp.ErrInvalidContentType) { + r.handlerReturnWithError(w, ErrInvalidContentType, err) + } else { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusUnauthorized}) + } + return + } + + apicfg := r.Config.GetAccessKeyConfig() + keyToUse, err := apicfg.CheckAndMaybeReplaceKey(ri.ApiKey) + + if err != nil { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusUnauthorized}) + return + } + + result, err := huskyotlp.TranslateLogsRequestFromReader(req.Context(), req.Body, ri) + if err != nil { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusInternalServerError}) + return + } + + if err := 
r.processOTLPRequest(req.Context(), result.Batches, keyToUse); err != nil { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusInternalServerError}) + return + } + + _ = huskyotlp.WriteOtlpHttpTraceSuccessResponse(w, req) +} + +type LogsServer struct { + router *Router + collectorlogs.UnimplementedLogsServiceServer +} + +func NewLogsServer(router *Router) *LogsServer { + logsServer := LogsServer{router: router} + return &logsServer +} + +func (l *LogsServer) Export(ctx context.Context, req *collectorlogs.ExportLogsServiceRequest) (*collectorlogs.ExportLogsServiceResponse, error) { + ri := huskyotlp.GetRequestInfoFromGrpcMetadata(ctx) + if err := ri.ValidateLogsHeaders(); err != nil { + return nil, huskyotlp.AsGRPCError(err) + } + + apicfg := l.router.Config.GetAccessKeyConfig() + keyToUse, err := apicfg.CheckAndMaybeReplaceKey(ri.ApiKey) + + if err != nil { + return nil, status.Error(codes.Unauthenticated, err.Error()) + } + + result, err := huskyotlp.TranslateLogsRequest(ctx, req, ri) + if err != nil { + return nil, huskyotlp.AsGRPCError(err) + } + + if err := l.router.processOTLPRequest(ctx, result.Batches, keyToUse); err != nil { + return nil, huskyotlp.AsGRPCError(err) + } + + return &collectorlogs.ExportLogsServiceResponse{}, nil +} diff --git a/route/otlp_logs_test.go b/route/otlp_logs_test.go new file mode 100644 index 0000000000..5ba74ab758 --- /dev/null +++ b/route/otlp_logs_test.go @@ -0,0 +1,407 @@ +package route + +import ( + "bytes" + "compress/gzip" + "context" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + huskyotlp "github.com/honeycombio/husky/otlp" + "github.com/honeycombio/refinery/collect" + "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/sharder" + "github.com/honeycombio/refinery/transmit" + "github.com/klauspost/compress/zstd" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + collectorlogs "go.opentelemetry.io/proto/otlp/collector/logs/v1" + common "go.opentelemetry.io/proto/otlp/common/v1" + logs "go.opentelemetry.io/proto/otlp/logs/v1" + resource "go.opentelemetry.io/proto/otlp/resource/v1" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" +) + +func TestLogsOTLPHandler(t *testing.T) { + md := metadata.New(map[string]string{"x-honeycomb-team": legacyAPIKey, "x-honeycomb-dataset": "ds"}) + ctx := metadata.NewIncomingContext(context.Background(), md) + + mockMetrics := metrics.MockMetrics{} + mockMetrics.Start() + mockTransmission := &transmit.MockTransmission{} + mockTransmission.Start() + mockCollector := collect.NewMockCollector() + decoders, err := makeDecoders(1) + if err != nil { + t.Error(err) + } + logger := &logger.MockLogger{} + router := &Router{ + Config: &config.MockConfig{ + TraceIdFieldNames: []string{"trace.trace_id"}, + }, + Metrics: &mockMetrics, + UpstreamTransmission: mockTransmission, + iopLogger: iopLogger{ + Logger: logger, + incomingOrPeer: "incoming", + }, + Logger: logger, + zstdDecoders: decoders, + environmentCache: newEnvironmentCache(time.Second, nil), + Sharder: &sharder.SingleServerSharder{ + Logger: logger, + }, + Collector: mockCollector, + incomingOrPeer: "incoming", + } + logsServer := 
NewLogsServer(router) + + t.Run("can receive OTLP over gRPC", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + _, err := logsServer.Export(ctx, req) + if err != nil { + t.Errorf(`Unexpected error: %s`, err) + } + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("invalid headers", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{} + body, err := proto.Marshal(req) + assert.NoError(t, err) + anEmptyRequestBody := bytes.NewReader(body) // Empty because we're testing headers, not the body. + + testCases := []struct { + name string + requestContentType string + expectedResponseStatus int + expectedResponseContentType string + expectedResponseBody string + }{ + { + name: "no key/bad content-type", + requestContentType: "application/nope", + expectedResponseStatus: http.StatusUnsupportedMediaType, // Prioritize erroring on bad content type over other header issues. + expectedResponseContentType: "text/plain", + expectedResponseBody: huskyotlp.ErrInvalidContentType.Message, + }, + { + name: "no key/json", + requestContentType: "application/json", + expectedResponseStatus: http.StatusUnauthorized, + expectedResponseContentType: "application/json", + expectedResponseBody: fmt.Sprintf("{\"message\":\"%s\"}", huskyotlp.ErrMissingAPIKeyHeader.Message), + }, + { + name: "no key/protobuf", + requestContentType: "application/protobuf", + expectedResponseStatus: http.StatusUnauthorized, + expectedResponseContentType: "application/protobuf", + expectedResponseBody: fmt.Sprintf("\x12!%s", huskyotlp.ErrMissingAPIKeyHeader.Message), + }, + } + + for _, tC := range testCases { + t.Run(tC.name, func(t *testing.T) { + request, err := http.NewRequest("POST", "/v1/traces", anEmptyRequestBody) + require.NoError(t, err) + request.Header = http.Header{} + request.Header.Set("content-type", tC.requestContentType) + response := httptest.NewRecorder() + router.postOTLPTrace(response, request) + + assert.Equal(t, tC.expectedResponseStatus, response.Code) + assert.Equal(t, tC.expectedResponseContentType, response.Header().Get("content-type")) + assert.Equal(t, tC.expectedResponseBody, response.Body.String()) + }) + } + }) + + t.Run("can receive OTLP over HTTP/protobuf", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + body, err := proto.Marshal(req) + if err != nil { + t.Error(err) + } + + request, _ := http.NewRequest("POST", "/v1/logs", strings.NewReader(string(body))) + request.Header = http.Header{} + request.Header.Set("content-type", "application/protobuf") + request.Header.Set("x-honeycomb-team", legacyAPIKey) + request.Header.Set("x-honeycomb-dataset", "dataset") + + w := httptest.NewRecorder() + router.postOTLPLogs(w, request) + assert.Equal(t, w.Code, http.StatusOK) + + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("can receive OTLP over HTTP/protobuf with gzip encoding", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + body, err := proto.Marshal(req) + if err != nil { + t.Error(err) + } + + buf := new(bytes.Buffer) + writer := gzip.NewWriter(buf) + writer.Write(body) + 
writer.Close() + if err != nil { + t.Error(err) + } + + request, _ := http.NewRequest("POST", "/v1/logs", strings.NewReader(buf.String())) + request.Header = http.Header{} + request.Header.Set("content-type", "application/protobuf") + request.Header.Set("content-encoding", "gzip") + request.Header.Set("x-honeycomb-team", legacyAPIKey) + request.Header.Set("x-honeycomb-dataset", "dataset") + + w := httptest.NewRecorder() + router.postOTLPLogs(w, request) + assert.Equal(t, w.Code, http.StatusOK) + + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("can receive OTLP over HTTP/protobuf with zstd encoding", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + body, err := proto.Marshal(req) + if err != nil { + t.Error(err) + } + + buf := new(bytes.Buffer) + writer, err := zstd.NewWriter(buf) + if err != nil { + t.Error(err) + } + writer.Write(body) + writer.Close() + if err != nil { + t.Error(err) + } + + request, _ := http.NewRequest("POST", "/v1/logs", strings.NewReader(buf.String())) + request.Header = http.Header{} + request.Header.Set("content-type", "application/protobuf") + request.Header.Set("content-encoding", "zstd") + request.Header.Set("x-honeycomb-team", legacyAPIKey) + request.Header.Set("x-honeycomb-dataset", "dataset") + + w := httptest.NewRecorder() + router.postOTLPLogs(w, request) + assert.Equal(t, w.Code, http.StatusOK) + + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("accepts OTLP over HTTP/JSON ", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + body, err := protojson.Marshal(req) + if err != nil { + t.Error(err) + } + + request, _ := http.NewRequest("POST", "/v1/logs", bytes.NewReader(body)) + request.Header = http.Header{} + request.Header.Set("content-type", "application/json") + request.Header.Set("x-honeycomb-team", legacyAPIKey) + request.Header.Set("x-honeycomb-dataset", "dataset") + + w := httptest.NewRecorder() + router.postOTLPLogs(w, request) + assert.Equal(t, w.Code, http.StatusOK) + assert.Equal(t, "{}", w.Body.String()) + + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("rejects bad API keys - HTTP", func(t *testing.T) { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: true, + } + defer func() { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: false, + } + }() + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + body, err := protojson.Marshal(req) + if err != nil { + t.Error(err) + } + + request, _ := http.NewRequest("POST", "/v1/logs", bytes.NewReader(body)) + request.Header = http.Header{} + request.Header.Set("content-type", "application/json") + request.Header.Set("x-honeycomb-team", legacyAPIKey) + request.Header.Set("x-honeycomb-dataset", "dataset") + + w := httptest.NewRecorder() + router.postOTLPLogs(w, request) + assert.Equal(t, http.StatusUnauthorized, w.Code) + assert.Contains(t, w.Body.String(), "not found in list of authorized keys") + + 
assert.Equal(t, 0, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("rejects bad API keys - gRPC", func(t *testing.T) { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: true, + } + defer func() { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: false, + } + }() + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: createLogsRecords(), + }}, + }}, + } + _, err := logsServer.Export(ctx, req) + assert.Equal(t, codes.Unauthenticated, status.Code(err)) + assert.Contains(t, err.Error(), "not found in list of authorized keys") + assert.Equal(t, 0, len(mockTransmission.Events)) + mockTransmission.Flush() + }) + + t.Run("logs with trace ID are added to collector", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + Resource: createResource(), + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: []*logs.LogRecord{{ + TimeUnixNano: uint64(time.Now().UnixNano()), + Attributes: []*common.KeyValue{{ + Key: "trace.trace_id", + Value: &common.AnyValue{ + Value: &common.AnyValue_StringValue{StringValue: "1234567890abcdef"}, + }, + }}, + }}, + }}, + }}, + } + _, err := logsServer.Export(ctx, req) + if err != nil { + t.Errorf(`Unexpected error: %s`, err) + } + assert.Equal(t, 0, len(mockTransmission.Events)) + mockTransmission.Flush() + assert.Equal(t, 1, len(router.Collector.(*collect.MockCollector).Spans)) + mockCollector.Flush() + }) + + t.Run("logs without trace ID are added to transmission", func(t *testing.T) { + req := &collectorlogs.ExportLogsServiceRequest{ + ResourceLogs: []*logs.ResourceLogs{{ + Resource: createResource(), + ScopeLogs: []*logs.ScopeLogs{{ + LogRecords: []*logs.LogRecord{{ + TimeUnixNano: uint64(time.Now().UnixNano()), + }}, + }}, + }}, + } + _, err := logsServer.Export(ctx, req) + if err != nil { + t.Errorf(`Unexpected error: %s`, err) + } + assert.Equal(t, 1, len(mockTransmission.Events)) + mockTransmission.Flush() + assert.Equal(t, 0, len(router.Collector.(*collect.MockCollector).Spans)) + mockCollector.Flush() + }) +} + +func createLogsRecords() []*logs.LogRecord { + now := time.Now() + return []*logs.LogRecord{ + { + TimeUnixNano: uint64(now.UnixNano()), + Body: &common.AnyValue{ + Value: &common.AnyValue_StringValue{StringValue: "log message"}, + }, + Attributes: []*common.KeyValue{ + { + Key: "attribute_key", + Value: &common.AnyValue{ + Value: &common.AnyValue_StringValue{StringValue: "attribute_value"}, + }, + }, + }, + SeverityText: "INFO", + }, + } +} + +func createResource() *resource.Resource { + return &resource.Resource{ + Attributes: []*common.KeyValue{ + {Key: "service.name", Value: &common.AnyValue{Value: &common.AnyValue_StringValue{StringValue: "my-service"}}}, + }, + } +} diff --git a/route/otlp_trace.go b/route/otlp_trace.go index dc07fdcb1c..c0137b0a76 100644 --- a/route/otlp_trace.go +++ b/route/otlp_trace.go @@ -3,42 +3,47 @@ package route import ( "context" "errors" - "fmt" "net/http" huskyotlp "github.com/honeycombio/husky/otlp" - "github.com/honeycombio/refinery/types" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" collectortrace "go.opentelemetry.io/proto/otlp/collector/trace/v1" ) -func (r *Router) postOTLP(w http.ResponseWriter, req *http.Request) { +func (r *Router) postOTLPTrace(w 
http.ResponseWriter, req *http.Request) { ri := huskyotlp.GetRequestInfoFromHttpHeaders(req.Header) - if !r.Config.IsAPIKeyValid(ri.ApiKey) { - err := fmt.Errorf("api key %s not found in list of authorized keys", ri.ApiKey) - r.handlerReturnWithError(w, ErrAuthNeeded, err) - return - } - if err := ri.ValidateTracesHeaders(); err != nil { if errors.Is(err, huskyotlp.ErrInvalidContentType) { - r.handlerReturnWithError(w, ErrInvalidContentType, err) + r.handleOTLPFailureResponse(w, req, huskyotlp.ErrInvalidContentType) } else { - r.handlerReturnWithError(w, ErrAuthNeeded, err) + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusUnauthorized}) } return } - result, err := huskyotlp.TranslateTraceRequestFromReader(req.Body, ri) + apicfg := r.Config.GetAccessKeyConfig() + keyToUse, err := apicfg.CheckAndMaybeReplaceKey(ri.ApiKey) + + if err != nil { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusUnauthorized}) + return + } + + result, err := huskyotlp.TranslateTraceRequestFromReader(req.Context(), req.Body, ri) if err != nil { - r.handlerReturnWithError(w, ErrUpstreamFailed, err) + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusInternalServerError}) return } - if err := processTraceRequest(req.Context(), r, result.Batches, ri.ApiKey); err != nil { - r.handlerReturnWithError(w, ErrUpstreamFailed, err) + if err := r.processOTLPRequest(req.Context(), result.Batches, keyToUse); err != nil { + r.handleOTLPFailureResponse(w, req, huskyotlp.OTLPError{Message: err.Error(), HTTPStatusCode: http.StatusInternalServerError}) + return } + + _ = huskyotlp.WriteOtlpHttpTraceSuccessResponse(w, req) } type TraceServer struct { @@ -57,54 +62,21 @@ func (t *TraceServer) Export(ctx context.Context, req *collectortrace.ExportTrac return nil, huskyotlp.AsGRPCError(err) } - result, err := huskyotlp.TranslateTraceRequest(req, ri) - if err != nil { - return nil, huskyotlp.AsGRPCError(err) - } - - if err := processTraceRequest(ctx, t.router, result.Batches, ri.ApiKey); err != nil { - return nil, huskyotlp.AsGRPCError(err) - } - - return &collectortrace.ExportTraceServiceResponse{}, nil -} - -func processTraceRequest( - ctx context.Context, - router *Router, - batches []huskyotlp.Batch, - apiKey string) error { + apicfg := t.router.Config.GetAccessKeyConfig() + keyToUse, err := apicfg.CheckAndMaybeReplaceKey(ri.ApiKey) - var requestID types.RequestIDContextKey - apiHost, err := router.Config.GetHoneycombAPI() if err != nil { - router.Logger.Error().Logf("Unable to retrieve APIHost from config while processing OTLP batch") - return err + return nil, status.Error(codes.Unauthenticated, err.Error()) } - // get environment name - will be empty for legacy keys - environment, err := router.getEnvironmentName(apiKey) + result, err := huskyotlp.TranslateTraceRequest(ctx, req, ri) if err != nil { - return nil + return nil, huskyotlp.AsGRPCError(err) } - for _, batch := range batches { - for _, ev := range batch.Events { - event := &types.Event{ - Context: ctx, - APIHost: apiHost, - APIKey: apiKey, - Dataset: batch.Dataset, - Environment: environment, - SampleRate: uint(ev.SampleRate), - Timestamp: ev.Timestamp, - Data: ev.Attributes, - } - if err = router.processEvent(event, requestID); err != nil { - router.Logger.Error().Logf("Error processing event: " + err.Error()) - } - } + if err := t.router.processOTLPRequest(ctx, result.Batches, keyToUse); err != nil { + 
return nil, huskyotlp.AsGRPCError(err) } - return nil + return &collectortrace.ExportTraceServiceResponse{}, nil } diff --git a/route/otlp_trace_test.go b/route/otlp_trace_test.go index efdc672798..93fbb390e6 100644 --- a/route/otlp_trace_test.go +++ b/route/otlp_trace_test.go @@ -5,6 +5,7 @@ import ( "compress/gzip" "context" "encoding/hex" + "fmt" "net/http" "net/http/httptest" "strings" @@ -18,11 +19,14 @@ import ( "github.com/honeycombio/refinery/transmit" "github.com/klauspost/compress/zstd" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" collectortrace "go.opentelemetry.io/proto/otlp/collector/trace/v1" common "go.opentelemetry.io/proto/otlp/common/v1" resource "go.opentelemetry.io/proto/otlp/resource/v1" trace "go.opentelemetry.io/proto/otlp/trace/v1" + "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/proto" ) @@ -41,8 +45,23 @@ func TestOTLPHandler(t *testing.T) { if err != nil { t.Error(err) } + + conf := &config.MockConfig{ + GetTracesConfigVal: config.TracesConfig{ + SendTicker: config.Duration(2 * time.Millisecond), + SendDelay: config.Duration(1 * time.Millisecond), + TraceTimeout: config.Duration(60 * time.Second), + MaxBatchSize: 500, + }, + GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, + GetCollectionConfigVal: config.CollectionConfig{ + CacheCapacity: 100, + MaxAlloc: 100, + }, + } + router := &Router{ - Config: &config.MockConfig{}, + Config: conf, Metrics: &mockMetrics, UpstreamTransmission: mockTransmission, iopLogger: iopLogger{ @@ -54,17 +73,6 @@ func TestOTLPHandler(t *testing.T) { environmentCache: newEnvironmentCache(time.Second, nil), } - conf := &config.MockConfig{ - GetSendDelayVal: 0, - GetTraceTimeoutVal: 60 * time.Second, - GetSamplerTypeVal: &config.DeterministicSamplerConfig{SampleRate: 1}, - SendTickerVal: 2 * time.Millisecond, - GetCollectionConfigVal: config.CollectionConfig{ - CacheCapacity: 100, - MaxAlloc: 100, - }, - } - t.Run("span with status", func(t *testing.T) { req := &collectortrace.ExportTraceServiceRequest{ ResourceSpans: []*trace.ResourceSpans{{ @@ -130,7 +138,7 @@ func TestOTLPHandler(t *testing.T) { t.Errorf(`Unexpected error: %s`, err) } - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfigVal.GetSendTickerValue() * 2) mockTransmission.Mux.Lock() assert.Equal(t, 2, len(mockTransmission.Events)) @@ -180,7 +188,7 @@ func TestOTLPHandler(t *testing.T) { t.Errorf(`Unexpected error: %s`, err) } - time.Sleep(conf.SendTickerVal * 2) + time.Sleep(conf.GetTracesConfigVal.GetSendTickerValue() * 2) assert.Equal(t, 2, len(mockTransmission.Events)) spanLink := mockTransmission.Events[1] @@ -193,6 +201,58 @@ func TestOTLPHandler(t *testing.T) { mockTransmission.Flush() }) + t.Run("invalid headers", func(t *testing.T) { + req := &collectortrace.ExportTraceServiceRequest{} + body, err := proto.Marshal(req) + assert.NoError(t, err) + anEmptyRequestBody := bytes.NewReader(body) // Empty because we're testing headers, not the body. + + testCases := []struct { + name string + requestContentType string + expectedResponseStatus int + expectedResponseContentType string + expectedResponseBody string + }{ + { + name: "no key/bad content-type", + requestContentType: "application/nope", + expectedResponseStatus: http.StatusUnsupportedMediaType, // Prioritize erroring on bad content type over other header issues. 
+ expectedResponseContentType: "text/plain", + expectedResponseBody: huskyotlp.ErrInvalidContentType.Message, + }, + { + name: "no key/json", + requestContentType: "application/json", + expectedResponseStatus: http.StatusUnauthorized, + expectedResponseContentType: "application/json", + expectedResponseBody: fmt.Sprintf("{\"message\":\"%s\"}", huskyotlp.ErrMissingAPIKeyHeader.Message), + }, + { + name: "no key/protobuf", + requestContentType: "application/protobuf", + expectedResponseStatus: http.StatusUnauthorized, + expectedResponseContentType: "application/protobuf", + expectedResponseBody: fmt.Sprintf("\x12!%s", huskyotlp.ErrMissingAPIKeyHeader.Message), + }, + } + + for _, tC := range testCases { + t.Run(tC.name, func(t *testing.T) { + request, err := http.NewRequest("POST", "/v1/traces", anEmptyRequestBody) + require.NoError(t, err) + request.Header = http.Header{} + request.Header.Set("content-type", tC.requestContentType) + response := httptest.NewRecorder() + router.postOTLPTrace(response, request) + + assert.Equal(t, tC.expectedResponseStatus, response.Code) + assert.Equal(t, tC.expectedResponseContentType, response.Header().Get("content-type")) + assert.Equal(t, tC.expectedResponseBody, response.Body.String()) + }) + } + }) + t.Run("can receive OTLP over HTTP/protobuf", func(t *testing.T) { req := &collectortrace.ExportTraceServiceRequest{ ResourceSpans: []*trace.ResourceSpans{{ @@ -213,7 +273,7 @@ func TestOTLPHandler(t *testing.T) { request.Header.Set("x-honeycomb-dataset", "dataset") w := httptest.NewRecorder() - router.postOTLP(w, request) + router.postOTLPTrace(w, request) assert.Equal(t, w.Code, http.StatusOK) assert.Equal(t, 2, len(mockTransmission.Events)) @@ -249,7 +309,7 @@ func TestOTLPHandler(t *testing.T) { request.Header.Set("x-honeycomb-dataset", "dataset") w := httptest.NewRecorder() - router.postOTLP(w, request) + router.postOTLPTrace(w, request) assert.Equal(t, w.Code, http.StatusOK) assert.Equal(t, 2, len(mockTransmission.Events)) @@ -288,7 +348,7 @@ func TestOTLPHandler(t *testing.T) { request.Header.Set("x-honeycomb-dataset", "dataset") w := httptest.NewRecorder() - router.postOTLP(w, request) + router.postOTLPTrace(w, request) assert.Equal(t, w.Code, http.StatusOK) assert.Equal(t, 2, len(mockTransmission.Events)) @@ -315,9 +375,9 @@ func TestOTLPHandler(t *testing.T) { request.Header.Set("x-honeycomb-dataset", "dataset") w := httptest.NewRecorder() - router.postOTLP(w, request) + router.postOTLPTrace(w, request) assert.Equal(t, w.Code, http.StatusOK) - assert.Equal(t, "", w.Body.String()) + assert.Equal(t, "{}", w.Body.String()) assert.Equal(t, 2, len(mockTransmission.Events)) mockTransmission.Flush() @@ -387,8 +447,11 @@ func TestOTLPHandler(t *testing.T) { mockTransmission.Flush() }) - t.Run("rejects bad API keys", func(t *testing.T) { - router.Config.(*config.MockConfig).IsAPIKeyValidFunc = func(k string) bool { return false } + t.Run("rejects bad API keys - HTTP", func(t *testing.T) { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: true, + } req := &collectortrace.ExportTraceServiceRequest{ ResourceSpans: []*trace.ResourceSpans{{ ScopeSpans: []*trace.ScopeSpans{{ @@ -408,14 +471,33 @@ func TestOTLPHandler(t *testing.T) { request.Header.Set("x-honeycomb-dataset", "dataset") w := httptest.NewRecorder() - router.postOTLP(w, request) - assert.Equal(t, http.StatusBadRequest, w.Code) + router.postOTLPTrace(w, request) + assert.Equal(t, http.StatusUnauthorized, w.Code) 
assert.Contains(t, w.Body.String(), "not found in list of authorized keys") assert.Equal(t, 0, len(mockTransmission.Events)) mockTransmission.Flush() }) + t.Run("rejects bad API keys - gRPC", func(t *testing.T) { + router.Config.(*config.MockConfig).GetAccessKeyConfigVal = config.AccessKeyConfig{ + ReceiveKeys: []string{}, + AcceptOnlyListedKeys: true, + } + req := &collectortrace.ExportTraceServiceRequest{ + ResourceSpans: []*trace.ResourceSpans{{ + ScopeSpans: []*trace.ScopeSpans{{ + Spans: helperOTLPRequestSpansWithStatus(), + }}, + }}, + } + traceServer := NewTraceServer(router) + _, err := traceServer.Export(ctx, req) + assert.Equal(t, codes.Unauthenticated, status.Code(err)) + assert.Contains(t, err.Error(), "not found in list of authorized keys") + assert.Equal(t, 0, len(mockTransmission.Events)) + mockTransmission.Flush() + }) } func helperOTLPRequestSpansWithoutStatus() []*trace.Span { diff --git a/route/proxy.go b/route/proxy.go index faefceaef0..1982853dca 100644 --- a/route/proxy.go +++ b/route/proxy.go @@ -13,13 +13,7 @@ import ( func (r *Router) proxy(w http.ResponseWriter, req *http.Request) { r.Metrics.Increment(r.incomingOrPeer + "_router_proxied") r.Logger.Debug().Logf("proxying request for %s", req.URL.Path) - upstreamTarget, err := r.Config.GetHoneycombAPI() - if err != nil { - w.WriteHeader(http.StatusServiceUnavailable) - io.WriteString(w, `{"error":"upstream target unavailable"}`) - r.Logger.Error().Logf("error getting honeycomb API config: %s", err) - return - } + upstreamTarget := r.Config.GetHoneycombAPI() forwarded := req.Header.Get("X-Forwarded-For") // let's copy the request over to a new one and // dispatch it upstream diff --git a/route/route.go b/route/route.go index b608445039..249d727917 100644 --- a/route/route.go +++ b/route/route.go @@ -7,6 +7,7 @@ import ( "encoding/json" "errors" "fmt" + "html" "io" "math" "net" @@ -23,6 +24,7 @@ import ( "github.com/pelletier/go-toml/v2" "github.com/vmihailenco/msgpack/v5" "google.golang.org/grpc" + healthserver "google.golang.org/grpc/health" "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/keepalive" "google.golang.org/grpc/metadata" @@ -31,14 +33,18 @@ import ( // grpc/gzip compressor, auto registers on import _ "google.golang.org/grpc/encoding/gzip" + huskyotlp "github.com/honeycombio/husky/otlp" + "github.com/honeycombio/refinery/collect" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/sharder" "github.com/honeycombio/refinery/transmit" "github.com/honeycombio/refinery/types" + collectorlogs "go.opentelemetry.io/proto/otlp/collector/logs/v1" collectortrace "go.opentelemetry.io/proto/otlp/collector/trace/v1" ) @@ -49,13 +55,15 @@ const ( numZstdDecoders = 4 traceIDShortLength = 8 traceIDLongLength = 16 - GRPCMessageSizeMax int = 5000000 // 5MB + GRPCMessageSizeMax int = 5_000_000 // 5MB + HTTPMessageSizeMax = 5_000_000 // 5MB defaultSampleRate = 1 ) type Router struct { Config config.Config `inject:""` Logger logger.Logger `inject:""` + Health health.Reporter `inject:""` HTTPTransport *http.Transport `inject:"upstreamTransport"` UpstreamTransmission transmit.Transmission `inject:"upstreamTransmission"` PeerTransmission transmit.Transmission `inject:"peerTransmission"` @@ -81,8 +89,10 @@ type Router struct { server *http.Server grpcServer 
*grpc.Server doneWG sync.WaitGroup + donech chan struct{} environmentCache *environmentCache + hsrv *healthserver.Server } type BatchResponse struct { @@ -149,8 +159,8 @@ func (r *Router) LnS(incomingOrPeer string) { muxxer.Use(r.requestLogger) muxxer.Use(r.panicCatcher) - // answer a basic health check locally muxxer.HandleFunc("/alive", r.alive).Name("local health") + muxxer.HandleFunc("/ready", r.ready).Name("local readiness") muxxer.HandleFunc("/panic", r.panic).Name("intentional panic") muxxer.HandleFunc("/version", r.version).Name("report version info") @@ -165,7 +175,8 @@ func (r *Router) LnS(incomingOrPeer string) { // require an auth header for events and batches authedMuxxer := muxxer.PathPrefix("/1/").Methods("POST").Subrouter() - authedMuxxer.Use(r.apiKeyChecker) + authedMuxxer.UseEncodedPath() + authedMuxxer.Use(r.apiKeyProcessor) // handle events and batches authedMuxxer.HandleFunc("/events/{datasetName}", r.event).Name("event") @@ -179,23 +190,11 @@ func (r *Router) LnS(incomingOrPeer string) { var listenAddr, grpcAddr string if r.incomingOrPeer == "incoming" { - listenAddr, err = r.Config.GetListenAddr() - if err != nil { - r.iopLogger.Error().Logf("failed to get listen addr config: %s", err) - return - } - // GRPC listen addr is optional, err means addr was not empty and invalid - grpcAddr, err = r.Config.GetGRPCListenAddr() - if err != nil { - r.iopLogger.Error().Logf("failed to get grpc listen addr config: %s", err) - return - } + listenAddr = r.Config.GetListenAddr() + // GRPC listen addr is optional + grpcAddr = r.Config.GetGRPCListenAddr() } else { - listenAddr, err = r.Config.GetPeerListenAddr() - if err != nil { - r.iopLogger.Error().Logf("failed to get peer listen addr config: %s", err) - return - } + listenAddr = r.Config.GetPeerListenAddr() } r.iopLogger.Info().Logf("Listening on %s", listenAddr) @@ -205,6 +204,7 @@ func (r *Router) LnS(incomingOrPeer string) { IdleTimeout: r.Config.GetHTTPIdleTimeout(), } + r.donech = make(chan struct{}) if r.Config.GetGRPCEnabled() && len(grpcAddr) > 0 { l, err := net.Listen("tcp", grpcAddr) if err != nil { @@ -224,10 +224,19 @@ func (r *Router) LnS(incomingOrPeer string) { Timeout: time.Duration(grpcConfig.KeepAliveTimeout), }), } - traceServer := NewTraceServer(r) r.grpcServer = grpc.NewServer(serverOpts...) 
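+
+		// Register the OTLP trace and logs services, plus the stock gRPC health
+		// service (which replaces the router's own health handler on this server).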
+ + traceServer := NewTraceServer(r) collectortrace.RegisterTraceServiceServer(r.grpcServer, traceServer) - grpc_health_v1.RegisterHealthServer(r.grpcServer, r) + + logsServer := NewLogsServer(r) + collectorlogs.RegisterLogsServiceServer(r.grpcServer, logsServer) + + // health check -- manufactured by grpc health package + r.hsrv = healthserver.NewServer() + grpc_health_v1.RegisterHealthServer(r.grpcServer, r.hsrv) + r.startGRPCHealthMonitor() + go r.grpcServer.Serve(l) } @@ -253,13 +262,35 @@ func (r *Router) Stop() error { if r.grpcServer != nil { r.grpcServer.GracefulStop() } + close(r.donech) r.doneWG.Wait() return nil } func (r *Router) alive(w http.ResponseWriter, req *http.Request) { - r.iopLogger.Debug().Logf("answered /x/alive check") - w.Write([]byte(`{"source":"refinery","alive":"yes"}`)) + r.iopLogger.Debug().Logf("answered /alive check") + + alive := r.Health.IsAlive() + r.Metrics.Gauge("is_alive", alive) + if !alive { + w.WriteHeader(http.StatusServiceUnavailable) + r.marshalToFormat(w, map[string]interface{}{"source": "refinery", "alive": "no"}, "json") + return + } + r.marshalToFormat(w, map[string]interface{}{"source": "refinery", "alive": "yes"}, "json") +} + +func (r *Router) ready(w http.ResponseWriter, req *http.Request) { + r.iopLogger.Debug().Logf("answered /ready check") + + ready := r.Health.IsReady() + r.Metrics.Gauge("is_ready", ready) + if !ready { + w.WriteHeader(http.StatusServiceUnavailable) + r.marshalToFormat(w, map[string]interface{}{"source": "refinery", "ready": "no"}, "json") + return + } + r.marshalToFormat(w, map[string]interface{}{"source": "refinery", "ready": "yes"}, "json") } func (r *Router) panic(w http.ResponseWriter, req *http.Request) { @@ -273,29 +304,19 @@ func (r *Router) version(w http.ResponseWriter, req *http.Request) { func (r *Router) debugTrace(w http.ResponseWriter, req *http.Request) { traceID := mux.Vars(req)["traceID"] shard := r.Sharder.WhichShard(traceID) - w.Write([]byte(fmt.Sprintf(`{"traceID":"%s","node":"%s"}`, traceID, shard.GetAddress()))) + w.Write([]byte(fmt.Sprintf(`{"traceID":"%s","node":"%s"}`, html.EscapeString(traceID), shard.GetAddress()))) } func (r *Router) getSamplerRules(w http.ResponseWriter, req *http.Request) { format := strings.ToLower(mux.Vars(req)["format"]) dataset := mux.Vars(req)["dataset"] - cfg, name, err := r.Config.GetSamplerConfigForDestName(dataset) - if err != nil { - w.Write([]byte(fmt.Sprintf("got error %v trying to fetch config for dataset %s\n", err, dataset))) - w.WriteHeader(http.StatusBadRequest) - return - } + cfg, name := r.Config.GetSamplerConfigForDestName(dataset) r.marshalToFormat(w, map[string]interface{}{name: cfg}, format) } func (r *Router) getAllSamplerRules(w http.ResponseWriter, req *http.Request) { format := strings.ToLower(mux.Vars(req)["format"]) - cfgs, err := r.Config.GetAllSamplerRules() - if err != nil { - w.Write([]byte(fmt.Sprintf("got error %v trying to fetch configs", err))) - w.WriteHeader(http.StatusBadRequest) - return - } + cfgs := r.Config.GetAllSamplerRules() r.marshalToFormat(w, cfgs, format) } @@ -380,14 +401,13 @@ func (r *Router) requestToEvent(req *http.Request, reqBod []byte) (*types.Event, sampleRate = 1 } eventTime := getEventTime(req.Header.Get(types.TimestampHeader)) - vars := mux.Vars(req) - dataset := vars["datasetName"] - - apiHost, err := r.Config.GetHoneycombAPI() + dataset, err := getDatasetFromRequest(req) if err != nil { return nil, err } + apiHost := r.Config.GetHoneycombAPI() + // get environment name - will be empty for legacy keys 
 	environment, err := r.getEnvironmentName(apiKey)
 	if err != nil {
@@ -439,6 +459,13 @@ func (r *Router) batch(w http.ResponseWriter, req *http.Request) {
 		return
 	}
 
+	dataset, err := getDatasetFromRequest(req)
+	if err != nil {
+		r.handlerReturnWithError(w, ErrReqToEvent, err)
+		return
+	}
+	apiHost := r.Config.GetHoneycombAPI()
+
 	apiKey := req.Header.Get(types.APIKeyHeader)
 	if apiKey == "" {
 		apiKey = req.Header.Get(types.APIKeyHeaderShort)
 	}
@@ -452,17 +478,15 @@
 	batchedResponses := make([]*BatchResponse, 0, len(batchedEvents))
 	for _, bev := range batchedEvents {
-		ev, err := r.batchedEventToEvent(req, bev, apiKey, environment)
-		if err != nil {
-			batchedResponses = append(
-				batchedResponses,
-				&BatchResponse{
-					Status: http.StatusBadRequest,
-					Error:  fmt.Sprintf("failed to convert to event: %s", err.Error()),
-				},
-			)
-			debugLog.WithField("error", err).Logf("event from batch failed to process event")
-			continue
+		ev := &types.Event{
+			Context:     req.Context(),
+			APIHost:     apiHost,
+			APIKey:      apiKey,
+			Dataset:     dataset,
+			Environment: environment,
+			SampleRate:  bev.getSampleRate(),
+			Timestamp:   bev.getEventTime(),
+			Data:        bev.Data,
 		}
 
 		err = r.processEvent(ev, reqID)
@@ -488,6 +512,41 @@ func (r *Router) batch(w http.ResponseWriter, req *http.Request) {
 	w.Write(response)
 }
 
+func (router *Router) processOTLPRequest(
+	ctx context.Context,
+	batches []huskyotlp.Batch,
+	apiKey string) error {
+
+	var requestID types.RequestIDContextKey
+	apiHost := router.Config.GetHoneycombAPI()
+
+	// get environment name - will be empty for legacy keys
+	environment, err := router.getEnvironmentName(apiKey)
+	if err != nil {
+		return err
+	}
+
+	for _, batch := range batches {
+		for _, ev := range batch.Events {
+			event := &types.Event{
+				Context:     ctx,
+				APIHost:     apiHost,
+				APIKey:      apiKey,
+				Dataset:     batch.Dataset,
+				Environment: environment,
+				SampleRate:  uint(ev.SampleRate),
+				Timestamp:   ev.Timestamp,
+				Data:        ev.Attributes,
+			}
+			if err = router.processEvent(event, requestID); err != nil {
+				router.Logger.Error().Logf("Error processing event: " + err.Error())
+			}
+		}
+	}
+
+	return nil
+}
+
 func (r *Router) processEvent(ev *types.Event, reqID interface{}) error {
 	debugLog := r.iopLogger.Debug().
 		WithField("request_id", reqID).
@@ -530,22 +589,24 @@ func (r *Router) processEvent(ev *types.Event, reqID interface{}) error {
 	// and either drop or send the trace without even trying to cache or forward it.
 	isProbe := false
 	if r.Collector.Stressed() {
-		rate, keep, reason := r.Collector.GetStressedSampleRate(traceID)
+		processed, kept := r.Collector.ProcessSpanImmediately(span)
+
+		if processed {
+			if !kept {
+				return nil
-		r.Collector.ProcessSpanImmediately(span, keep, rate, reason)
+			}
-		if !keep {
-			return nil
+			// If the span was kept, we want to generate a probe that we'll forward
+			// to a peer IF this span would have been forwarded.
+			ev.Data["meta.refinery.probe"] = true
+			isProbe = true
 		}
-		// If the span was kept, we want to generate a probe that we'll forward
-		// to a peer IF this span would have been forwarded.
-		ev.Data["meta.refinery.probe"] = true
-		isProbe = true
 	}
 
 	// Figure out if we should handle this span locally or pass on to a peer
 	targetShard := r.Sharder.WhichShard(traceID)
-	if r.incomingOrPeer == "incoming" && !targetShard.Equals(r.Sharder.MyShard()) {
+	if !targetShard.Equals(r.Sharder.MyShard()) {
 		r.Metrics.Increment(r.incomingOrPeer + "_router_peer")
 		debugLog.
 			WithString("peer", targetShard.GetAddress()).
@@ -596,7 +657,7 @@ func (r *Router) getMaybeCompressedBody(req *http.Request) (io.Reader, error) { defer gzipReader.Close() buf := &bytes.Buffer{} - if _, err := io.Copy(buf, gzipReader); err != nil { + if _, err := io.Copy(buf, io.LimitReader(gzipReader, HTTPMessageSizeMax)); err != nil { return nil, err } reader = buf @@ -612,7 +673,7 @@ func (r *Router) getMaybeCompressedBody(req *http.Request) (io.Reader, error) { return nil, err } buf := &bytes.Buffer{} - if _, err := io.Copy(buf, zReader); err != nil { + if _, err := io.Copy(buf, io.LimitReader(zReader, HTTPMessageSizeMax)); err != nil { return nil, err } @@ -623,32 +684,6 @@ func (r *Router) getMaybeCompressedBody(req *http.Request) (io.Reader, error) { return reader, nil } -func (r *Router) batchedEventToEvent(req *http.Request, bev batchedEvent, apiKey string, environment string) (*types.Event, error) { - sampleRate := bev.SampleRate - if sampleRate == 0 { - sampleRate = 1 - } - eventTime := bev.getEventTime() - // TODO move the following 3 lines outside of this loop; they could be done - // once for the entire batch instead of in every event. - vars := mux.Vars(req) - dataset := vars["datasetName"] - apiHost, err := r.Config.GetHoneycombAPI() - if err != nil { - return nil, err - } - return &types.Event{ - Context: req.Context(), - APIHost: apiHost, - APIKey: apiKey, - Dataset: dataset, - Environment: environment, - SampleRate: uint(sampleRate), - Timestamp: eventTime, - Data: bev.Data, - }, nil -} - type batchedEvent struct { Timestamp string `json:"time"` MsgPackTimestamp *time.Time `msgpack:"time,omitempty"` @@ -664,6 +699,13 @@ func (b *batchedEvent) getEventTime() time.Time { return getEventTime(b.Timestamp) } +func (b *batchedEvent) getSampleRate() uint { + if b.SampleRate == 0 { + return defaultSampleRate + } + return uint(b.SampleRate) +} + // getEventTime tries to guess the time format in our time header! // Allowable options are // * RFC3339Nano @@ -782,11 +824,20 @@ type cacheItem struct { // get queries the cached items, returning cache hits that have not expired. // Cache missed use the configured getFn to populate the cache. func (c *environmentCache) get(key string) (string, error) { + var val string + // get read lock so that we don't attempt to read from the map + // while another routine has a write lock and is actively writing + // to the map. + c.mutex.RLock() if item, ok := c.items[key]; ok { if time.Now().Before(item.expiresAt) { - return item.value, nil + val = item.value } } + c.mutex.RUnlock() + if val != "" { + return val, nil + } // get write lock early so we don't execute getFn in parallel so the // the result will be cached before the next lock is acquired to prevent @@ -847,10 +898,7 @@ func (r *Router) getEnvironmentName(apiKey string) (string, error) { } func (r *Router) lookupEnvironment(apiKey string) (string, error) { - apiEndpoint, err := r.Config.GetHoneycombAPI() - if err != nil { - return "", fmt.Errorf("failed to read Honeycomb API config value. %w", err) - } + apiEndpoint := r.Config.GetHoneycombAPI() authURL, err := url.Parse(apiEndpoint) if err != nil { return "", fmt.Errorf("failed to parse Honeycomb API URL config value. 
%w", err) @@ -900,13 +948,68 @@ func (r *Router) Watch(req *grpc_health_v1.HealthCheckRequest, server grpc_healt }) } +// startGRPCHealthMonitor starts a goroutine that periodically checks the health of the system and updates the grpc health server +func (r *Router) startGRPCHealthMonitor() { + const ( + system = "" // empty string represents the generic health of the whole system (corresponds to "ready") + systemReady = "ready" + systemAlive = "alive" + ) + r.iopLogger.Debug().Logf("running grpc health monitor") + + setStatus := func(svc string, stat bool) { + if stat { + r.hsrv.SetServingStatus(svc, grpc_health_v1.HealthCheckResponse_SERVING) + } else { + r.hsrv.SetServingStatus(svc, grpc_health_v1.HealthCheckResponse_NOT_SERVING) + } + } + + r.doneWG.Add(1) + go func() { + defer r.doneWG.Done() + // TODO: Does this time need to be configurable? + watchticker := time.NewTicker(3 * time.Second) + defer watchticker.Stop() + for { + select { + case <-watchticker.C: + alive := r.Health.IsAlive() + ready := r.Health.IsReady() + + // we can just update everything because the grpc health server will only send updates if the status changes + setStatus(systemReady, ready) + setStatus(systemAlive, alive) + setStatus(system, ready && alive) + case <-r.donech: + return + } + } + }() +} + // AddOTLPMuxxer adds muxxer for OTLP requests func (r *Router) AddOTLPMuxxer(muxxer *mux.Router) { // require an auth header for OTLP requests otlpMuxxer := muxxer.PathPrefix("/v1/").Methods("POST").Subrouter() - otlpMuxxer.Use(r.apiKeyChecker) // handle OTLP trace requests - otlpMuxxer.HandleFunc("/traces", r.postOTLP).Name("otlp") - otlpMuxxer.HandleFunc("/traces/", r.postOTLP).Name("otlp") + otlpMuxxer.HandleFunc("/traces", r.postOTLPTrace).Name("otlp_traces") + otlpMuxxer.HandleFunc("/traces/", r.postOTLPTrace).Name("otlp_traces") + + // handle OTLP logs requests + otlpMuxxer.HandleFunc("/logs", r.postOTLPLogs).Name("otlp_logs") + otlpMuxxer.HandleFunc("/logs/", r.postOTLPLogs).Name("otlp_logs") +} + +func getDatasetFromRequest(req *http.Request) (string, error) { + dataset := mux.Vars(req)["datasetName"] + if dataset == "" { + return "", fmt.Errorf("missing dataset name") + } + dataset, err := url.PathUnescape(dataset) + if err != nil { + return "", err + } + return dataset, nil } diff --git a/route/route_test.go b/route/route_test.go index e66ff6c9ed..1649df16cf 100644 --- a/route/route_test.go +++ b/route/route_test.go @@ -9,6 +9,7 @@ import ( "io" "net/http" "net/http/httptest" + "net/url" "strings" "testing" "time" @@ -16,17 +17,20 @@ import ( "github.com/facebookgo/inject" "github.com/honeycombio/refinery/collect" "github.com/honeycombio/refinery/config" + "github.com/honeycombio/refinery/internal/health" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/honeycombio/refinery/metrics" + "github.com/honeycombio/refinery/sharder" "github.com/honeycombio/refinery/transmit" + "github.com/jonboulle/clockwork" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" collectortrace "go.opentelemetry.io/proto/otlp/collector/trace/v1" trace "go.opentelemetry.io/proto/otlp/trace/v1" "github.com/gorilla/mux" - "github.com/honeycombio/refinery/sharder" "github.com/klauspost/compress/zstd" "github.com/vmihailenco/msgpack/v5" "google.golang.org/grpc/health/grpc_health_v1" @@ -317,7 
+321,10 @@ func TestDebugTrace(t *testing.T) { rr := httptest.NewRecorder() router := &Router{ - Sharder: &TestSharder{}, + Sharder: &sharder.MockSharder{ + Self: &sharder.TestShard{Addr: "http://localhost:12345"}, + Other: &sharder.TestShard{Addr: "http://localhost:12345"}, + }, } router.debugTrace(rr, req) @@ -473,15 +480,18 @@ func TestDependencyInjection(t *testing.T) { &inject.Object{Value: &config.MockConfig{}}, &inject.Object{Value: &logger.NullLogger{}}, + &inject.Object{Value: noop.NewTracerProvider().Tracer("test"), Name: "tracer"}, &inject.Object{Value: http.DefaultTransport, Name: "upstreamTransport"}, &inject.Object{Value: &transmit.MockTransmission{}, Name: "upstreamTransmission"}, &inject.Object{Value: &transmit.MockTransmission{}, Name: "peerTransmission"}, - &inject.Object{Value: &TestSharder{}}, + &inject.Object{Value: &sharder.MockSharder{}}, &inject.Object{Value: &collect.InMemCollector{}}, &inject.Object{Value: &metrics.NullMetrics{}, Name: "metrics"}, &inject.Object{Value: &metrics.NullMetrics{}, Name: "genericMetrics"}, &inject.Object{Value: &collect.MockStressReliever{}, Name: "stressRelief"}, &inject.Object{Value: &peer.MockPeers{}}, + &inject.Object{Value: &health.Health{}}, + &inject.Object{Value: clockwork.NewFakeClock()}, ) if err != nil { t.Error(err) @@ -491,23 +501,6 @@ func TestDependencyInjection(t *testing.T) { } } -type TestSharder struct{} - -func (s *TestSharder) MyShard() sharder.Shard { return nil } - -func (s *TestSharder) WhichShard(string) sharder.Shard { - return &TestShard{ - addr: "http://localhost:12345", - } -} - -type TestShard struct { - addr string -} - -func (s *TestShard) Equals(other sharder.Shard) bool { return true } -func (s *TestShard) GetAddress() string { return s.addr } - func TestEnvironmentCache(t *testing.T) { t.Run("calls getFn on cache miss", func(t *testing.T) { cache := newEnvironmentCache(time.Second, func(key string) (string, error) { @@ -612,3 +605,119 @@ func TestGRPCHealthProbeWatch(t *testing.T) { sentMessage := mockServer.GetSentMessages()[0] assert.Equal(t, grpc_health_v1.HealthCheckResponse_SERVING, sentMessage.Status) } + +func TestGetDatasetFromRequest(t *testing.T) { + testCases := []struct { + name string + datasetName string + expectedDatasetName string + expectedError error + }{ + { + name: "empty dataset name", + datasetName: "", + expectedError: fmt.Errorf("missing dataset name"), + }, + { + name: "dataset name with invalid URL encoding", + datasetName: "foo%2", + expectedError: url.EscapeError("%2"), + }, + { + name: "normal dataset name", + datasetName: "foo", + expectedDatasetName: "foo", + }, + { + name: "dataset name with numbers", + datasetName: "foo123", + expectedDatasetName: "foo123", + }, + { + name: "dataset name with hyphen", + datasetName: "foo-bar", + expectedDatasetName: "foo-bar", + }, + { + name: "dataset name with underscore", + datasetName: "foo_bar", + expectedDatasetName: "foo_bar", + }, + { + name: "dataset name with tilde", + datasetName: "foo~bar", + expectedDatasetName: "foo~bar", + }, + { + name: "dataset name with period", + datasetName: "foo.bar", + expectedDatasetName: "foo.bar", + }, + { + name: "dataset name with URL encoded hyphen", + datasetName: "foo%2Dbar", + expectedDatasetName: "foo-bar", + }, + { + name: "dataset name with URL encoded underscore", + datasetName: "foo%5Fbar", + expectedDatasetName: "foo_bar", + }, + { + name: "dataset name with URL encoded tilde", + datasetName: "foo%7Ebar", + expectedDatasetName: "foo~bar", + }, + { + name: "dataset name with URL 
encoded period", + datasetName: "foo%2Ebar", + expectedDatasetName: "foo.bar", + }, + { + name: "dataset name with URL encoded forward slash", + datasetName: "foo%2Fbar", + expectedDatasetName: "foo/bar", + }, + { + name: "dataset name with URL encoded colon", + datasetName: "foo%3Abar", + expectedDatasetName: "foo:bar", + }, + { + name: "dataset name with URL encoded square brackets", + datasetName: "foo%5Bbar%5D", + expectedDatasetName: "foo[bar]", + }, + { + name: "dataset name with URL encoded parentheses", + datasetName: "foo%28bar%29", + expectedDatasetName: "foo(bar)", + }, + { + name: "dataset name with URL encoded curly braces", + datasetName: "foo%7Bbar%7D", + expectedDatasetName: "foo{bar}", + }, + { + name: "dataset name with URL encoded percent", + datasetName: "foo%25bar", + expectedDatasetName: "foo%bar", + }, + { + name: "dataset name with URL encoded ampersand", + datasetName: "foo%26bar", + expectedDatasetName: "foo&bar", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + req, _ := http.NewRequest("GET", "/1/events/dataset", nil) + req = mux.SetURLVars(req, map[string]string{"datasetName": tc.datasetName}) + + dataset, err := getDatasetFromRequest(req) + assert.Equal(t, tc.expectedError, err) + assert.Equal(t, tc.expectedDatasetName, dataset) + }) + } +} diff --git a/rules.json b/rules.json new file mode 100644 index 0000000000..acdcb9c29a --- /dev/null +++ b/rules.json @@ -0,0 +1 @@ +{"RulesVersion":2,"Samplers":{"__default__":{"DeterministicSampler":{"SampleRate":1}},"TheNewWorld":{"EMADynamicSampler":{"GoalSampleRate":6,"AdjustmentInterval":"2s","Weight":0.5,"AgeOutValue":0.5,"BurstMultiple":200,"BurstDetectionDelay":30,"FieldList":["error","url","status"],"UseTraceLength":false,"MaxKeys":2000}}}} diff --git a/rules.md b/rules.md index 629f1344c7..7cef55f55a 100644 --- a/rules.md +++ b/rules.md @@ -1,7 +1,7 @@ # Honeycomb Refinery Rules Documentation This is the documentation for the rules configuration for Honeycomb's Refinery. -It was automatically generated on 2023-12-04 at 18:06:13 UTC. +It was automatically generated on 2024-09-03 at 19:48:56 UTC. ## The Rules file @@ -70,6 +70,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. Type: `int` @@ -92,6 +94,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. Type: `int` @@ -100,6 +104,7 @@ Type: `int` The duration after which the Dynamic Sampler should reset its internal counters. It should be specified as a duration string. For example, "30s" or "1m". +Defaults to "30s". Type: `duration` @@ -131,7 +136,7 @@ Type: `int` ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. 
-The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. Type: `bool` @@ -158,6 +163,8 @@ It indicates a ratio, where one sample trace is kept for every N traces seen. For example, a `SampleRate` of `30` will keep 1 out of every 30 traces. The choice on whether to keep any specific trace is random, so the rate is approximate. The sample rate is calculated from the trace ID, so all spans with the same trace ID will be sampled or not sampled together. +A `SampleRate` of `1` or less will keep all traces. +Specifying this value is required. Type: `int` @@ -165,7 +172,8 @@ Type: `int` The duration after which the EMA Dynamic Sampler should recalculate its internal counters. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `15s`. Type: `duration` @@ -175,6 +183,7 @@ The weight to use when calculating the EMA. It should be a number between `0` and `1`. Larger values weight the average more toward recent observations. In other words, a larger weight will cause sample rates more quickly adapt to traffic patterns, while a smaller weight will result in sample rates that are less sensitive to bursts or drops in traffic and thus more consistent over time. +The default value is `0.5`. Type: `float` @@ -183,7 +192,7 @@ Type: `float` Indicates the threshold for removing keys from the EMA. The EMA of any key will approach `0` if it is not repeatedly observed, but will never truly reach it, so this field determines what constitutes "zero". Keys with averages below this threshold will be removed from the EMA. -Default is the same as `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. +Default is the value of `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. This value should generally be less than (<=) `Weight`, unless you have very specific reasons to set it higher. Type: `float` @@ -232,7 +241,7 @@ Type: `int` ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. Type: `bool` @@ -264,7 +273,7 @@ Type: `int` Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the number of instances in the cluster. -If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`. +If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. Type: `bool` @@ -272,6 +281,7 @@ Type: `bool` `InitialSampleRate` is the sample rate to use during startup, before the sampler has accumulated enough data to calculate a reasonable throughput. 
This is mainly useful in situations where unsampled throughput is high enough to cause problems. +Default value is `10`. Type: `int` @@ -279,7 +289,8 @@ Type: `int` The duration after which the EMA Dynamic Sampler should recalculate its internal counters. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `15s`. Type: `duration` @@ -289,6 +300,7 @@ The weight to use when calculating the EMA. It should be a number between `0` and `1`. Larger values weight the average more toward recent observations. In other words, a larger weight will cause sample rates more quickly adapt to traffic patterns, while a smaller weight will result in sample rates that are less sensitive to bursts or drops in traffic and thus more consistent over time. +The default value is `0.5`. Type: `float` @@ -297,7 +309,7 @@ Type: `float` Indicates the threshold for removing keys from the EMA. The EMA of any key will approach `0` if it is not repeatedly observed, but will never truly reach it, so this field determines what constitutes "zero". Keys with averages below this threshold will be removed from the EMA. -Default is the same as `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. +Default is the value of `Weight`, as this prevents a key with the smallest integer value (1) from being aged out immediately. This value should generally be less than (<=) `Weight`, unless you have very specific reasons to set it higher. Type: `float` @@ -346,7 +358,7 @@ Type: `int` ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. Type: `bool` @@ -370,7 +382,7 @@ The Windowed Throughput Sampler resolves this by introducing two different, tuna recomputing sampling rate. A standard configuration would be to set `UpdateFrequency` to `1s` and `LookbackFrequency` to `30s`. In this configuration, for every second, we lookback at the last 30 seconds of data in order to compute the new sampling rate. -The actual sampling rate computation is nearly identical to the original Throughput Sampler, but this variant has better support for floating point numbers. +The actual sampling rate computation is nearly identical to the original Throughput Sampler, but this variant has better support for floating point numbers and does a better job with less-common keys. ### `GoalThroughputPerSec` @@ -385,7 +397,7 @@ Type: `int` Indicates whether to use the cluster size to calculate the goal throughput. If `true`, then the goal throughput will be divided by the number of instances in the cluster. -If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`. +If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`. Type: `bool` @@ -393,7 +405,8 @@ Type: `bool` The duration between sampling rate computations. It should be specified as a duration string. -For example, "30s" or "1m". +For example, `30s` or `1m`. +Defaults to `1s`. 
Type: `duration` @@ -433,7 +446,7 @@ Type: `int` ### `UseTraceLength` Indicates whether to include the trace length (number of spans in the trace) as part of the key. -The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`. +The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default). If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`. Type: `bool` @@ -457,7 +470,7 @@ Type: `objectarray` ### `CheckNestedFields` Indicates whether to expand nested JSON when evaluating rules. -If false, nested JSON will be treated as a string. +If false (the default), nested JSON will be treated as a string. If `true`, nested JSON will be expanded into a `map[string]interface{}` and the value of the field will be the value of the nested field. For example, if you have a field called `http.request.headers` and you want to check the value of the `User-Agent` header, then you would set this to `true` and use `http.request.headers.User-Agent` as the field name in your rule. This is a computationally expensive option and may cause performance problems if you have a large number of spans with nested JSON. @@ -514,8 +527,8 @@ Type: `objectarray` ### `Scope` Controls the scope of the rule evaluation. -If set to "trace" (the default), then each condition can apply to any span in the trace independently. -If set to "span", then all of the conditions in the rule will be evaluated against each span in the trace and the rule only succeeds if all of the conditions match on a single span together. +If set to `trace` (the default), then each condition can apply to any span in the trace independently. +If set to `span`, then all of the conditions in the rule will be evaluated against each span in the trace and the rule only succeeds if all of the conditions match on a single span together. Type: `string` @@ -531,27 +544,53 @@ If there are no conditions, then the rule will always match. ### `Field` The field to check. -This can be any field in the trace. +This can name any field in the trace. If the field is not present, then the condition will not match. The comparison is case-sensitive. +The field can also include a prefix that changes the span used for evaluation of the field. +The only prefix currently supported is `root`, as in `root.http.status`. +Specifying `root.` causes the condition to be evaluated against the root span. +For example, if the `Field` is `root.url`, then the condition will be processed using the url field from the root span. +The setting `Scope: span` for a rule does not change the meaning of this prefix -- the condition is still evaluated on the root span and is treated as if it were part of the span being processed. +When using the `root.` prefix on a field with a `not-exists` operator, include the `has-root-span: true` condition in the rule. +The `not-exists` condition on a `root.`-prefixed field will evaluate to false if the existence of the root span is not checked and the root span does not exist. +The primary reason a root span is not present on a trace when a sampling decision is being made is when the root span takes longer to complete than the configured TraceTimeout. Type: `string` +### `Fields` + +An array of field names to check. +These can name any field in the trace. 
+The fields are checked in the order defined here, and the first named field that contains a value will be used for the condition.
+Only the first populated field will be used, even if the condition fails.
+If a `root.` prefix is present on a field, but the root span is not on the trace, that field will be skipped.
+If none of the fields are present, then the condition will not match.
+The comparison is case-sensitive.
+All fields are checked as individual fields before any of them are checked as nested fields (see `CheckNestedFields`).
+
+Type: `stringarray`
+
 ### `Operator`
 
 The comparison operator to use.
 String comparisons are case-sensitive.
+For most cases, use negative operators (`!=`, `does-not-contain`, `not-exists`, and `not-in`) in a rule with a scope of "span".
+WARNING: Rules can have `Scope: trace` or `Scope: span`.
+Using a negative operator with `Scope: trace` will cause the condition to be true if **any** single span in the entire trace matches.
+Use `Scope: span` with negative operators.
 
 Type: `string`
 
-- Options: `=`, `!=`, `>`, `<`, `>=`, `<=`, `starts-with`, `contains`, `does-not-contain`, `exists`, `not-exists`, `has-root-span`
+- Options: `=`, `!=`, `>`, `<`, `>=`, `<=`, `starts-with`, `contains`, `does-not-contain`, `exists`, `not-exists`, `has-root-span`, `matches`, `in`, `not-in`
 
 ### `Value`
 
 The value to compare against.
 If `Datatype` is not specified, then the value and the field will be compared based on the type of the field.
+The `in` and `not-in` operators can accept a list of values, which should all be of the same datatype.
 
-Type: `anyscalar`
+Type: `sliceorscalar`
 
 ### `Datatype`
 
@@ -559,6 +598,7 @@ The datatype to use when comparing the value and the field.
 If `Datatype` is specified, then both values will be converted (best-effort) to that type and then compared.
 Errors in conversion will result in the comparison evaluating to `false`.
 This is especially useful when a field like `http status code` may be rendered as strings by some environments and as numbers or booleans by others.
+The best practice is to always specify `Datatype`; this avoids ambiguity, allows for more accurate comparisons, and offers a minor performance improvement.
 
 Type: `string`
 
@@ -588,7 +628,7 @@ Type: `int`
 
 Indicates whether to use the cluster size to calculate the goal throughput.
 If `true`, then the goal throughput will be divided by the number of instances in the cluster.
-If `false`, then the goal throughput will be the value specified in `GoalThroughputPerSec`.
+If `false` (the default), then the goal throughput will be the value specified in `GoalThroughputPerSec`.
 
 Type: `bool`
 
@@ -597,6 +637,7 @@ Type: `bool`
 
 The duration after which the Dynamic Sampler should reset its internal counters.
 It should be specified as a duration string.
 For example, "30s" or "1m".
+Defaults to "30s".
 
 Type: `duration`
 
@@ -628,7 +669,7 @@ Type: `int`
 
 ### `UseTraceLength`
 
 Indicates whether to include the trace length (number of spans in the trace) as part of the key.
-The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false`.
+The number of spans is exact, so if there are normally small variations in trace length, we recommend setting this field to `false` (the default).
 If your traces are consistent lengths and changes in trace length is a useful indicator to view in Honeycomb, then set this field to `true`.
Type: `bool` diff --git a/rules_complete.yaml b/rules_complete.yaml index 092da456a8..1a3c0daf70 100644 --- a/rules_complete.yaml +++ b/rules_complete.yaml @@ -66,10 +66,20 @@ Samplers: - Field: http.route Operator: = Value: /health-check - - Name: keep slow 500 errors + - Name: drop everything from specific services + Drop: true + Conditions: + - Field: service.name + Operator: in + Value: + - noisy-service + - overly-chatty-service17 + - Name: keep slow 500 errors across semantic conventions SampleRate: 1 Conditions: - - Field: status_code + - Fields: + - http.status_code + - http.response.status_code Operator: = Value: 500 Datatype: int @@ -77,9 +87,11 @@ Samplers: Operator: '>=' Value: 1000 Datatype: float - - Name: dynamically sample 200 responses + - Name: dynamically sample 200 responses across semantic conventions Conditions: - - Field: status_code + - Fields: + - http.status_code + - http.response.status_code Operator: = Value: 200 Datatype: int @@ -104,6 +116,8 @@ Samplers: - Name: drop incomplete traces from the buggy service Drop: true Conditions: + # NOTE: has-root-span only checks if the trace has a root span. + # It does NOT act as a shorthand for evaluating the root span; use the `root.` construct for that. - Operator: has-root-span Value: false - Field: service.name diff --git a/rules_conditions.md b/rules_conditions.md new file mode 100644 index 0000000000..35e043508d --- /dev/null +++ b/rules_conditions.md @@ -0,0 +1,279 @@ +# Refinery Conditions + +## Overview + +Refinery rules are described as a series of conditions. +Each condition is composed from a combination of these named parameters: + +- `Field` (or `Fields`) +- `Operator` +- `Value` +- `Datatype` + +The `Operator` is a required parameter, and controls which of the condition's other elements are required and which are optional. + +## `Field` + +The `Field` parameter points to a specific named element in the trace data. +If a `Field` is named within a span, then it `exists`. +A specific `Field` may or may not exist on any specific span in a trace. +It might not even exist within a trace at all. + +When a `Field` is absent in all spans within a trace, the associated rule does not apply to that trace. + +A `Field` is always a single string. +A `Field` is always matched by exact comparison. +No transformations for case or punctuation are performed. + +### Example use of `Field` + +```yaml +Conditions: + Field: http.route + Operator: = + Value: /health-check +``` +### Leveraging Special Refinery Telemetry in Root Spans + +Some Refinery configuration options introduce special fields that are added to telemetry. + +For example, when `AddCountsToRoot` is enabled, `meta.span_count` is added to all root spans, and allows for the creation of rule conditions based on span counts. +In this `meta.span_count` example, the Refinery rule applies to traces with more than 300 spans. + +```yaml +Conditions: + Field: "meta.span_count" + Operator: ">" + Value: 300 + Datatype: int +``` + +For details about all supported special fields, check out our [Refinery Telemetry documentation](https://docs.honeycomb.io/manage-data-volume/refinery/configuration/#refinery-telemetry). + +### Virtual Fields + +To handle specific scenarios when rules are evaluated before the arrival of root spans, Refinery introduces the concept of virtual fields. These fields provide metadata about traces that have timed out while waiting for their root span. 
+
+This example shows a rule that drops traces containing more than 1000 spans by using the virtual field `?.NUM_DESCENDANTS`.
+
+```yaml
+Rules:
+  - Name: Drop any big traces
+    Drop: true
+    Conditions:
+      Field: "?.NUM_DESCENDANTS"
+      Operator: ">"
+      Value: 1000
+      Datatype: int
+```
+
+#### Supported Virtual Fields
+
+All virtual fields are prefixed with `?.` to distinguish them from normal fields.
+Currently only one virtual field is supported.
+
+- `?.NUM_DESCENDANTS`: the current number of child elements contained within a trace.
+
+## `Fields`
+
+The `Fields` parameter allows a single rule to apply to the first match among multiple field names.
+It is typically used when telemetry field names are being changed.
+It is exactly equivalent to `Field`, except that it must be expressed as an array of strings instead of a single value.
+The array defines a sequence of `Field` names that are checked in order for each span being considered.
+The first field that `exists` on any given span is used for the condition.
+
+### Example use of `Fields`
+
+This example shows how one might write a rule designed to cope with an expected name change of a key field.
+
+```yaml
+Conditions:
+  Fields:
+    - http.status
+    - http.request.status
+  Operator: =
+  Value: 200
+  Datatype: int
+```
+
+## Using a Prefix to Identify a Field in a Related Span
+
+Fields can contain a span selection prefix.
+Today, the only prefix supported is `root`.
+This prefix causes the root span to be searched for the specified field, rather than the span being evaluated.
+
+```yaml
+Rules:
+  - Name: limit by root span context
+    Conditions:
+      Field: "root.http.status"
+      Operator: =
+      Value: "500"
+      Datatype: string
+```
+
+## `Operator`
+
+The `Operator` parameter controls how rules are evaluated.
+Because YAML treats certain characters like less-than (`<`) specially, it is good practice to always enclose the basic comparison operators in single quotes (like `'<'`).
+
+The `Operator` may be one of the following:
+
+### `'='`
+
+Basic comparison -- `equals`.
+The result is true if the value of the named `Field` is equal to the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+### `'!='`
+
+Basic comparison -- `not equals`.
+The result is true if the value of the named `Field` is not equal to the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+For most cases, use `'!='` in a rule with a scope of "span".
+WARNING: Rules can have `Scope: trace` or `Scope: span`; `'!='` used with `Scope: trace` will be true if **any** single span in the entire trace matches the negative condition.
+This is almost never desired behavior.
+
+### `'<'`
+
+Basic comparison -- `less than`.
+The result is true if the value of the named `Field` is less than the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+### `'<='`
+
+Basic comparison -- `less than or equal to`.
+The result is true if the value of the named `Field` is less than or equal to the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+### `'>'`
+
+Basic comparison -- `greater than`.
+The result is true if the value of the named `Field` is greater than the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+### `'>='`
+
+Basic comparison -- `greater than or equal to`.
+The result is true if the value of the named `Field` is greater than or equal to the `Value` specified.
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+### `starts-with`
+
+Tests if the span value named by the `Field` begins with the text specified in the `Value` parameter.
+Comparisons are case-sensitive and exact.
+
+Values are always coerced to strings -- the `Datatype` parameter is ignored.
+
+### `contains`
+
+Tests if the span value named by the `Field` contains the text specified in the `Value` parameter.
+Comparisons are case-sensitive and exact.
+
+Values are always coerced to strings -- the `Datatype` parameter is ignored.
+
+### `does-not-contain`
+
+Tests if the span value named by the `Field` does not contain the text specified in the `Value` parameter.
+Comparisons are case-sensitive and exact.
+
+Values are always coerced to strings -- the `Datatype` parameter is ignored.
+
+### `in`
+
+The `Value` parameter should be a list of items.
+
+Tests if the span value named by the `Field` occurs exactly within the list specified in the `Value` parameter.
+Comparisons are exact. For strings, comparisons are also case-sensitive.
+
+### `not-in`
+
+The `Value` parameter should be a list of items.
+
+Tests if the span value named by the `Field` does not occur exactly within the list specified in the `Value` parameter.
+Comparisons are exact. For strings, comparisons are also case-sensitive.
+
+For most cases, use negative operators (`!=`, `does-not-contain`, `not-exists`, and `not-in`) in a rule with a scope of "span".
+WARNING: Rules can have `Scope: trace` or `Scope: span`.
+Using a negative operator with `Scope: trace` will cause the condition to be true if **any** single span in the entire trace matches.
+Use `Scope: span` with negative operators.
+
+### `exists`
+
+Tests if the specified span contains the field named by the `Field` parameter, without considering its value.
+
+Both the `Value` and the `Datatype` parameters are ignored.
+
+### `not-exists`
+
+Tests if the specified span does not contain the field named by the `Field` parameter, without considering its value.
+
+Both the `Value` and the `Datatype` parameters are ignored.
+
+For most cases, use `not-exists` in a rule with a scope of "span".
+WARNING: Rules can have `Scope: trace` or `Scope: span`; `not-exists` used with `Scope: trace` will be true if **any** single span in the entire trace matches the negative condition.
+This is almost never desired behavior.
+
+### `has-root-span`
+
+Tests if the trace as a whole has a root span.
+
+The `Value` parameter can either be `true` or `false`.
+
+NOTE: `has-root-span` does not check if a given span **is** a root span;
+it checks if the containing trace **has** a root span.
+
+### `matches`
+
+Tests if the span value specified by the `Field` parameter matches the regular expression specified by the `Value` parameter.
+The regular expression grammar used is the syntax used by the Go programming language.
+It is documented [here](https://pkg.go.dev/regexp/syntax).
+
+For clarity, regular expressions in YAML should usually be quoted with single quotes (`'`).
+This is because this form is unambiguous and does not process escape sequences, and thus regular expression character classes like `\d` for digits can be used directly.
+For example, an expression to match arbitrary strings of digits would be `'\d+'`.
+
+Sometimes double-quoted (`"`) strings are required in order to express patterns containing less common characters.
+These use escape sequences beginning with a backslash (`\`).
+This implies that backslashes intended for the regular expression will have to be doubled.
+The same expression as above using double quotes looks like this: `"\\d+"`.
+
+Example:
+```yaml
+  RulesBasedSampler:
+    Rules:
+      - Name: Drop traces for any path element starting with /health or /status
+        Conditions:
+          - Field: http.target
+            Operator: matches
+            Value: '/(health|status)\w+'
+```
+
+Values are always coerced to strings -- the `Datatype` parameter is ignored.
+
+## `Value`
+
+The `Value` parameter can be any value of a supported type.
+Its meaning and interpretation depends on the `Operator` in use.
+
+See [`Datatype`](#datatype) for how different datatypes are handled.
+
+## `Datatype`
+
+The `Datatype` parameter controls the type of the values used when evaluating rules for the basic comparison operators.
+When `Datatype` is present, before evaluating the `Operator`, both the span value and the `Value` parameter are coerced (converted if necessary) to the specified format, and the comparison takes place as appropriate for that datatype.
+
+There are 4 possibilities:
+
+- `string` -- The comparison uses string operations. (For example, "2" is considered to be greater than "10" because '2' > '1'.)
+- `int` -- The comparison uses integers. (1.5 == 1 because `1.5` gets converted to `1`.)
+- `float` -- The comparison uses floating point values.
+- `bool` -- The comparison tries to convert values to boolean. Note that because of the semantics of YAML, the `Value` parameter will interpret not only `true/false` but also all of `yes/no`, `y/n`, and `on/off` as boolean values. Span values, for historical reasons, interpret `true/false` and `1/0` as boolean, and all other values are considered to be `false`.
+
+If the `Datatype` parameter is not specified, then Refinery determines the type of the incoming span value.
+If the value is numeric or boolean, it attempts to convert the `Value` parameter to the same type.
+If the span value is a string, the `Value` parameter must also be a string or the comparison will fail.
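+
+As a short illustration of `Datatype` in practice (the field name and values here are examples, not required names), a condition comparing a status code numerically might look like this:
+
+```yaml
+Conditions:
+  - Field: http.status_code
+    Operator: '>='
+    Value: "500"
+    Datatype: int
+```
+
+With `Datatype: int`, both the span value and the quoted `Value` above are coerced to integers before comparison, so a span whose status code arrives as the string `"503"` still matches.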
+ + diff --git a/sample/dynamic.go b/sample/dynamic.go index 5021940ee5..7367e5865b 100644 --- a/sample/dynamic.go +++ b/sample/dynamic.go @@ -89,6 +89,7 @@ func (d *DynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool case "counter": delta := val - d.lastMetrics[name] d.Metrics.Count(name, delta) + d.lastMetrics[name] = val case "gauge": d.Metrics.Gauge(name, val) } diff --git a/sample/dynamic_ema.go b/sample/dynamic_ema.go index 9935902c97..7fd04d2780 100644 --- a/sample/dynamic_ema.go +++ b/sample/dynamic_ema.go @@ -97,6 +97,7 @@ func (d *EMADynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b case "counter": delta := val - d.lastMetrics[name] d.Metrics.Count(name, delta) + d.lastMetrics[name] = val case "gauge": d.Metrics.Gauge(name, val) } diff --git a/sample/dynamic_test.go b/sample/dynamic_test.go index 94823044f7..358d8d6f0e 100644 --- a/sample/dynamic_test.go +++ b/sample/dynamic_test.go @@ -19,7 +19,8 @@ func TestDynamicAddSampleRateKeyToTrace(t *testing.T) { sampler := &DynamicSampler{ Config: &config.DynamicSamplerConfig{ - FieldList: []string{"http.status_code"}, + FieldList: []string{"http.status_code", "root.service_name", "root.url"}, + SampleRate: 1, }, Logger: &logger.NullLogger{}, Metrics: &metrics, @@ -27,17 +28,32 @@ func TestDynamicAddSampleRateKeyToTrace(t *testing.T) { trace := &types.Trace{} for i := 0; i < spanCount; i++ { + if i == spanCount-1 { + trace.RootSpan = &types.Span{ + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + "service_name": "test", + }, + }, + } + } trace.AddSpan(&types.Span{ Event: types.Event{ Data: map[string]interface{}{ "http.status_code": "200", + "url": "/test", }, }, }) } sampler.Start() - sampler.GetSampleRate(trace) + rate, keep, reason, key := sampler.GetSampleRate(trace) spans := trace.GetSpans() assert.Len(t, spans, spanCount, "should have the same number of spans as input") + assert.Equal(t, uint(1), rate) + assert.True(t, keep) + assert.Equal(t, "dynamic", reason) + assert.Equal(t, "200•,test,", key) } diff --git a/sample/ema_throughput.go b/sample/ema_throughput.go index 1ef22745e6..7d27983c67 100644 --- a/sample/ema_throughput.go +++ b/sample/ema_throughput.go @@ -114,6 +114,7 @@ func (d *EMAThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, kee case "counter": delta := val - d.lastMetrics[name] d.Metrics.Count(name, delta) + d.lastMetrics[name] = val case "gauge": d.Metrics.Gauge(name, val) } diff --git a/sample/rules.go b/sample/rules.go index b2a17bf879..c1795fe036 100644 --- a/sample/rules.go +++ b/sample/rules.go @@ -20,18 +20,20 @@ type RulesBasedSampler struct { prefix string } +const RootPrefix = "root." 
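+// Fields that begin with RootPrefix (for example, "root.http.status") are
+// resolved against the trace's root span rather than the span being evaluated.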
+ func (s *RulesBasedSampler) Start() error { s.Logger.Debug().Logf("Starting RulesBasedSampler") defer func() { s.Logger.Debug().Logf("Finished starting RulesBasedSampler") }() s.prefix = "rulesbased_" s.Metrics.Register(s.prefix+"num_dropped", "counter") + s.Metrics.Register(s.prefix+"num_dropped_by_drop_rule", "counter") s.Metrics.Register(s.prefix+"num_kept", "counter") s.Metrics.Register(s.prefix+"sample_rate", "histogram") s.samplers = make(map[string]Sampler) - // Check if any rule has a downstream sampler and create it for _, rule := range s.Config.Rules { for _, cond := range rule.Conditions { if err := cond.Init(); err != nil { @@ -42,6 +44,7 @@ func (s *RulesBasedSampler) Start() error { continue } } + // Check if any rule has a downstream sampler and create it if rule.Sampler != nil { var sampler Sampler if rule.Sampler.DynamicSampler != nil { @@ -54,6 +57,8 @@ func (s *RulesBasedSampler) Start() error { sampler = &EMAThroughputSampler{Config: rule.Sampler.EMAThroughputSampler, Logger: s.Logger, Metrics: s.Metrics} } else if rule.Sampler.WindowedThroughputSampler != nil { sampler = &WindowedThroughputSampler{Config: rule.Sampler.WindowedThroughputSampler, Logger: s.Logger, Metrics: s.Metrics} + } else if rule.Sampler.DeterministicSampler != nil { + sampler = &DeterministicSampler{Config: rule.Sampler.DeterministicSampler, Logger: s.Logger, Metrics: s.Metrics} } else { s.Logger.Debug().WithFields(map[string]interface{}{ "rule_name": rule.Name, @@ -127,6 +132,10 @@ func (s *RulesBasedSampler) GetSampleRate(trace *types.Trace) (rate uint, keep b s.Metrics.Increment(s.prefix + "num_kept") } else { s.Metrics.Increment(s.prefix + "num_dropped") + if rule.Drop { + // If we dropped because of an explicit drop rule, then increment that too. + s.Metrics.Increment(s.prefix + "num_dropped_by_drop_rule") + } } logger.WithFields(map[string]interface{}{ "rate": rate, @@ -157,26 +166,31 @@ func ruleMatchesTrace(t *types.Trace, rule *config.RulesBasedSamplerRule, checkN matched++ continue } else { - // if HasRootSpan is one of the conditions and it didn't match, + // if HasRootSpan is one of the conditions, and it didn't match, // there's no need to check the rest of the conditions. return false } + } span: for _, span := range t.GetSpans() { - value, exists := extractValueFromSpan(span, condition, checkNestedFields) + value, exists, checkedOnlyRoot := extractValueFromSpan(t, span, condition, checkNestedFields) if condition.Matches == nil { if conditionMatchesValue(condition, value, exists) { matched++ break span } - continue } else if condition.Matches(value, exists) { matched++ break span } - + if checkedOnlyRoot { + // if we only checked the root span and it didn't match, + // there's no need to check the rest of the spans; + // they can't possibly match and we can end early. + break span + } } } return matched == len(rule.Conditions) @@ -192,18 +206,34 @@ func ruleMatchesSpanInTrace(trace *types.Trace, rule *config.RulesBasedSamplerRu ruleMatched := true for _, condition := range rule.Conditions { // whether this condition is matched by this span. 
- value, exists := extractValueFromSpan(span, condition, checkNestedFields) + value, exists, checkedOnlyRoot := extractValueFromSpan(trace, span, condition, checkNestedFields) if condition.Matches == nil { if !conditionMatchesValue(condition, value, exists) { ruleMatched = false - break // if any condition fails, we can't possibly succeed, so exit inner loop early + if checkedOnlyRoot { + // if we only checked the root span and it didn't match, + // there's no need to check the rest of the spans; + // they can't possibly match and we can end early. + return false + } + // if any condition fails, we can't possibly succeed, + // so exit inner loop early + break } } if condition.Matches != nil { if !condition.Matches(value, exists) { ruleMatched = false - break // if any condition fails, we can't possibly succeed, so exit inner loop early + if checkedOnlyRoot { + // if we only checked the root span and it didn't match, + // there's no need to check the rest of the spans; + // they can't possibly match and we can end early. + return false + } + // if any condition fails, we can't possibly succeed, + // so exit inner loop early + break } } } @@ -218,21 +248,71 @@ func ruleMatchesSpanInTrace(trace *types.Trace, rule *config.RulesBasedSamplerRu return false } -func extractValueFromSpan(span *types.Span, condition *config.RulesBasedSamplerCondition, checkNestedFields bool) (interface{}, bool) { +// extractValueFromSpan extracts the `value` found at the first of the given condition's fields found on the input `span`. +// It returns the extracted `value` and an `exists` boolean indicating whether any of the condition's fields are present +// on the input span. +// +// We need to check the fields in order; if we find a match using 'root.' we +// can short-circuit the rest of the spans because they'll all return the same +// value. But if we check a non-root value first, we need to keep checking all +// the spans to see if any of them match. +func extractValueFromSpan( + trace *types.Trace, + span *types.Span, + condition *config.RulesBasedSamplerCondition, + checkNestedFields bool) (value interface{}, exists bool, checkedOnlyRoot bool) { + // start with the assumption that we only checked the root span + checkedOnlyRoot = true + + // If the condition is a descendant count, we extract the count from trace and return it. + // Note that this is the equivalent of checking the root span's descendant count, so + // we don't need to check the other spans. + if f, ok := condition.GetComputedField(); ok { + switch f { + case config.NUM_DESCENDANTS: + return int64(trace.DescendantCount()), true, true + } + } + + // we need to preserve which span we're actually using, since + // we might need to use the root span instead of the current span. + original := span // whether this condition is matched by this span. 
- value, exists := span.Data[condition.Field] + for _, field := range condition.Fields { + // always start with the original span + span = original + // check if rule uses root span context + if strings.HasPrefix(field, RootPrefix) { + // make sure root span exists + if trace.RootSpan != nil { + field = field[len(RootPrefix):] + // now we're using the root span + span = trace.RootSpan + } else { + // we wanted root span but this trace doesn't have one, so just skip it + continue + } + } else { + checkedOnlyRoot = false + } + + value, exists = span.Data[field] + if exists { + return value, exists, checkedOnlyRoot + } + } if !exists && checkNestedFields { jsonStr, err := json.Marshal(span.Data) if err == nil { - result := gjson.Get(string(jsonStr), condition.Field) - if result.Exists() { - value = result.String() - exists = true + for _, field := range condition.Fields { + result := gjson.Get(string(jsonStr), field) + if result.Exists() { + return result.String(), true, false + } } } } - - return value, exists + return nil, false, false } // This only gets called when we're using one of the basic operators, and diff --git a/sample/rules_test.go b/sample/rules_test.go index 208367d0ca..d4ec68bd32 100644 --- a/sample/rules_test.go +++ b/sample/rules_test.go @@ -9,11 +9,14 @@ import ( "github.com/honeycombio/refinery/metrics" "github.com/honeycombio/refinery/types" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type TestRulesData struct { - Rules *config.RulesBasedSamplerConfig - Spans []*types.Span + Rules *config.RulesBasedSamplerConfig + Spans []*types.Span + // Set to the matching rule's sample rate if the rule matches. + // Set to the default rate (1) if you expect no rule to match. ExpectedRate uint ExpectedKeep bool ExpectedName string @@ -685,13 +688,220 @@ func TestRules(t *testing.T) { ExpectedKeep: true, ExpectedRate: 99, }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "Search multiple fields (success)", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"test", "test2"}, + Operator: config.EQ, + Value: int(17), + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "test2": int64(17), + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 10, + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "Search multiple fields (fails)", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"test", "test2"}, + Operator: config.EQ, + Value: int(17), + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "test2": int64(16), + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedName: "no rule matched", + ExpectedRate: 1, + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "Multiple fields, multiple values (fails)", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"test", "test2"}, + Operator: config.EQ, + Value: int(17), + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "test": int64(2), + "test2": int64(17), + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedName: "no rule matched", + ExpectedRate: 1, + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: 
[]*config.RulesBasedSamplerRule{ + { + Name: "Check that the number of descendants is greater than 3", + SampleRate: 1, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: string(config.NUM_DESCENDANTS), + Operator: config.GT, + Value: int(3), + Datatype: "int", + }, + }, + Drop: true, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "54322", + "trace.parent_id": "54321", + "meta.span_count": int64(2), + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "654321", + "trace.parent_id": "54322", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "754321", + "trace.parent_id": "54322", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "754321", + "trace.parent_id": "54322", + }, + }, + }, + }, + ExpectedName: "Check that the number of descendants is greater than 3", + ExpectedKeep: false, + ExpectedRate: 1, + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "Check that the number of descendants is less than 3", + SampleRate: 1, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: string(config.NUM_DESCENDANTS), + Operator: config.LT, + Value: int(3), + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "54322", + "trace.parent_id": "54321", + "meta.span_count": int64(2), + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "654321", + "trace.parent_id": "54322", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "754321", + "trace.parent_id": "54322", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "trace.trace_id": "12345", + "trace.span_id": "754321", + "trace.parent_id": "54322", + }, + }, + }, + }, + ExpectedName: "no rule matched", + ExpectedKeep: true, + ExpectedRate: 1, + }, } for _, d := range data { for _, rule := range d.Rules.Rules { for _, cond := range rule.Conditions { err := cond.Init() - assert.NoError(t, err) + assert.NoError(t, err, "error in "+rule.Name) } } sampler := &RulesBasedSampler{ @@ -849,35 +1059,74 @@ func TestRulesWithNestedFields(t *testing.T) { ExpectedRate: 1, ExpectedName: "no rule matched", }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "nested fields", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"test.test1", "test.test2"}, + Operator: config.EQ, + Value: "a", + }, + }, + }, + }, + CheckNestedFields: true, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "test": map[string]interface{}{ + "test2": "a", + }, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 10, + }, } for _, d := range data { - sampler := &RulesBasedSampler{ - Config: d.Rules, - Logger: &logger.NullLogger{}, - Metrics: &metrics.NullMetrics{}, - } + t.Run(d.Rules.Rules[0].Name, func(t *testing.T) { + for _, rule := range d.Rules.Rules { + for _, cond := range rule.Conditions { + err := cond.Init() + assert.NoError(t, err, "error in "+rule.Name) + } + } + sampler := &RulesBasedSampler{ + Config: d.Rules, + 
Logger: &logger.NullLogger{}, + Metrics: &metrics.NullMetrics{}, + } - trace := &types.Trace{} + trace := &types.Trace{} - for _, span := range d.Spans { - trace.AddSpan(span) - } + for _, span := range d.Spans { + trace.AddSpan(span) + } - rate, keep, reason, key := sampler.GetSampleRate(trace) + rate, keep, reason, key := sampler.GetSampleRate(trace) - assert.Equal(t, d.ExpectedRate, rate, d.Rules) - name := d.ExpectedName - if name == "" { - name = d.Rules.Rules[0].Name - } - assert.Contains(t, reason, name) - assert.Equal(t, "", key) + assert.Equal(t, d.ExpectedRate, rate, d.Rules) + name := d.ExpectedName + if name == "" { + name = d.Rules.Rules[0].Name + } + assert.Contains(t, reason, name) + assert.Equal(t, "", key) - // we can only test when we don't expect to keep the trace - if !d.ExpectedKeep { - assert.Equal(t, d.ExpectedKeep, keep, d.Rules) - } + // we can only test when we don't expect to keep the trace + if !d.ExpectedKeep { + assert.Equal(t, d.ExpectedKeep, keep, d.Rules) + } + }) } } @@ -928,6 +1177,12 @@ func TestRulesWithDynamicSampler(t *testing.T) { } for _, d := range data { + for _, rule := range d.Rules.Rules { + for _, cond := range rule.Conditions { + err := cond.Init() + assert.NoError(t, err, "error in "+rule.Name) + } + } sampler := &RulesBasedSampler{ Config: d.Rules, Logger: &logger.NullLogger{}, @@ -1008,6 +1263,12 @@ func TestRulesWithEMADynamicSampler(t *testing.T) { } for _, d := range data { + for _, rule := range d.Rules.Rules { + for _, cond := range rule.Conditions { + err := cond.Init() + assert.NoError(t, err, "error in "+rule.Name) + } + } sampler := &RulesBasedSampler{ Config: d.Rules, Logger: &logger.NullLogger{}, @@ -1129,6 +1390,14 @@ func TestRuleMatchesSpanMatchingSpan(t *testing.T) { Logger: &logger.NullLogger{}, Metrics: &metrics.NullMetrics{}, } + for _, s := range sampler.samplers { + for _, rule := range s.(*RulesBasedSampler).Config.Rules { + for _, cond := range rule.Conditions { + err := cond.Init() + assert.NoError(t, err, "error in "+rule.Name) + } + } + } trace := &types.Trace{} @@ -1801,3 +2070,795 @@ func TestRegexpRules(t *testing.T) { }) } } + +func TestRulesWithDeterministicSampler(t *testing.T) { + data := []TestRulesData{ + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "downstream-deterministic", + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "rule_test", + Operator: "=", + Value: int64(1), + }, + }, + Sampler: &config.RulesBasedDownstreamSampler{ + DeterministicSampler: &config.DeterministicSamplerConfig{ + SampleRate: 10, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "rule_test": int64(1), + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "rule_test": int64(1), + "http.status_code": "200", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 10, + }, + } + + for _, d := range data { + for _, rule := range d.Rules.Rules { + for _, cond := range rule.Conditions { + err := cond.Init() + assert.NoError(t, err, "error in "+rule.Name) + } + } + sampler := &RulesBasedSampler{ + Config: d.Rules, + Logger: &logger.NullLogger{}, + Metrics: &metrics.NullMetrics{}, + } + + trace := &types.Trace{} + + for _, span := range d.Spans { + trace.AddSpan(span) + } + + sampler.Start() + rate, keep, reason, key := sampler.GetSampleRate(trace) + assert.Equal(t, "", key) + + assert.Equal(t, d.ExpectedRate, rate, d.Rules) + name := d.ExpectedName + if name == 
"" { + name = d.Rules.Rules[0].Name + } + assert.Contains(t, reason, name) + + // we can only test when we don't expect to keep the trace + if !d.ExpectedKeep { + assert.Equal(t, d.ExpectedKeep, keep, d.Rules) + } + + spans := trace.GetSpans() + assert.Len(t, spans, len(d.Spans), "should have the same number of spans as input") + } +} + +func TestRulesRootSpanContext(t *testing.T) { + data := []TestRulesData{ + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "root span matches", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Exists, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test1": 1, + "test2": 2.2, + "test3": "foo", + }, + }, + }, + }, + ExpectedKeep: false, + ExpectedRate: 10, + ExpectedName: "root span matches", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "root prefix condition matches, others don't", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Exists, + }, + { + Field: "anotherField", + Operator: config.EQ, + Value: "bar", + Datatype: "string", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test1": 1, + "test2": 2.2, + "test3": "foo", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "all conditions match", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Exists, + }, + { + Field: "anotherField", + Operator: config.GT, + Value: 1.0, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", + "anotherField": 3.5, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + "anotherField": 1.1, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test1": 1, + "test2": 2.2, + "test3": "foo", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 10, + ExpectedName: "all conditions match", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "two root conditions, only one matches", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Exists, + }, + { + Field: "root.nope", + Operator: config.Exists, + }, + { + Field: "anotherField", + Operator: config.LT, + Value: 100, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. 
+ Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", + "anotherField": 50, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + "anotherField": 10, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test1": 1, + "test2": 2.2, + "test3": "foo", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "two root conditions, only one matches, reversed", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "anotherField", + Operator: config.LT, + Value: 100, + }, + { + Field: "root.test", + Operator: config.Exists, + }, + { + Field: "root.nope", + Operator: config.Exists, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", + "anotherField": 50, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + "anotherField": 10, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test1": 1, + "test2": 2.2, + "test3": "foo", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "span matches, root does not match", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Contains, + Value: "foo", + Datatype: "string", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "nope", // I am the root span that does not match. + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test": "foo", // I am the span that almost matches, but I'm not root. + "test1": 1, + "test2": 2.2, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "root does not match, spans do not match", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Field: "root.test", + Operator: config.Contains, + Value: "foo", + Datatype: "string", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "nope", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "test": "nope", + "test1": 1, + "test2": 2.2, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "root doesn't match, next span doesn't match, third span matches", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"http.status_code", "root.http.status_code"}, + Operator: config.EQ, + Value: "500", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. 
+ Event: types.Event{ + Data: map[string]interface{}{ + "test": "nope", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "500", + }, + }, + }, + }, + ExpectedKeep: false, + ExpectedRate: 10, + ExpectedName: "root doesn't match, next span doesn't match, third span matches", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "root doesn't match, next span doesn't match, third span matches, reversed", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"root.http.status_code", "http.status_code"}, + Operator: config.EQ, + Value: "500", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "123testABC", // I am root. + Event: types.Event{ + Data: map[string]interface{}{ + "test": "nope", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "500", + }, + }, + }, + }, + ExpectedKeep: false, + ExpectedRate: 10, + ExpectedName: "root doesn't match, next span doesn't match, third span matches, reversed", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "no root span", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"root.http.status_code"}, + Operator: config.EQ, + Value: "500", + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "200", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "http.status_code": "500", + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "no root span with multiple fields", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"root.foo", "foo"}, + Operator: config.EQ, + Value: 100.01, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": 100.01, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": 99.1, + "bar": 100.10, + }, + }, + }, + }, + ExpectedKeep: false, + ExpectedRate: 10, + ExpectedName: "no root span with multiple fields", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "no root span, checking other spans in trace", + SampleRate: 10, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Fields: []string{"root.foo", "foo"}, + Operator: config.EQ, + Value: true, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": true, + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": true, + }, + }, + }, + }, + ExpectedKeep: false, + ExpectedRate: 10, + ExpectedName: "no root span, checking other spans in trace", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "NotExists🦶🔫/root span exists, rule uses has-root-span guard, field is missing", + SampleRate: 2, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Operator: config.HasRootSpan, + Value: true, + }, + { + Field: "root.service.name", + Operator: 
config.NotExists, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "abc123", + Event: types.Event{ + Data: map[string]interface{}{ + "name.of.service": "is not service.name!", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": true, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 2, + ExpectedName: "NotExists🦶🔫/root span exists, rule uses has-root-span guard, field is missing", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "NotExists🦶🔫/root span exists, rule uses has-root-span guard, field is present", + SampleRate: 2, + Conditions: []*config.RulesBasedSamplerCondition{ + { + Operator: config.HasRootSpan, + Value: true, + }, + { + Field: "root.service.name", + Operator: config.NotExists, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + TraceID: "abc123", + Event: types.Event{ + Data: map[string]interface{}{ + "service.name": "totally present", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": true, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 1, + ExpectedName: "no rule matched", + }, + { + Rules: &config.RulesBasedSamplerConfig{ + Rules: []*config.RulesBasedSamplerRule{ + { + Name: "NotExists🦶🔫/no root span, no has-root-span guard (not recommended!)", + SampleRate: 2, + Conditions: []*config.RulesBasedSamplerCondition{ + // note: no HasRootSpan guard condition to confirm presence of a root span! + { + Field: "root.service.name", + Operator: config.NotExists, + }, + }, + }, + }, + }, + Spans: []*types.Span{ + { + Event: types.Event{ + Data: map[string]interface{}{ + "service.name": "no trace id on this test span, so it's not root", + }, + }, + }, + { + Event: types.Event{ + Data: map[string]interface{}{ + "foo": true, + }, + }, + }, + }, + ExpectedKeep: true, + ExpectedRate: 2, + ExpectedName: "NotExists🦶🔫/no root span, no has-root-span guard (not recommended!)", + }, + } + + for _, d := range data { + t.Run(d.Rules.Rules[0].Name, func(t *testing.T) { + sampler := &RulesBasedSampler{ + Config: d.Rules, + Logger: &logger.NullLogger{}, + Metrics: &metrics.NullMetrics{}, + } + + sampler.Start() + + trace := &types.Trace{} + for _, span := range d.Spans { + trace.AddSpan(span) + // We declare which span is the root span in the test cases by setting the traceID. + if span.TraceID != "" { + require.Nil(t, trace.RootSpan, "Only set the trace ID on one span in a test case to designate which is the root. 
This test case appears to have multiple spans with trace IDs set.") + trace.RootSpan = span + } + } + + spans := trace.GetSpans() + assert.Len(t, spans, len(d.Spans), "should have the same number of spans as input") + + rate, _, reason, key := sampler.GetSampleRate(trace) + assert.Equal(t, "", key) + + assert.Equal(t, d.ExpectedRate, rate, d.Rules) + name := d.ExpectedName + if name == "" { + name = d.Rules.Rules[0].Name + } + assert.Contains(t, reason, name) + }) + } +} diff --git a/sample/sample.go b/sample/sample.go index b94de3b35d..4ddbad1d01 100644 --- a/sample/sample.go +++ b/sample/sample.go @@ -65,10 +65,7 @@ func (s *SamplerFactory) GetSamplerImplementationForKey(samplerKey string, isLeg } } - c, _, err := s.Config.GetSamplerConfigForDestName(samplerKey) - if err != nil { - return nil - } + c, _ := s.Config.GetSamplerConfigForDestName(samplerKey) var sampler Sampler @@ -92,7 +89,7 @@ func (s *SamplerFactory) GetSamplerImplementationForKey(samplerKey string, isLeg os.Exit(1) } - err = sampler.Start() + err := sampler.Start() if err != nil { s.Logger.Debug().WithField("dataset", samplerKey).Logf("failed to start sampler") return nil diff --git a/sample/sample_test.go b/sample/sample_test.go index acd2862314..4f58583a36 100644 --- a/sample/sample_test.go +++ b/sample/sample_test.go @@ -25,8 +25,7 @@ func getConfig(args []string) (config.Config, error) { // creates two temporary yaml files from the strings passed in and returns their filenames func createTempConfigs(t *testing.T, configBody, rulesBody string) (string, string) { - tmpDir, err := os.MkdirTemp("", "") - assert.NoError(t, err) + tmpDir := t.TempDir() configFile, err := os.CreateTemp(tmpDir, "cfg_*.yaml") assert.NoError(t, err) @@ -100,8 +99,6 @@ func TestDatasetPrefix(t *testing.T) { "Samplers/dataset.production/DeterministicSampler/SampleRate", 20, ) cfg, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(cfg) c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) @@ -145,8 +142,6 @@ func TestTotalThroughputClusterSize(t *testing.T) { "Samplers/production/TotalThroughputSampler/UseClusterSize", true, ) cfg, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(cfg) c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) @@ -176,8 +171,6 @@ func TestEMAThroughputClusterSize(t *testing.T) { "Samplers/production/EMAThroughputSampler/UseClusterSize", true, ) cfg, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(cfg) c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) @@ -207,8 +200,6 @@ func TestWindowedThroughputClusterSize(t *testing.T) { "Samplers/production/WindowedThroughputSampler/UseClusterSize", true, ) cfg, rules := createTempConfigs(t, cm, rm) - defer os.Remove(rules) - defer os.Remove(cfg) c, err := getConfig([]string{"--no-validate", "--config", cfg, "--rules_config", rules}) assert.NoError(t, err) diff --git a/sample/totalthroughput.go b/sample/totalthroughput.go index ef188996c2..3475d29b5b 100644 --- a/sample/totalthroughput.go +++ b/sample/totalthroughput.go @@ -106,6 +106,7 @@ func (d *TotalThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, k case "counter": delta := val - d.lastMetrics[name] d.Metrics.Count(name, delta) + d.lastMetrics[name] = val case "gauge": d.Metrics.Gauge(name, val) } diff --git a/sample/trace_key.go 
b/sample/trace_key.go
index da65fefa3f..7683397dd3 100644
--- a/sample/trace_key.go
+++ b/sample/trace_key.go
@@ -4,20 +4,34 @@ import (
 	"fmt"
 	"sort"
 	"strconv"
+	"strings"
 
 	"github.com/honeycombio/refinery/types"
 )
 
 type traceKey struct {
 	fields         []string
+	rootOnlyFields []string
 	useTraceLength bool
 }
 
 func newTraceKey(fields []string, useTraceLength bool) *traceKey {
 	// always put the field list in sorted order for easier comparison
 	sort.Strings(fields)
+	rootOnlyFields := make([]string, 0, len(fields)/2)
+	nonRootFields := make([]string, 0, len(fields)/2)
+	for _, field := range fields {
+		if strings.HasPrefix(field, RootPrefix) {
+			rootOnlyFields = append(rootOnlyFields, field[len(RootPrefix):])
+			continue
+		}
+
+		nonRootFields = append(nonRootFields, field)
+	}
+
 	return &traceKey{
-		fields:         fields,
+		fields:         nonRootFields,
+		rootOnlyFields: rootOnlyFields,
 		useTraceLength: useTraceLength,
 	}
 }
@@ -26,7 +40,7 @@ func newTraceKey(fields []string, useTraceLength bool) *traceKey {
 func (d *traceKey) build(trace *types.Trace) string {
 	// fieldCollector gets all values from the fields listed in the config, even
 	// if they happen multiple times.
-	fieldCollector := map[string][]string{}
+	fieldCollector := make(map[string][]string)
 
 	// for each field, for each span, get the value of that field
 	spans := trace.GetSpans()
@@ -54,6 +68,15 @@ func (d *traceKey) build(trace *types.Trace) string {
 		key += ","
 	}
 
+	if trace.RootSpan != nil {
+		for _, field := range d.rootOnlyFields {
+
+			if val, ok := trace.RootSpan.Data[field]; ok {
+				key += fmt.Sprintf("%v,", val)
+			}
+		}
+	}
+
 	if d.useTraceLength {
 		key += strconv.FormatInt(int64(len(spans)), 10)
 	}
diff --git a/sample/trace_key_test.go b/sample/trace_key_test.go
index e7a3bfec65..b9e29ba25c 100644
--- a/sample/trace_key_test.go
+++ b/sample/trace_key_test.go
@@ -117,8 +117,9 @@ func TestKeyGeneration(t *testing.T) {
 
 	assert.Equal(t, expected, generator.build(trace))
 
-	// now test that multiple values across spans in a different order are condensed the same
-	fields = []string{"http.status_code"}
+	// test a field list with the root prefix; only include the field from the root span
+	// if it exists
+	fields = []string{"http.status_code", "root.service_name", "root.another_field"}
 	useTraceLength = true
 
 	generator = newTraceKey(fields, useTraceLength)
@@ -133,31 +134,24 @@ func TestKeyGeneration(t *testing.T) {
 		},
 	})
 
-	trace.AddSpan(&types.Span{
-		Event: types.Event{
-			Data: map[string]interface{}{
-				"http.status_code": 404,
-			},
-		},
-	})
-
 	trace.AddSpan(&types.Span{
 		Event: types.Event{
 			Data: map[string]interface{}{
 				"http.status_code": 200,
+				"service_name":     "another",
 			},
 		},
 	})
 
-	trace.AddSpan(&types.Span{
+	trace.RootSpan = &types.Span{
 		Event: types.Event{
 			Data: map[string]interface{}{
-				"http.status_code": 200,
+				"service_name": "test",
 			},
 		},
-	})
+	}
 
-	expected = "200•404•,4"
+	expected = "200•404•,test,2"
 	assert.Equal(t, expected, generator.build(trace))
 }
diff --git a/sample/windowed_throughput.go b/sample/windowed_throughput.go
index a479a7c849..1c1e1cd542 100644
--- a/sample/windowed_throughput.go
+++ b/sample/windowed_throughput.go
@@ -102,6 +102,7 @@ func (d *WindowedThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint
 	case "counter":
 		delta := val - d.lastMetrics[name]
 		d.Metrics.Count(name, delta)
+		d.lastMetrics[name] = val
 	case "gauge":
 		d.Metrics.Gauge(name, val)
 	}
diff --git a/service/debug/debug_service.go b/service/debug/debug_service.go
index 8e6021bf0a..9d085169e1 100644
--- a/service/debug/debug_service.go
+++
b/service/debug/debug_service.go @@ -61,7 +61,7 @@ func (s *DebugService) Start() error { s.Publish("memstats", Func(memstats)) go func() { - configAddr, _ := s.Config.GetDebugServiceAddr() + configAddr := s.Config.GetDebugServiceAddr() if configAddr != "" { host, portStr, _ := net.SplitHostPort(configAddr) addr := net.JoinHostPort(host, portStr) diff --git a/sharder/deterministic.go b/sharder/deterministic.go index 6d933fa425..a1ae132748 100644 --- a/sharder/deterministic.go +++ b/sharder/deterministic.go @@ -1,9 +1,6 @@ package sharder import ( - "fmt" - "net" - "net/url" "sort" "sync" "time" @@ -13,7 +10,6 @@ import ( "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" "github.com/pkg/errors" - "github.com/sirupsen/logrus" ) // These are random bits to make sure we differentiate between different @@ -23,41 +19,54 @@ const ( peerSeed uint64 = 6789531204236 ) -// DetShard implements Shard -type DetShard struct { - scheme string - ipOrHost string - port string -} - type hashShard struct { uhash uint64 shardIndex int } -func (d *DetShard) Equals(other Shard) bool { - otherDetshard, ok := other.(*DetShard) +var _ Shard = detShard("") + +// detShard implements Shard +type detShard string + +// GetHashesFor generates a number of hashShards for a given detShard by repeatedly hashing the +// seed with itself. The intent is to generate a repeatable pseudo-random sequence. +func (d detShard) GetHashesFor(index int, n int, seed uint64) []hashShard { + hashes := make([]hashShard, 0) + addr := d.GetAddress() + for i := 0; i < n; i++ { + hashes = append(hashes, hashShard{ + uhash: wyhash.Hash([]byte(addr), seed), + shardIndex: index, + }) + // generate another seed from the previous seed; we want this to be the same + // sequence for everything. + seed = wyhash.Hash([]byte("anything"), seed) + } + return hashes +} +func (d detShard) Equals(other Shard) bool { + otherDetshard, ok := other.(detShard) if !ok { // can't be equal if it's a different kind of Shard! return false } // only basic types in this struct; we can use == hooray - return *d == *otherDetshard + return d == otherDetshard +} + +// GetAddress returns the Shard's address in a usable form +func (d detShard) GetAddress() string { + return string(d) } -type SortableShardList []*DetShard +type SortableShardList []detShard func (s SortableShardList) Len() int { return len(s) } func (s SortableShardList) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s SortableShardList) Less(i, j int) bool { - if s[i].ipOrHost != s[j].ipOrHost { - return s[i].ipOrHost < s[j].ipOrHost - } - if s[i].scheme != s[j].scheme { - return s[i].scheme < s[j].scheme - } - return s[i].port < s[j].port + return s[i] < s[j] } func (s SortableShardList) Equals(other SortableShardList) bool { @@ -72,39 +81,16 @@ func (s SortableShardList) Equals(other SortableShardList) bool { return true } -// GetAddress returns the Shard's address in a usable form -func (d *DetShard) GetAddress() string { - return fmt.Sprintf("%s://%s:%s", d.scheme, d.ipOrHost, d.port) -} - -func (d *DetShard) String() string { - return d.GetAddress() -} - -// GetHashesFor generates a number of hashShards for a given DetShard by repeatedly hashing the -// seed with itself. The intent is to generate a repeatable pseudo-random sequence. 
-func (d *DetShard) GetHashesFor(index int, n int, seed uint64) []hashShard { - hashes := make([]hashShard, 0) - addr := d.GetAddress() - for i := 0; i < n; i++ { - hashes = append(hashes, hashShard{ - uhash: wyhash.Hash([]byte(addr), seed), - shardIndex: index, - }) - // generate another seed from the previous seed; we want this to be the same - // sequence for everything. - seed = wyhash.Hash([]byte("anything"), seed) - } - return hashes -} +// make sure DeterministicSharder implements Sharder +var _ Sharder = (*DeterministicSharder)(nil) type DeterministicSharder struct { Config config.Config `inject:""` Logger logger.Logger `inject:""` Peers peer.Peers `inject:""` - myShard *DetShard - peers []*DetShard + myShard detShard + peers []detShard hashes []hashShard peerLock sync.RWMutex @@ -122,96 +108,33 @@ func (d *DeterministicSharder) Start() error { } }) + if err := d.loadPeerList(); err != nil { + d.Logger.Error().Logf("failed to reload peer list: %+v", err) + } + // Try up to 5 times to find myself in the peer list before giving up - var found bool - var selfIndexIntoPeerList int + var self string + var err error for j := 0; j < 5; j++ { - err := d.loadPeerList() - if err != nil { - return err - } - - // get my listen address for peer traffic for the Port number - listenAddr, err := d.Config.GetPeerListenAddr() - if err != nil { - return errors.Wrap(err, "failed to get listen addr config") - } - _, localPort, err := net.SplitHostPort(listenAddr) - if err != nil { - return errors.Wrap(err, "failed to parse listen addr into host:port") - } - d.Logger.Debug().Logf("picked up local peer port of %s", localPort) - - var localIPs []string - - // If RedisIdentifier is an IP, use as localIPs value. - if redisIdentifier, err := d.Config.GetRedisIdentifier(); err == nil && redisIdentifier != "" { - if ip := net.ParseIP(redisIdentifier); ip != nil { - d.Logger.Debug().Logf("Using RedisIdentifier as public IP: %s", redisIdentifier) - localIPs = []string{redisIdentifier} - } - } - - // Otherwise, get my local interfaces' IPs. - if len(localIPs) == 0 { - localAddrs, err := net.InterfaceAddrs() - if err != nil { - return errors.Wrap(err, "failed to get local interface list to initialize sharder") - } - localIPs = make([]string, len(localAddrs)) - for i, addr := range localAddrs { - addrStr := addr.String() - ip, _, err := net.ParseCIDR(addrStr) - if err != nil { - return errors.Wrap(err, fmt.Sprintf("failed to parse CIDR for local IP %s", addrStr)) - } - localIPs[i] = ip.String() - } - } - // go through peer list, resolve each address, see if any of them match any // local interface. Note that this assumes only one instance of Refinery per // host can run. 
- for i, peerShard := range d.peers { - d.Logger.Debug().WithFields(logrus.Fields{ - "peer": peerShard, - "self": localIPs, - }).Logf("Considering peer looking for self") - peerIPList, err := net.LookupHost(peerShard.ipOrHost) - if err != nil { - // TODO something better than fail to start if peer is missing - return errors.Wrap(err, fmt.Sprintf("couldn't resolve peer hostname %s", peerShard.ipOrHost)) - } - for _, peerIP := range peerIPList { - for _, ipAddr := range localIPs { - if peerIP == ipAddr { - if peerShard.port == localPort { - d.Logger.Debug().WithField("peer", peerShard).Logf("Found myself in peer list") - found = true - selfIndexIntoPeerList = i - } else { - d.Logger.Debug().WithFields(logrus.Fields{ - "peer": peerShard, - "expectedPort": localPort, - }).Logf("Peer port mismatch") - } - } + self, err = d.Peers.GetInstanceID() + if err == nil { + for _, peerShard := range d.peers { + if self == peerShard.GetAddress() { + d.myShard = peerShard + return nil } } } - if found { - break - } + d.Logger.Debug().Logf("Failed to find self in peer list; waiting 5sec and trying again") time.Sleep(5 * time.Second) } - if !found { - d.Logger.Debug().Logf("list of current peers: %+v", d.peers) - return errors.New("failed to find self in the peer list") - } - d.myShard = d.peers[selfIndexIntoPeerList] - return nil + d.Logger.Error().WithFields(map[string]interface{}{"peers": d.peers, "self": self}).Logf("list of current peers") + return errors.New("failed to find self in the peer list") } // loadPeerList will run every time any config changes (not only when the list @@ -231,17 +154,9 @@ func (d *DeterministicSharder) loadPeerList() error { // turn the peer list into a list of shards // and a list of hashes - newPeers := make([]*DetShard, len(peerList)) + newPeers := make([]detShard, len(peerList)) for ix, peer := range peerList { - peerURL, err := url.Parse(peer) - if err != nil { - return errors.Wrap(err, "couldn't parse peer as a URL") - } - peerShard := &DetShard{ - scheme: peerURL.Scheme, - ipOrHost: peerURL.Hostname(), - port: peerURL.Port(), - } + peerShard := detShard(peer) newPeers[ix] = peerShard } @@ -282,7 +197,7 @@ func (d *DeterministicSharder) loadPeerList() error { // if the peer list changed, load the new list d.peerLock.RLock() if !SortableShardList(d.peers).Equals(newPeers) { - d.Logger.Info().Logf("Peer list has changed. 
New peer list: %+v", newPeers) + d.Logger.Info().WithField("peers", newPeers).Logf("Peer list has changed.") d.peerLock.RUnlock() d.peerLock.Lock() d.peers = newPeers diff --git a/sharder/deterministic_test.go b/sharder/deterministic_test.go index f5fd90c2de..d0ec33abff 100644 --- a/sharder/deterministic_test.go +++ b/sharder/deterministic_test.go @@ -1,7 +1,6 @@ package sharder import ( - "context" "fmt" "math/rand" "testing" @@ -9,29 +8,33 @@ import ( "github.com/honeycombio/refinery/config" "github.com/honeycombio/refinery/internal/peer" "github.com/honeycombio/refinery/logger" + "github.com/honeycombio/refinery/metrics" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestWhichShard(t *testing.T) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, "http://2.2.2.2:8081", "http://3.3.3.3:8081", } config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(t, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(t, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -46,35 +49,38 @@ func TestWhichShard(t *testing.T) { "should select a peer for a trace") config.GetPeersVal = []string{} - config.ReloadConfig() + config.Reload() assert.Equal(t, shard.GetAddress(), sharder.WhichShard(traceID).GetAddress(), "should select the same peer if peer list becomes empty") } func TestWhichShardAtEdge(t *testing.T) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "RCIVNUNA" // carefully chosen (by trying over a billion times) to hash in WhichShard to 0xFFFFFFFF + selfPeerAddr = "127.0.0.1:8081" + traceID = "RCIVNUNA" // carefully chosen (by trying over a billion times) to hash in WhichShard to 0xFFFFFFFF ) // The algorithm in WhichShard works correctly for divisors of 2^32-1. The prime factorization of that includes // 1, 3, 5, 17, so we need something other than 3 to be sure that this test would fail. // It was tested (and failed) without the additional conditional. 
peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, "http://2.2.2.2:8081", "http://3.3.3.3:8081", "http://4.4.4.4:8081", } + config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(t, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(t, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -89,7 +95,7 @@ func TestWhichShardAtEdge(t *testing.T) { "should select a peer for a trace") config.GetPeersVal = []string{} - config.ReloadConfig() + config.Reload() assert.Equal(t, shard.GetAddress(), sharder.WhichShard(traceID).GetAddress(), "should select the same peer if peer list becomes empty") } @@ -107,26 +113,28 @@ func GenID(numChars int) string { func BenchmarkShardBulk(b *testing.B) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) const npeers = 11 peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, } for i := 1; i < npeers; i++ { peers = append(peers, fmt.Sprintf("http://2.2.2.%d/:8081", i)) } config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(b, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(b, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -149,8 +157,8 @@ func BenchmarkShardBulk(b *testing.B) { func TestShardBulk(t *testing.T) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) // this test should work for a wide range of peer counts @@ -159,21 +167,23 @@ func TestShardBulk(t *testing.T) { t.Run(fmt.Sprintf("bulk npeers=%d", npeers), func(t *testing.T) { for retry := 0; retry < 2; retry++ { peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, } for i := 1; i < npeers; i++ { peers = append(peers, fmt.Sprintf("http://2.2.2.%d/:8081", i)) } config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.NoError(t, err, "NewPeers should succeed") + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(t, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -223,8 +233,8 @@ func TestShardBulk(t *testing.T) { func TestShardDrop(t *testing.T) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) for i := 0; i < 5; i++ { @@ -232,21 +242,23 @@ func TestShardDrop(t *testing.T) { t.Run(fmt.Sprintf("drop npeers=%d", npeers), func(t *testing.T) { for retry := 0; retry < 2; retry++ { peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, } for i := 1; i < npeers; i++ { peers = append(peers, fmt.Sprintf("http://2.2.2.%d/:8081", i)) } config := &config.MockConfig{ - 
GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(t, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(t, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -307,8 +319,8 @@ func TestShardDrop(t *testing.T) { func TestShardAddHash(t *testing.T) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) for i := 0; i < 5; i++ { @@ -316,21 +328,23 @@ func TestShardAddHash(t *testing.T) { t.Run(fmt.Sprintf("add npeers=%d", npeers), func(t *testing.T) { for retry := 0; retry < 2; retry++ { peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, } for i := 1; i < npeers; i++ { peers = append(peers, fmt.Sprintf("http://2.2.2.%d/:8081", i)) } config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(t, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(t, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, @@ -392,27 +406,30 @@ func TestShardAddHash(t *testing.T) { func BenchmarkDeterministicShard(b *testing.B) { const ( - selfAddr = "127.0.0.1:8081" - traceID = "test" + selfPeerAddr = "127.0.0.1:8081" + traceID = "test" ) + for i := 0; i < 5; i++ { npeers := i*10 + 4 b.Run(fmt.Sprintf("benchmark_deterministic_%d", npeers), func(b *testing.B) { peers := []string{ - "http://" + selfAddr, + "http://" + selfPeerAddr, } for i := 1; i < npeers; i++ { peers = append(peers, fmt.Sprintf("http://2.2.2.%d/:8081", i)) } config := &config.MockConfig{ - GetPeerListenAddrVal: selfAddr, + GetPeerListenAddrVal: selfPeerAddr, GetPeersVal: peers, PeerManagementType: "file", } done := make(chan struct{}) defer close(done) - filePeers, err := peer.NewPeers(context.Background(), config, done) - assert.Equal(b, nil, err) + + filePeers := &peer.FilePeers{Cfg: config, Metrics: &metrics.NullMetrics{}} + require.NoError(b, filePeers.Start()) + sharder := DeterministicSharder{ Config: config, Logger: &logger.NullLogger{}, diff --git a/sharder/mock.go b/sharder/mock.go new file mode 100644 index 0000000000..c3c90b6487 --- /dev/null +++ b/sharder/mock.go @@ -0,0 +1,23 @@ +package sharder + +type MockSharder struct { + Self *TestShard + Other *TestShard +} + +func (s *MockSharder) MyShard() Shard { return s.Self } + +func (s *MockSharder) WhichShard(traceID string) Shard { + if s.Other != nil { + return s.Other + } + + return s.Self +} + +type TestShard struct { + Addr string +} + +func (s *TestShard) Equals(other Shard) bool { return s.Addr == other.GetAddress() } +func (s *TestShard) GetAddress() string { return s.Addr } diff --git a/sharder/single.go b/sharder/single.go index e2003a29df..edcfcf5dcc 100644 --- a/sharder/single.go +++ b/sharder/single.go @@ -12,7 +12,7 @@ var selfShard SingleShard = "self" func (s *SingleShard) Equals(other Shard) bool { return true } // GetAddress will never be used because every shard is my shard -func (s *SingleShard) GetAddress() string { return "" } +func (s *SingleShard) GetAddress() string 
{ return "http://self" } type SingleServerSharder struct { Logger logger.Logger `inject:""` diff --git a/smoke-test/.gitignore b/smoke-test/.gitignore new file mode 100644 index 0000000000..03bd4129be --- /dev/null +++ b/smoke-test/.gitignore @@ -0,0 +1 @@ +*.env diff --git a/smoke-test/README.md b/smoke-test/README.md new file mode 100644 index 0000000000..661acff3d2 --- /dev/null +++ b/smoke-test/README.md @@ -0,0 +1,78 @@ +# Smoke Testing + +⚠️ All configuration in this directory is for development and testing purposes. +This is not an example of a production-ready Refinery deployment. + +## How Do I Even? + +From the root of the project repo: + +```shell +> make local_image +``` + +Then change to this directory and run docker compose: + +```shell +> cd smoke-test +> docker compose up +``` + +Observe the log output of the services. +Refinery ought to have connected to Redis to report and then find itself in the peer list. + +Congratulations! You have applied power and [the magic smoke was not released](https://en.wikipedia.org/wiki/Smoke_testing_(software)#Etymology)! + +## Shooting Trouble + +### Refinery warning: failed to upload metrics + +#### Problem + +The logs for the Refinery node contains: + +```plain +failed to upload metrics: failed to send metrics to : 401 Unauthorized +``` + +This message on its own is not a Refinery *failure*. +The service is likely operating, but unable to send the telemetry concerning its internal operations on to the configured endpoint. + +#### Solution + +Double-check the `LegacyMetrics` and `OTelMetrics` sections of `config.yaml` are set to send telemetry to the destination you expect. +Confirm that the API key provided there or in environment variables is correct for the intended destination. + +### Docker Error: No such image + +#### Problem + +The command `docker compose up` returns the following error: + +```plain +Error response from daemon: No such image: ko.local/refinery:latest +``` + +#### Solution + +The local image needs to be built. Run `make local_target` at the root of the repo. + +### Redis Error: SSL routines::wrong version number + +#### Problem + +The services for Redis and Refinery start, but the Redis log contains numerous entries like: + +```plain +redis-1 | 1:M 19 Aug 2024 17:23:52.114 # Error accepting a client connection: error:0A00010B:SSL routines::wrong version number (addr=172.25.0.3:37484 laddr=172.25.0.2:6379) +``` + +This is a sign that Refinery is not using TLS to connect to Redis which *is* using TLS. + +#### Solution + +Check the config.yaml used by the Refinery container. + +* Is `UseTLS` set to true? +* Is `UseTLSInsecure` set to true? (because we're self-signed locally) +* Do we have a bug with TLS connections? 
diff --git a/smoke-test/config.yaml b/smoke-test/config.yaml new file mode 100644 index 0000000000..031f9227be --- /dev/null +++ b/smoke-test/config.yaml @@ -0,0 +1,27 @@ +General: + ConfigurationVersion: 2 + MinRefineryVersion: v2.0 + +Logger: + Type: stdout + Level: info + +# LegacyMetrics: +# Enabled: true +# Dataset: refinery_metrics + +# OTelMetrics: +# Enabled: true +# Dataset: refinery_metrics_otel + +PeerManagement: + Type: redis + +RedisPeerManagement: + UseTLS: true + UseTLSInsecure: true + +RefineryTelemetry: + AddRuleReasonToTrace: true + AddSpanCountToRoot: true + AddHostMetadataToTrace: true diff --git a/smoke-test/docker-compose.yaml b/smoke-test/docker-compose.yaml new file mode 100644 index 0000000000..c0ad16fada --- /dev/null +++ b/smoke-test/docker-compose.yaml @@ -0,0 +1,51 @@ +services: + refinery: + image: ko.local/refinery:latest # build this with 'make local_image' at the root of the repo + pull_policy: never # 'Error response from daemon: No such image' means you need to build it. 👆 + environment: # these take precedence over the settings in env_file + REFINERY_REDIS_HOST: redis:6379 + env_file: + - refinery.env # put secrets & other custom env vars in here, git ignores it + volumes: + - ./config.yaml:/etc/refinery/refinery.yaml + - ./rules.yaml:/etc/refinery/rules.yaml + ports: + - 127.0.0.1:8080:8080 + - 127.0.0.1:9090:9090 + depends_on: + redis: + condition: service_healthy + + redis: + image: redis:7 + command: [ "redis-server", + "--port", "0", + "--tls-port", "6379", + "--tls-cert-file", "/data/certs/cert.pem", + "--tls-key-file", "/data/certs/key.pem", + "--tls-ca-cert-file", "/data/certs/ca.pem", + "--tls-auth-clients", "no" + ] + healthcheck: + test: ["CMD-SHELL", "redis-cli --tls --insecure ping | grep PONG"] + interval: 2s + timeout: 3s + retries: 5 + ports: + - 127.0.0.1:6379:6379 + volumes: + - redis-data:/data + - certs:/data/certs + depends_on: + gen-certs: + condition: service_completed_successfully + + gen-certs: + image: paulczar/omgwtfssl + command: ["sh", "-c", "[ -f /certs/cert.pem ] && echo 'Cert exists!' 
|| /usr/local/bin/generate-certs"]
+    volumes:
+      - certs:/certs
+
+volumes:
+  certs:
+  redis-data:
diff --git a/smoke-test/rules.yaml b/smoke-test/rules.yaml
new file mode 100644
index 0000000000..67c4f13c21
--- /dev/null
+++ b/smoke-test/rules.yaml
@@ -0,0 +1,11 @@
+RulesVersion: 2
+Samplers:
+  __default__:
+    DeterministicSampler:
+      SampleRate: 1
+  TheNewWorld:
+    TotalThroughputSampler:
+      GoalThroughputPerSec: 50
+      ClearFrequency: 5s
+      FieldList:
+        - title
diff --git a/test/EMAThroughput_rules.yaml b/test/EMAThroughput_rules.yaml
new file mode 100644
index 0000000000..596f13f68b
--- /dev/null
+++ b/test/EMAThroughput_rules.yaml
@@ -0,0 +1,12 @@
+RulesVersion: 2
+Samplers:
+  __default__:
+    DeterministicSampler:
+      SampleRate: 1
+
+  TheNewWorld:
+    EMAThroughputSampler:
+      GoalThroughputPerSec: 50
+      AdjustmentInterval: 5s
+      FieldList:
+        - title
diff --git a/test/TotalThroughput_rules.yaml b/test/TotalThroughput_rules.yaml
new file mode 100644
index 0000000000..596f13f68b
--- /dev/null
+++ b/test/TotalThroughput_rules.yaml
@@ -0,0 +1,12 @@
+RulesVersion: 2
+Samplers:
+  __default__:
+    DeterministicSampler:
+      SampleRate: 1
+
+  TheNewWorld:
+    TotalThroughputSampler:
+      GoalThroughputPerSec: 50
+      ClearFrequency: 5s
+      FieldList:
+        - title
diff --git a/test/WindowedThroughput_rules.yaml b/test/WindowedThroughput_rules.yaml
new file mode 100644
index 0000000000..596f13f68b
--- /dev/null
+++ b/test/WindowedThroughput_rules.yaml
@@ -0,0 +1,11 @@
+RulesVersion: 2
+Samplers:
+  __default__:
+    DeterministicSampler:
+      SampleRate: 1
+
+  TheNewWorld:
+    WindowedThroughputSampler:
+      GoalThroughputPerSec: 50
+      FieldList:
+        - title
diff --git a/test/config.yaml b/test/config.yaml
new file mode 100644
index 0000000000..55aa5cb798
--- /dev/null
+++ b/test/config.yaml
@@ -0,0 +1,26 @@
+General:
+  ConfigurationVersion: 2
+  MinRefineryVersion: v2.0
+  ConfigReloadInterval: 50s
+
+Network:
+  HoneycombAPI: https://api-dogfood.honeycomb.io
+
+Logger:
+  Type: stdout
+  Level: info
+
+LegacyMetrics:
+  Enabled: true
+  Dataset: refinery_metrics
+  APIHost: https://api-dogfood.honeycomb.io
+
+OTelMetrics:
+  Enabled: true
+  Dataset: refinery_metrics_otel
+  APIHost: https://api-dogfood.honeycomb.io
+
+RefineryTelemetry:
+  AddRuleReasonToTrace: true
+  AddSpanCountToRoot: true
+  AddHostMetadataToTrace: true
diff --git a/test/rules.yaml b/test/rules.yaml
new file mode 100644
index 0000000000..1bff2bd112
--- /dev/null
+++ b/test/rules.yaml
@@ -0,0 +1,12 @@
+RulesVersion: 2
+Samplers:
+  __default__:
+    DeterministicSampler:
+      SampleRate: 1
+
+  TheNewWorld:
+    TotalThroughputSampler:
+      GoalThroughputPerSec: 50
+      ClearFrequency: 5s
+      FieldList:
+        - title
diff --git a/tools/convert/Makefile b/tools/convert/Makefile
index 78cb3b3916..3d4e1a422b 100644
--- a/tools/convert/Makefile
+++ b/tools/convert/Makefile
@@ -78,6 +78,10 @@ websiterules:
 	go run . website rules --output=../../refinery_rules.md
 
 .PHONY: validate
+#: validate the sample config and rules
+validate: validateSampleConfig validateConfig validateRules
+
+.PHONY: validateSampleConfig
 #: validate the sample config
 validateSampleConfig:
 	@echo
@@ -85,12 +89,14 @@ validateSampleConfig:
 	@echo
 	go run . validate config --input=minimal_config.yaml
 
+.PHONY: validateConfig
 validateConfig:
 	@echo
 	@echo "+++ validating sample config"
 	@echo
 	go run .
validate config --input=../../config_complete.yaml +.PHONY: validateRules validateRules: @echo @echo "+++ validating sample rules" diff --git a/tools/convert/configDataNames.txt b/tools/convert/configDataNames.txt new file mode 100644 index 0000000000..d6215b92b8 --- /dev/null +++ b/tools/convert/configDataNames.txt @@ -0,0 +1,258 @@ +# Names of groups and fields in the new config file format. +# Automatically generated on 2024-09-03 at 19:48:53 UTC. + +General: + - ConfigurationVersion + + - MinRefineryVersion + + - DatasetPrefix + + - ConfigReloadInterval + + +Network: + - ListenAddr + + - PeerListenAddr + + - HTTPIdleTimeout + + - HoneycombAPI + + +AccessKeys: + - ReceiveKeys (originally APIKeys) + + - AcceptOnlyListedKeys + + - SendKey + + - SendKeyMode + + +RefineryTelemetry: + - AddRuleReasonToTrace + + - AddSpanCountToRoot + + - AddCountsToRoot + + - AddHostMetadataToTrace + + +Traces: + - SendDelay + + - BatchTimeout + + - TraceTimeout + + - SpanLimit + + - MaxBatchSize + + - SendTicker + + +Debugging: + - DebugServiceAddr + + - QueryAuthToken + + - AdditionalErrorFields + + - DryRun + + +Logger: + - Type + + - Level + + +HoneycombLogger: + - APIHost (originally HoneycombLogger.LoggerHoneycombAPI) + + - APIKey (originally HoneycombLogger.LoggerAPIKey) + + - Dataset (originally HoneycombLogger.LoggerDataset) + + - SamplerEnabled (originally HoneycombLogger.LoggerSamplerEnabled) + + - SamplerThroughput + + +StdoutLogger: + - Structured + + - SamplerEnabled + + - SamplerThroughput + + +PrometheusMetrics: + - Enabled + + - ListenAddr (originally PrometheusMetrics.MetricsListenAddr) + + +LegacyMetrics: + - Enabled + + - APIHost (originally HoneycombMetrics.MetricsHoneycombAPI) + + - APIKey (originally HoneycombMetrics.MetricsAPIKey) + + - Dataset (originally HoneycombMetrics.MetricsDataset) + + - ReportingInterval (originally HoneycombMetrics.MetricsReportingInterval) + + +OTelMetrics: + - Enabled + + - APIHost + + - APIKey + + - Dataset + + - ReportingInterval + + - Compression + + +OTelTracing: + - Enabled + + - APIHost + + - APIKey + + - Dataset + + - SampleRate + + +PeerManagement: + - Type (originally PeerManagement.Type) + + - Identifier (originally PeerManagement.RedisIdentifier) + + - IdentifierInterfaceName (originally PeerManagement.IdentifierInterfaceName) + + - UseIPV6Identifier (originally PeerManagement.UseIPV6Identifier) + + - Peers (originally PeerManagement.Peers) + + +RedisPeerManagement: + - Host (originally PeerManagement.RedisHost) + + - ClusterHosts + + - Username (originally PeerManagement.RedisUsername) + + - Password (originally PeerManagement.Password) + + - AuthCode (originally PeerManagement.AuthCode) + + - Prefix (originally PeerManagement.Prefix) + + - Database (originally PeerManagement.Database) + + - UseTLS (originally PeerManagement.UseTLS) + + - UseTLSInsecure (originally PeerManagement.UseTLSInsecure) + + - Timeout (originally PeerManagement.Timeout) + + - Strategy (originally PeerManagement.Strategy) (**removed in v2**) + + +Collection: + - CacheCapacity (originally InMemCollector.CacheCapacity) + + - PeerQueueSize + + - IncomingQueueSize + + - AvailableMemory + + - MaxMemoryPercentage + + - MaxAlloc (originally InMemCollector.MaxAlloc) + + - DisableRedistribution + + - ShutdownDelay + + +BufferSizes: + - UpstreamBufferSize + + - PeerBufferSize + + +Specialized: + - EnvironmentCacheTTL + + - CompressPeerCommunication + + - Collector (originally Collector) (**removed in v2**) + + - CacheOverrunStrategy (originally InMemCollector.CacheOverrunStrategy) 
(**removed in v2**) + + - AdditionalAttributes + + +IDFields: + - TraceNames + + - ParentNames + + +GRPCServerParameters: + - Enabled + + - ListenAddr (originally GRPCListenAddr) + + - MaxConnectionIdle (originally GRPCServerParameters.MaxConnectionIdle) + + - MaxConnectionAge (originally GRPCServerParameters.MaxConnectionAge) + + - MaxConnectionAgeGrace (originally GRPCServerParameters.MaxConnectionAgeGrace) + + - KeepAlive (originally GRPCServerParameters.Time) + + - KeepAliveTimeout (originally GRPCServerParameters.Timeout) + + - MaxSendMsgSize + + - MaxRecvMsgSize + + +SampleCache: + - Type (originally SampleCacheConfig/SampleCache.Type) (**removed in v2**) + + - KeptSize (originally SampleCacheConfig/SampleCache.KeptSize) + + - DroppedSize (originally SampleCacheConfig/SampleCache.DroppedSize) + + - SizeCheckInterval (originally SampleCacheConfig/SampleCache.SizeCheckInterval) + + +StressRelief: + - Mode (originally StressRelief.Mode) + + - ActivationLevel (originally StressRelief.ActivationLevel) + + - DeactivationLevel (originally StressRelief.DeactivationLevel) + + - SamplingRate (originally StressRelief.StressSamplingRate) + + - MinimumActivationDuration (originally StressRelief.MinimumActivationDuration) + + - MinimumStartupDuration (originally StressRelief.MinimumStartupDuration) + diff --git a/tools/convert/minimal_config.yaml b/tools/convert/minimal_config.yaml new file mode 100644 index 0000000000..bcdaa29b9e --- /dev/null +++ b/tools/convert/minimal_config.yaml @@ -0,0 +1,148 @@ +# sample uncommented config file containing all possible fields +# automatically generated on 2024-09-03 at 19:48:53 UTC +General: + ConfigurationVersion: 2 + MinRefineryVersion: "v2.0" + DatasetPrefix: "" + ConfigReloadInterval: 15s +Network: + ListenAddr: "0.0.0.0:8080" + PeerListenAddr: "0.0.0.0:8081" + HTTPIdleTimeout: 0s + HoneycombAPI: "https://api.honeycomb.io" +AccessKeys: + ReceiveKeys: + - "your-key-goes-here" + + AcceptOnlyListedKeys: false + SendKey: SetThisToAHoneycombKey + SendKeyMode: none +RefineryTelemetry: + AddRuleReasonToTrace: true + AddSpanCountToRoot: true + AddCountsToRoot: false + AddHostMetadataToTrace: true +Traces: + SendDelay: 2s + BatchTimeout: 500ms + TraceTimeout: 60s + SpanLimit: 0 + MaxBatchSize: 500 + SendTicker: 100ms +Debugging: + DebugServiceAddr: "localhost:6060" + QueryAuthToken: "some-private-value" + AdditionalErrorFields: + - "trace.span_id" + + DryRun: true +Logger: + Type: stdout + Level: warn +HoneycombLogger: + APIHost: "https://api.honeycomb.io" + APIKey: SetThisToAHoneycombKey + Dataset: "Refinery Logs" + SamplerEnabled: true + SamplerThroughput: 10 +StdoutLogger: + Structured: false + SamplerEnabled: false + SamplerThroughput: 10 +PrometheusMetrics: + Enabled: false + ListenAddr: "localhost:2112" +LegacyMetrics: + Enabled: false + APIHost: "https://api.honeycomb.io" + APIKey: SetThisToAHoneycombKey + Dataset: "Refinery Metrics" + ReportingInterval: 30s +OTelMetrics: + Enabled: false + APIHost: "https://api.honeycomb.io" + APIKey: SetThisToAHoneycombKey + Dataset: "Refinery Metrics" + ReportingInterval: 30s + Compression: gzip +OTelTracing: + Enabled: false + APIHost: "https://api.honeycomb.io" + APIKey: SetThisToAHoneycombKey + Dataset: "Refinery Traces" + SampleRate: 100 +PeerManagement: + Type: file + Identifier: "192.168.1.1" + IdentifierInterfaceName: eth0 + UseIPV6Identifier: false + Peers: + - "http://192.168.1.11:8081" + - "http://192.168.1.12:8081" + +RedisPeerManagement: + Host: "localhost:6379" + ClusterHosts: + - "- localhost:6379" + + 
Username: "" + Password: "" + AuthCode: "" + Prefix: customPrefix + Database: 1 + UseTLS: false + UseTLSInsecure: false + Timeout: 5s + Strategy: hash +Collection: + CacheCapacity: 10_000 + PeerQueueSize: 30_000 + IncomingQueueSize: 30_000 + AvailableMemory: "4.5Gb" + MaxMemoryPercentage: 75 + MaxAlloc: 0 + DisableRedistribution: false + ShutdownDelay: 15s +BufferSizes: + UpstreamBufferSize: 10_000 + PeerBufferSize: 100_000 +Specialized: + EnvironmentCacheTTL: 1h + CompressPeerCommunication: true + Collector: InMemCollector + CacheOverrunStrategy: resize + AdditionalAttributes: + ClusterName: MyCluster + environment: production + +IDFields: + TraceNames: + - "trace.trace_id" + - traceId + + ParentNames: + - "trace.parent_id" + - parentId + +GRPCServerParameters: + Enabled: true + ListenAddr: "" + MaxConnectionIdle: 1m + MaxConnectionAge: 3m + MaxConnectionAgeGrace: 1m + KeepAlive: 1m + KeepAliveTimeout: 20s + MaxSendMsgSize: 15MB + MaxRecvMsgSize: 15MB +SampleCache: + Type: cuckoo + KeptSize: 10_000 + DroppedSize: 1_000_000 + SizeCheckInterval: 10s +StressRelief: + Mode: never + ActivationLevel: 90 + DeactivationLevel: 75 + SamplingRate: 100 + MinimumActivationDuration: 10s + MinimumStartupDuration: 3s \ No newline at end of file diff --git a/tools/convert/templates/cfg_docrepo.tmpl b/tools/convert/templates/cfg_docrepo.tmpl index 2ec8ab96c9..b902e74114 100644 --- a/tools/convert/templates/cfg_docrepo.tmpl +++ b/tools/convert/templates/cfg_docrepo.tmpl @@ -62,7 +62,12 @@ The remainder of this document describes the sections within the file and the fi {{ $field.Description | wrapForDocs -}} {{- println -}} - {{ if $field.Reload }}E{{else}}Not e{{end}}ligible for live reload. +{{- println -}} +{{- if eq $field.Type "defaulttrue" -}} +- Type: `bool` +{{- else -}} - Type: `{{ $field.Type }}` +{{- end -}} {{- println -}} {{- if $field.Default -}} - Default: `{{ $field.Default }}` diff --git a/tools/convert/templates/configV2.tmpl b/tools/convert/templates/configV2.tmpl index 502fee1b20..ee2889d976 100644 --- a/tools/convert/templates/configV2.tmpl +++ b/tools/convert/templates/configV2.tmpl @@ -2,7 +2,7 @@ ## Honeycomb Refinery Configuration ## ###################################### # -# created {{ now }} from {{ .Input }} using a template generated on 2023-12-04 at 22:34:11 UTC +# created {{ now }} from {{ .Input }} using a template generated on 2024-09-05 at 17:40:29 UTC # This file contains a configuration for the Honeycomb Refinery. It is in YAML # format, organized into named groups, each of which contains a set of @@ -63,8 +63,11 @@ General: ## up to 10% to avoid all instances refreshing together. In installations ## where configuration changes are handled by restarting Refinery, which ## is often the case when using Kubernetes, disable this feature with a - ## value of `0s`. If the config file is being loaded from a URL, it may - ## be wise to increase this value to avoid overloading the file server. + ## value of `0s`. As of Refinery v2.7, news of a configuration change is + ## immediately propagated to all peers, and they will attempt to reload + ## their configurations. Note that external factors (for example, + ## Kubernetes ConfigMaps) may cause delays in propagating configuration + ## changes. ## ## Accepts a duration string with units, like "15s". ## default: 15s @@ -151,10 +154,43 @@ AccessKeys: ## accepted. Events arriving with API keys not in the `ReceiveKeys` list ## will be rejected with an HTTP `401` error. 
## If `false`, then all traffic is accepted and `ReceiveKeys` is ignored. + ## This setting is applied **before** the `SendKey` and `SendKeyMode` + ## settings. ## ## Eligible for live reload. {{ conditional .Data "AcceptOnlyListedKeys" "nostar APIKeys" }} + ## SendKey is an optional Honeycomb API key that Refinery can use to send + ## data to Honeycomb, depending on configuration. + ## + ## If `SendKey` is set to a valid Honeycomb key, then Refinery can use + ## the listed key to send data. The exact behavior depends on the value + ## of `SendKeyMode`. + ## + ## Eligible for live reload. + {{ nonDefaultOnly .Data "SendKey" "SendKey" "" }} + + ## SendKeyMode controls how SendKey is used to replace or augment API + ## keys used in incoming telemetry. + ## + ## Controls how SendKey is used to replace or supply API keys used in + ## incoming telemetry. If `AcceptOnlyListedKeys` is `true`, then + ## `SendKeys` will only be used for events with keys listed in + ## `ReceiveKeys`. + ## `none` uses the incoming key for all telemetry (default). `all` + ## overwrites all keys, even missing ones, with `SendKey`. `nonblank` + ## overwrites all supplied keys but will not inject `SendKey` if the + ## incoming key is blank. `listedonly` overwrites only the keys listed in + ## `ReceiveKeys`. `unlisted` uses the `SendKey` for all events *except* + ## those with keys listed in `ReceiveKeys`, which use their original + ## keys. `missingonly` uses the SendKey only to inject keys into events + ## with blank keys. All other events use their original keys. + ## + ## default: none + ## Eligible for live reload. + ## Options: none all nonblank listedonly unlisted missingonly + {{ choice .Data "SendKeyMode" "SendKeyMode" (makeSlice "none" "all" "nonblank" "listedonly" "unlisted" "missingonly") "none" }} + ######################## ## Refinery Telemetry ## ######################## @@ -172,11 +208,11 @@ RefineryTelemetry: ## This setting also includes the field `meta.refinery.send_reason`, ## which contains the reason that the trace was sent. Possible values of ## this field are `trace_send_got_root`, which means that the root span - ## arrived; `trace_send_expired`, which means that TraceTimeout was + ## arrived; `trace_send_expired`, which means that `TraceTimeout` was ## reached; `trace_send_ejected_full`, which means that the trace cache - ## was full; and `trace_send_ejected_memsize`, which means that refinery + ## was full; and `trace_send_ejected_memsize`, which means that Refinery ## was out of memory. - ## These names are also the names of metrics that refinery tracks. + ## These names are also the names of metrics that Refinery tracks. ## We recommend enabling this setting whenever a rules-based sampler is ## in use, as it is useful for debugging and understanding the behavior ## of your Refinery installation. @@ -230,13 +266,15 @@ RefineryTelemetry: Traces: ## Traces contains configuration for how traces are managed. #### - ## SendDelay is the duration to wait before sending a trace. + ## SendDelay is the duration to wait after the root span arrives before + ## sending a trace. ## - ## This setting is a short timer that is triggered when a trace is - ## complete. Refinery waits for this duration before sending the trace. - ## The reason for this setting is to allow for small network delays or - ## clock jitters to elapse and any final spans to arrive before sending - ## the trace. Set to "0" for immediate sending. 
+ ## This setting is a short timer that is triggered when a trace is marked + ## complete by the arrival of the root span. Refinery waits for this + ## duration before sending the trace. This setting exists to allow for + ## asynchronous spans and small network delays to elapse before sending + ## the trace. `SendDelay` is not applied if the `TraceTimeout` expires or + ## the `SpanLimit` is reached. ## ## Accepts a duration string with units, like "2s". ## default: 2s @@ -259,8 +297,13 @@ Traces: ## before making the trace decision about an incomplete trace. ## Normally trace decisions (send or drop) are made when the root span ## arrives. Sometimes the root span never arrives (for example, due to - ## crashes) and this timer ensures sending a trace even without having - ## received the root span. + ## crashes). Once this timer fires, Refinery will make a trace decision + ## based on the spans that have arrived so far. This ensures sending a + ## trace even when the root span never arrives. + ## After the trace decision has been made, Refinery retains a record of + ## that decision for a period of time. When additional spans (including + ## the root span) arrive, they will be kept or dropped based on the + ## original decision. ## If particularly long-lived traces are present in your data, then you ## should increase this timer. Note that this increase will also increase ## the memory requirements for Refinery. @@ -270,6 +313,19 @@ Traces: ## Eligible for live reload. {{ nonDefaultOnly .Data "TraceTimeout" "TraceTimeout" "60s" }} + ## SpanLimit is the number of spans after which a trace becomes eligible + ## for a trace decision. + ## + ## This setting helps to keep memory usage under control. If a trace has + ## more than this set number of spans, then it becomes eligible for a + ## trace decision. + ## It's most helpful in a situation where a sudden burst of many spans in + ## a large trace hits Refinery all at once, causing memory usage to spike + ## and possibly crashing Refinery. + ## + ## Eligible for live reload. + {{ nonDefaultOnly .Data "SpanLimit" "SpanLimit" 0 }} + ## MaxBatchSize is the maximum number of events to be included in each ## batch for sending. ## @@ -315,7 +371,9 @@ Debugging: {{ nonEmptyString .Data "DebugServiceAddr" "DebugServiceAddr" "localhost:6060" }} ## QueryAuthToken is the token that must be specified to access the - ## `/query` endpoint. + ## `/query` endpoint. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. ## ## This token must be specified with the header ## "X-Honeycomb-Refinery-Query" in order for a `/query` request to @@ -401,6 +459,9 @@ HoneycombLogger: {{ nonDefaultOnly .Data "APIHost" "HoneycombLogger.LoggerHoneycombAPI" "https://api.honeycomb.io" }} ## APIKey is the API key used to send Refinery's logs to Honeycomb. + ## Setting this value via a command line flag may expose credentials - it + ## is recommended to use the environment variable or a configuration + ## file. ## ## It is recommended that you create a separate team and key for Refinery ## logs. @@ -531,7 +592,9 @@ LegacyMetrics: {{ nonDefaultOnly .Data "APIHost" "HoneycombMetrics.MetricsHoneycombAPI" "https://api.honeycomb.io" }} ## APIKey is the API key used by Refinery to send its metrics to - ## Honeycomb. + ## Honeycomb. Setting this value via a command line flag may expose + ## credentials - it is recommended to use the environment variable or a + ## configuration file. 
## ## It is recommended that you create a separate team and key for Refinery ## metrics. @@ -585,7 +648,9 @@ OTelMetrics: {{ nonDefaultOnly .Data "APIHost" "APIHost" "https://api.honeycomb.io" }} ## APIKey is the API key used to send Honeycomb metrics via - ## OpenTelemetry. + ## OpenTelemetry. Setting this value via a command line flag may expose + ## credentials - it is recommended to use the environment variable or a + ## configuration file. ## ## It is recommended that you create a separate team and key for Refinery ## metrics. @@ -627,6 +692,62 @@ OTelMetrics: ## Options: none gzip {{ choice .Data "Compression" "Compression" (makeSlice "none" "gzip") "gzip" }} +########################### +## OpenTelemetry Tracing ## +########################### +OTelTracing: + ## OTelTracing contains configuration for Refinery's own tracing. + #### + ## Enabled controls whether to send Refinery's own OpenTelemetry traces. + ## + ## The setting specifies if Refinery sends traces. + ## + ## Not eligible for live reload. + {{ nonDefaultOnly .Data "Enabled" "Enabled" false }} + + ## APIHost is the URL of the OpenTelemetry API to which traces will be + ## sent. + ## + ## Refinery's internal traces will be sent to the `/v1/traces` endpoint + ## on this host. + ## + ## default: https://api.honeycomb.io + ## Not eligible for live reload. + {{ nonDefaultOnly .Data "APIHost" "APIHost" "https://api.honeycomb.io" }} + + ## APIKey is the API key used to send Refinery's traces to Honeycomb. + ## Setting this value via a command line flag may expose credentials - it + ## is recommended to use the environment variable or a configuration + ## file. + ## + ## It is recommended that you create a separate team and key for Refinery + ## telemetry. + ## If this value is blank, then Refinery will not set the + ## Honeycomb-specific headers for OpenTelemetry, and your `APIHost` must + ## be set to a valid OpenTelemetry endpoint. + ## + ## Not eligible for live reload. + {{ nonDefaultOnly .Data "APIKey" "APIKey" "" }} + + ## Dataset is the Honeycomb dataset to which Refinery sends its + ## OpenTelemetry metrics. + ## + ## Only used if `APIKey` is specified. + ## + ## default: Refinery Traces + ## Not eligible for live reload. + {{ nonDefaultOnly .Data "Dataset" "Dataset" "Refinery Traces" }} + + ## SampleRate is the rate at which Refinery samples its own traces. + ## + ## This is the Honeycomb sample rate used to sample traces sent by + ## Refinery. Since each incoming span generates multiple outgoing spans, + ## a minimum sample rate of `100` is strongly advised. + ## + ## default: 100 + ## Eligible for live reload. + {{ nonDefaultOnly .Data "SampleRate" "SampleRate" 100 }} + ##################### ## Peer Management ## ##################### @@ -638,9 +759,16 @@ PeerManagement: ## ## Peer management is the mechanism by which Refinery locates its peers. ## `file` means that Refinery gets its peer list from the Peers list in - ## this config file. - ## `redis` means that Refinery self-registers with a Redis instance and - ## gets its peer list from there. + ## this config file. It also prevents Refinery from using a + ## publish/subscribe mechanism to propagate peer lists, stress levels, + ## and configuration changes. + ## `redis` means that Refinery uses a Publish/Subscribe mechanism, + ## implemented on Redis, to propagate peer lists, stress levels, and + ## notification of configuration changes much more quickly than the + ## legacy mechanism. + ## The recommended setting is `redis`, especially for new installations. 
+ ## If `redis` is specified, fields in `RedisPeerManagement` must also be + ## set. ## ## default: file ## Not eligible for live reload. @@ -688,18 +816,18 @@ PeerManagement: ## Peers is the list of peers to use when Type is "file", excluding self. ## ## This list is ignored when Type is "redis". The format is a list of - ## strings of the form "host:port". + ## strings of the form "scheme://host:port". ## ## Not eligible for live reload. - {{ renderStringarray .Data "Peers" "PeerManagement.Peers" "192.168.1.11:8081,192.168.1.12:8081" }} + {{ renderStringarray .Data "Peers" "PeerManagement.Peers" "http://192.168.1.11:8081,http://192.168.1.12:8081" }} ########################### ## Redis Peer Management ## ########################### RedisPeerManagement: ## RedisPeerManagement controls how the Refinery cluster communicates - ## between peers when using Redis. Only applies when - ## `PeerManagement.Type` is "redis". + ## between peers when using Redis. Does not apply when + ## `PeerManagement.Type` is "file". ## #### ## Host is the host and port of the Redis instance to use for peer @@ -711,8 +839,21 @@ RedisPeerManagement: ## Not eligible for live reload. {{ nonEmptyString .Data "Host" "PeerManagement.RedisHost" "localhost:6379" }} + ## ClusterHosts is a list of host and port pairs for the instances in a + ## Redis Cluster, and used for managing peer cluster membership. + ## + ## This configuration enables Refinery to connect to a Redis deployment + ## setup in Cluster Mode. Each entry in the list should follow the format + ## `host:port`. If `ClusterHosts` is specified, the `Host` setting will + ## be ignored. + ## + ## Not eligible for live reload. + {{ renderStringarray .Data "ClusterHosts" "ClusterHosts" "- localhost:6379" }} + ## Username is the username used to connect to Redis for peer cluster - ## membership management. + ## membership management. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. ## ## Many Redis installations do not use this field. ## @@ -720,7 +861,9 @@ RedisPeerManagement: {{ nonEmptyString .Data "Username" "PeerManagement.RedisUsername" "" }} ## Password is the password used to connect to Redis for peer cluster - ## membership management. + ## membership management. Setting this value via a command line flag may + ## expose credentials - it is recommended to use the environment variable + ## or a configuration file. ## ## Many Redis installations do not use this field. ## @@ -728,35 +871,15 @@ RedisPeerManagement: {{ nonEmptyString .Data "Password" "PeerManagement.Password" "" }} ## AuthCode is the string used to connect to Redis for peer cluster - ## membership management using an explicit AUTH command. + ## membership management using an explicit AUTH command. Setting this + ## value via a command line flag may expose credentials - it is + ## recommended to use the environment variable or a configuration file. ## ## Many Redis installations do not use this field. ## ## Not eligible for live reload. {{ nonEmptyString .Data "AuthCode" "PeerManagement.AuthCode" "" }} - ## Prefix is a string used as a prefix for the keys in Redis while - ## storing the peer membership. - ## - ## It might be useful to override this in any situation where multiple - ## Refinery clusters or multiple applications want to share a single - ## Redis instance. It may not be blank. - ## - ## default: refinery - ## Not eligible for live reload. 
- {{ nonDefaultOnly .Data "Prefix" "PeerManagement.Prefix" "refinery" }} - - ## Database is the database number to use for the Redis instance storing - ## the peer membership. - ## - ## An integer from 0-15 indicating the database number to use for the - ## Redis instance storing the peer membership. It might be useful to set - ## this in any situation where multiple Refinery clusters or multiple - ## applications want to share a single Redis instance. - ## - ## Not eligible for live reload. - {{ nonDefaultOnly .Data "Database" "PeerManagement.Database" 0 }} - ## UseTLS enables TLS when connecting to Redis for peer cluster ## membership management. ## @@ -798,12 +921,13 @@ Collection: ## CacheCapacity is the number of traces to keep in the cache's circular ## buffer. ## - ## The collection cache is used to collect all spans into a trace as well - ## as remember the sampling decision for any spans that might come in - ## after the trace has been marked "complete" (either by timing out or - ## seeing the root span). The number of traces in the cache should be - ## many multiples (100x to 1000x) of the total number of concurrently - ## active traces (trace throughput * trace duration). + ## The collection cache is used to collect all active spans into traces. + ## It is organized as a circular buffer. When the buffer wraps around, + ## Refinery will try a few times to find an empty slot; if it fails, it + ## starts ejecting traces from the cache earlier than would otherwise be + ## necessary. Ideally, the size of the cache should be many multiples + ## (100x to 1000x) of the total number of concurrently active traces + ## (average trace throughput * average trace duration). ## ## default: 10000 ## Eligible for live reload. @@ -878,6 +1002,33 @@ Collection: ## Eligible for live reload. {{ memorysize .Data "MaxAlloc" "InMemCollector.MaxAlloc" "" }} + ## DisableRedistribution controls whether to transmit traces in cache to + ## remaining peers during cluster scaling event. + ## + ## If `true`, Refinery will NOT forward live traces in its cache to the + ## rest of the peers when peers join or leave the cluster. By disabling + ## this behavior, it can help to prevent disruptive bursts of network + ## traffic when large traces with long `TraceTimeout` are redistributed. + ## + ## Eligible for live reload. + {{ nonDefaultOnly .Data "DisableRedistribution" "DisableRedistribution" false }} + + ## ShutdownDelay controls the maximum time Refinery can use while + ## draining traces at shutdown. + ## + ## This setting controls the duration that Refinery expects to have to + ## drain in-process traces before shutting down an instance. When asked + ## to shut down gracefully, Refinery stops accepting new spans + ## immediately and drains the remaining traces by sending them to + ## remaining peers. This value should be set to a bit less than the + ## normal timeout period for shutting down without forcibly terminating + ## the process. + ## + ## Accepts a duration string with units, like "15s". + ## default: 15s + ## Eligible for live reload. + {{ nonDefaultOnly .Data "ShutdownDelay" "ShutdownDelay" "15s" }} + ################## ## Buffer Sizes ## ################## @@ -994,6 +1145,7 @@ GRPCServerParameters: ## If `false`, then the gRPC server is not started and no gRPC traffic is ## accepted. ## + ## default: true ## Not eligible for live reload. {{ conditional .Data "Enabled" "nonempty GRPCListenAddr" }} @@ -1074,7 +1226,7 @@ GRPCServerParameters: ## memory available to the process by a single request. 
The size is
   ## expressed in bytes.
   ##
-  ## default: 5MB
+  ## default: 15MB
   ## Not eligible for live reload.
   {{ memorysize .Data "MaxSendMsgSize" "MaxSendMsgSize" "" }}
 
@@ -1084,7 +1236,7 @@
   ## memory available to the process by a single request. The size is
   ## expressed in bytes.
   ##
-  ## default: 5MB
+  ## default: 15MB
   ## Not eligible for live reload.
   {{ memorysize .Data "MaxRecvMsgSize" "MaxRecvMsgSize" "" }}
 
@@ -1225,31 +1377,16 @@ StressRelief:
   ## Eligible for live reload.
   {{ nonDefaultOnly .Data "MinimumActivationDuration" "StressRelief.MinimumActivationDuration" "10s" }}
 
-  ## MinimumStartupDuration is the minimum time that Stress Relief will
-  ## stay enabled.
-  ##
-  ## This setting is used when switching into Monitor mode.
-  ## When Stress Relief is enabled, it will start up in stressed mode for
-  ## at least this set duration of time to try to make sure that Refinery
-  ## can handle the load before it begins processing it in earnest. This is
-  ## to help address the problem of trying to bring a new node into an
-  ## already-overloaded cluster.
-  ## If this duration is `0`, then Refinery will not start in stressed
-  ## mode, which will provide faster startup at the possible cost of
-  ## startup instability.
-  ##
-  ## Accepts a duration string with units, like "3s".
-  ## default: 3s
-  ## Eligible for live reload.
-  {{ nonDefaultOnly .Data "MinimumStartupDuration" "StressRelief.MinimumStartupDuration" "3s" }}
-
 ###################################################
 ## Config values removed by the config converter ##
 ###################################################
   ## The following configuration options are obsolete and are not included
   ## in the new configuration:
   ##
+  ## - PeerManagement.Prefix
+  ## - PeerManagement.Database
   ## - PeerManagement.Strategy
   ## - Collector
   ## - InMemCollector.CacheOverrunStrategy
   ## - SampleCacheConfig/SampleCache.Type
+  ## - StressRelief.MinimumStartupDuration
diff --git a/transmit/transmit.go b/transmit/transmit.go
index b98ef658c0..6a5790646e 100644
--- a/transmit/transmit.go
+++ b/transmit/transmit.go
@@ -56,10 +56,7 @@ func (d *DefaultTransmission) Start() error {
 
 	// upstreamAPI doesn't get set when the client is initialized, because
 	// it can be reloaded from the config file while live
-	upstreamAPI, err := d.Config.GetHoneycombAPI()
-	if err != nil {
-		return err
-	}
+	upstreamAPI := d.Config.GetHoneycombAPI()
 
 	d.builder = d.LibhClient.NewBuilder()
 	d.builder.APIHost = upstreamAPI
@@ -83,13 +80,9 @@ func (d *DefaultTransmission) Start() error {
 	return nil
 }
 
-func (d *DefaultTransmission) reloadTransmissionBuilder() {
+func (d *DefaultTransmission) reloadTransmissionBuilder(cfgHash, ruleHash string) {
 	d.Logger.Debug().Logf("reloading transmission config")
-	upstreamAPI, err := d.Config.GetHoneycombAPI()
-	if err != nil {
-		// log and skip reload
-		d.Logger.Error().Logf("Failed to reload Honeycomb API when reloading configs:", err)
-	}
+	upstreamAPI := d.Config.GetHoneycombAPI()
 	builder := d.LibhClient.NewBuilder()
 	builder.APIHost = upstreamAPI
 }
diff --git a/types/event.go b/types/event.go
index cecb53e4da..4a9c9f1979 100644
--- a/types/event.go
+++ b/types/event.go
@@ -3,6 +3,8 @@ package types
 import (
 	"context"
 	"time"
+
+	huskyotlp "github.com/honeycombio/husky/otlp"
 )
 
 const (
@@ -40,11 +42,12 @@ type Trace struct {
 	TraceID string
 
 	// SampleRate should only be changed if the changer holds the SendSampleLock
-	SampleRate uint
+	sampleRate uint
 	// KeepSample should only be changed if the changer holds the SendSampleLock
KeepSample bool // Sent should only be changed if the changer holds the SendSampleLock - Sent bool + Sent bool + sentReason uint SendBy time.Time @@ -99,6 +102,26 @@ func (t *Trace) GetSpans() []*Span { return t.spans } +func (t *Trace) ID() string { + return t.TraceID +} + +func (t *Trace) SampleRate() uint { + return t.sampleRate +} + +func (t *Trace) SetSampleRate(rate uint) { + t.sampleRate = rate +} + +func (t *Trace) SentReason() uint { + return t.sentReason +} + +func (t *Trace) SetSentReason(reason uint) { + t.sentReason = reason +} + // DescendantCount gets the number of descendants of all kinds currently in this trace func (t *Trace) DescendantCount() uint32 { return uint32(len(t.spans)) @@ -232,5 +255,5 @@ func (sp *Span) CacheImpact(traceTimeout time.Duration) int { } func IsLegacyAPIKey(apiKey string) bool { - return len(apiKey) == 32 + return huskyotlp.IsClassicApiKey(apiKey) }
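
The rules files above (smoke-test/rules.yaml and the test/*_rules.yaml variants) all aim the TheNewWorld environment at a throughput goal of 50 spans per second, keyed on the title field. The shared idea behind the TotalThroughput, EMAThroughput, and WindowedThroughput samplers is to convert an observed per-key span volume into a sample rate that holds kept traffic near GoalThroughputPerSec. A minimal sketch of that arithmetic — a simplification for illustration, not the dynsampler-go algorithms Refinery actually ships:

package main

import "fmt"

// effectiveRate sketches how a throughput-style sampler might derive a
// per-key sample rate: if a key produced `observed` spans during the
// interval, keeping 1-in-N with N = observed / (goalPerSec * intervalSecs)
// holds kept traffic near goalPerSec. This is a simplification, not
// Refinery's actual dynsampler-go implementation.
func effectiveRate(observed int, goalPerSec, intervalSecs float64) int {
	goal := goalPerSec * intervalSecs
	if goal <= 0 || float64(observed) <= goal {
		return 1 // under the goal: keep everything
	}
	return int(float64(observed)/goal + 0.5)
}

func main() {
	// 2000 spans with title="checkout" in a 5s window at a goal of
	// 50/sec: keep roughly 1 in 8.
	fmt.Println(effectiveRate(2000, 50, 5)) // 8
}

The three samplers differ mainly in how they accumulate `observed` over time: TotalThroughput resets its counts every ClearFrequency, EMAThroughput smooths them with an exponential moving average each AdjustmentInterval, and WindowedThroughput slides a LookbackFrequency window updated every UpdateFrequency.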
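ConfigReloadInterval, as described in the template comments, staggers periodic reloads with up to 10% added jitter so a whole fleet does not refresh in lockstep, and a value of 0s disables the timer entirely. A sketch of that timing logic, assuming a plain timer loop rather than Refinery's internal config watcher:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// nextReload sketches the documented behavior: wait ConfigReloadInterval
// plus up to 10% random jitter; an interval of 0 disables periodic
// reloading (peers still learn of changes via pubsub, per the comments).
func nextReload(interval time.Duration) time.Duration {
	if interval <= 0 {
		return 0 // disabled; rely on pubsub notification or restarts
	}
	jitter := time.Duration(rand.Int63n(int64(interval)/10 + 1))
	return interval + jitter
}

func main() {
	fmt.Println(nextReload(15 * time.Second)) // e.g. 16.2s
}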
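The SendKeyMode options are easiest to check against a decision table. The sketch below transcribes the six documented modes; it is illustrative rather than Refinery's implementation, and it sets aside the additional AcceptOnlyListedKeys interaction described in the template comments:

package main

import "fmt"

// chooseKey transcribes the documented SendKeyMode decision table.
func chooseKey(mode, incoming, sendKey string, listed map[string]bool) string {
	switch mode {
	case "all":
		return sendKey // overwrite every key, even missing ones
	case "nonblank":
		if incoming != "" {
			return sendKey // overwrite supplied keys, never inject
		}
	case "listedonly":
		if listed[incoming] {
			return sendKey // overwrite only keys in ReceiveKeys
		}
	case "unlisted":
		if !listed[incoming] {
			return sendKey // listed keys keep their original value
		}
	case "missingonly":
		if incoming == "" {
			return sendKey // inject only when the key is blank
		}
	}
	return incoming // "none" and all fall-through cases
}

func main() {
	listed := map[string]bool{"key-a": true}
	fmt.Println(chooseKey("unlisted", "key-b", "SEND", listed)) // SEND
	fmt.Println(chooseKey("unlisted", "key-a", "SEND", listed)) // key-a
}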
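Taken together, SendDelay, TraceTimeout, and SpanLimit define when a trace becomes eligible for a sampling decision: root arrival starts the SendDelay clock, while TraceTimeout expiry and SpanLimit force a decision without SendDelay. A compact sketch of that eligibility check — the span-limit reason string is a placeholder, since the comments above only name the root-arrival and expiry reasons:

package main

import (
	"fmt"
	"time"
)

// decisionDue sketches when a trace becomes eligible for a sampling
// decision per the config comments. Illustrative only; not Refinery's
// collector code.
func decisionDue(rootArrived bool, sinceRoot, sendDelay time.Duration,
	age, traceTimeout time.Duration, spanCount, spanLimit int) (bool, string) {
	if spanLimit > 0 && spanCount > spanLimit {
		return true, "span_limit_reached" // placeholder reason name
	}
	if age >= traceTimeout {
		return true, "trace_send_expired"
	}
	if rootArrived && sinceRoot >= sendDelay {
		return true, "trace_send_got_root"
	}
	return false, ""
}

func main() {
	due, reason := decisionDue(true, 3*time.Second, 2*time.Second,
		10*time.Second, 60*time.Second, 40, 0)
	fmt.Println(due, reason) // true trace_send_got_root
}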
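The smoke-test compose file runs Redis as TLS-only (plaintext port 0, TLS on 6379) with generated self-signed certificates, which is why smoke-test/config.yaml sets both UseTLS and UseTLSInsecure. The same connection settings can be exercised from Go with the github.com/redis/go-redis/v9 client — an assumption made for illustration, not necessarily the client library Refinery itself uses:

package main

import (
	"context"
	"crypto/tls"
	"fmt"

	"github.com/redis/go-redis/v9"
)

func main() {
	// Mirrors the smoke-test settings: TLS on, certificate verification
	// off (UseTLSInsecure: true), against the dockerized redis on 6379.
	rdb := redis.NewClient(&redis.Options{
		Addr:      "localhost:6379",
		TLSConfig: &tls.Config{InsecureSkipVerify: true},
	})
	pong, err := rdb.Ping(context.Background()).Result()
	fmt.Println(pong, err) // "PONG <nil>" once gen-certs has run
}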
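The rewritten CacheCapacity description pins down the collector cache's behavior: a circular buffer that probes a few slots for a free one on wraparound and otherwise ejects a resident trace earlier than would otherwise be necessary. A toy version of that insert path, not Refinery's collect/cache package:

package main

import "fmt"

// traceCache sketches the described circular buffer: on insert, probe a
// few slots for an empty one; failing that, eject the trace occupying
// the current slot early and take its place.
type traceCache struct {
	slots []string // trace IDs; "" means empty
	next  int
}

func (c *traceCache) insert(id string) (ejected string) {
	const probes = 4
	for i := 0; i < probes; i++ {
		idx := (c.next + i) % len(c.slots)
		if c.slots[idx] == "" {
			c.slots[idx] = id
			c.next = (idx + 1) % len(c.slots)
			return ""
		}
	}
	// No free slot nearby: eject the resident trace before its time.
	idx := c.next
	ejected, c.slots[idx] = c.slots[idx], id
	c.next = (idx + 1) % len(c.slots)
	return ejected
}

func main() {
	c := &traceCache{slots: make([]string, 3)}
	for _, id := range []string{"t1", "t2", "t3", "t4"} {
		if e := c.insert(id); e != "" {
			fmt.Println("ejected early:", e) // ejected early: t1
		}
	}
}

This is why the comments recommend sizing the cache at 100x to 1000x the number of concurrently active traces: a mostly empty buffer makes early ejection rare.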
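The MaxSendMsgSize and MaxRecvMsgSize defaults move from 5MB to 15MB. In grpc-go these limits are ordinary server options (the library's own receive default is 4MB); a sketch of applying the new defaults, assuming the standard google.golang.org/grpc API rather than Refinery's server wiring:

package main

import (
	"log"
	"net"

	"google.golang.org/grpc"
)

func main() {
	// 15MB matches the new MaxSendMsgSize/MaxRecvMsgSize defaults.
	const maxMsg = 15 * 1024 * 1024
	srv := grpc.NewServer(
		grpc.MaxRecvMsgSize(maxMsg),
		grpc.MaxSendMsgSize(maxMsg),
	)
	lis, err := net.Listen("tcp", "0.0.0.0:4317") // typical OTLP/gRPC port
	if err != nil {
		log.Fatal(err)
	}
	log.Fatal(srv.Serve(lis))
}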
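Finally, IsLegacyAPIKey previously treated any 32-character key as a Honeycomb Classic key; delegating to husky's IsClassicApiKey centralizes that definition, so any key shape husky considers classic is recognized consistently. Usage is unchanged for callers — the key below is a made-up example:

package main

import (
	"fmt"

	huskyotlp "github.com/honeycombio/husky/otlp"
)

func main() {
	// A classic configuration key is 32 hex characters; husky's check
	// covers that shape without callers hard-coding len(key) == 32.
	key := "c1a551c000d68f9ed1e96432ac1a3380" // hypothetical key
	fmt.Println(huskyotlp.IsClassicApiKey(key))
}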