diff --git a/.ci/Jenkinsfile b/.ci/Jenkinsfile
index 496a42250..7b2d0ed49 100644
--- a/.ci/Jenkinsfile
+++ b/.ci/Jenkinsfile
@@ -10,8 +10,8 @@ pipeline {
     DOCKER_COMPOSE_VERSION = '1.25.5'
     JOB_GIT_CREDENTIALS = "f6c7695a-671e-4f4f-a331-acdce44ff9ba"
     PIPELINE_LOG_LEVEL='INFO'
-    JOB_GCS_BUCKET = 'beats-ci-artifacts'
-    JOB_GCS_CREDENTIALS = 'beats-ci-gcs-plugin-file-credentials'
+    JOB_GCS_BUCKET = 'fleet-ci-artifacts'
+    JOB_GCS_CREDENTIALS = 'fleet-ci-gcs-plugin-file-credentials'
   }
   options {
     timeout(time: 1, unit: 'HOURS')
@@ -139,7 +139,7 @@ pipeline {
       options { skipDefaultCheckout() }
       when { expression { isBranch() } }
       steps {
-        build(job: "Ingest-manager/fleet-server-package-mbp/${env.JOB_BASE_NAME}",
+        build(job: "fleet-server/fleet-server-package-mbp/${env.JOB_BASE_NAME}",
               propagate: false,
              wait: false,
              parameters: [string(name: 'COMMIT', value: "${env.GIT_BASE_COMMIT}")])
diff --git a/.ci/jobs/defaults.yml b/.ci/jobs/defaults.yml
index d749f5b0c..0e3defc4e 100644
--- a/.ci/jobs/defaults.yml
+++ b/.ci/jobs/defaults.yml
@@ -3,7 +3,7 @@
 ##### GLOBAL METADATA
 
 - meta:
-    cluster: beats-ci
+    cluster: fleet-ci
 
 ##### JOB DEFAULTS
 
@@ -15,4 +15,4 @@
     publishers:
       - email:
           recipients: infra-root+build@elastic.co
-    prune-dead-branches: true
\ No newline at end of file
+    prune-dead-branches: true
diff --git a/.ci/jobs/fleet-server-package-mbp.yml b/.ci/jobs/fleet-server-package-mbp.yml
index 2a519cb73..d911c7bf9 100644
--- a/.ci/jobs/fleet-server-package-mbp.yml
+++ b/.ci/jobs/fleet-server-package-mbp.yml
@@ -1,6 +1,6 @@
 ---
 - job:
-    name: Ingest-manager/fleet-server-package-mbp
+    name: fleet-server/fleet-server-package-mbp
     display-name: Fleet Server Package
     description: Jenkins pipeline for the Elastic Fleet Server package process
     project-type: multibranch
@@ -53,4 +53,4 @@
       reference-repo: /var/lib/jenkins/.git-references/fleet-server.git
       timeout: '15'
       use-author: true
-      wipe-workspace: 'True'
+      wipe-workspace: true
diff --git a/.ci/jobs/fleet-server.yml b/.ci/jobs/fleet-server.yml
index 5c71b0094..18ff2884b 100644
--- a/.ci/jobs/fleet-server.yml
+++ b/.ci/jobs/fleet-server.yml
@@ -1,6 +1,6 @@
 ---
 - job:
-    name: Ingest-manager/fleet-server
+    name: fleet-server/fleet-server-mbp
     display-name: Fleet Server
     description: Jenkins pipeline for the Elastic Fleet Server project
     view: Beats
diff --git a/.ci/jobs/folders.yml b/.ci/jobs/folders.yml
index c0cd03617..178ab4dc6 100644
--- a/.ci/jobs/folders.yml
+++ b/.ci/jobs/folders.yml
@@ -1,10 +1,10 @@
 ---
 #https://docs.openstack.org/infra/jenkins-job-builder/project_folder.html
 - job:
-    name: Ingest-manager
-    description: Ingest manager related Jobs
+    name: fleet-server
+    description: Fleet Server related Jobs
     project-type: folder
 
 - view:
-    name: Ingest-manager
-    view-type: list
\ No newline at end of file
+    name: fleet-server
+    view-type: list
diff --git a/.ci/packaging.groovy b/.ci/packaging.groovy
index 4e8d6760f..905855c42 100644
--- a/.ci/packaging.groovy
+++ b/.ci/packaging.groovy
@@ -9,7 +9,7 @@ pipeline {
     SLACK_CHANNEL = '#elastic-agent-control-plane'
     NOTIFY_TO = 'fleet-server+build-package@elastic.co'
     JOB_GCS_BUCKET = credentials('gcs-bucket')
-    JOB_GCS_CREDENTIALS = 'beats-ci-gcs-plugin'
+    JOB_GCS_CREDENTIALS = 'fleet-ci-gcs-plugin'
     DOCKER_SECRET = 'secret/observability-team/ci/docker-registry/prod'
     DOCKER_REGISTRY = 'docker.elastic.co'
     DRA_OUTPUT = 'release-manager.out'
@@ -133,7 +133,7 @@ pipeline {
       }
     }
-    stage('DRA Staging') {
+    stage('DRA Release Staging') {
       options { skipDefaultCheckout() }
       when {
        allOf {
diff --git a/.github/workflows/add-issues-to-ingest-board.yml b/.github/workflows/add-issues-to-ingest-board.yml
new file mode 100644
index 000000000..ad2f01cbd
--- /dev/null
+++ b/.github/workflows/add-issues-to-ingest-board.yml
@@ -0,0 +1,51 @@
+name: Add issue to Platform Ingest project
+
+on:
+  issues:
+    types:
+      - labeled
+
+env:
+  INGEST_PROJECT_ID: 'PVT_kwDOAGc3Zs4AEzn4'
+  FLEET_LABEL: 'Team:Fleet'
+  AREA_FIELD_ID: 'PVTSSF_lADOAGc3Zs4AEzn4zgEgZSo'
+  FLEET_SERVER_OPTION_ID: 'ea828bb4'
+
+jobs:
+  add_to_ingest_project:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: octokit/graphql-action@v2.x
+        id: add_to_project
+        if: ${{ github.event.label.name == env.FLEET_LABEL }}
+        with:
+          query: |
+            # Variables have to be snake cased because of https://github.com/octokit/graphql-action/issues/164
+            mutation AddToIngestProject($project_id: ID!, $content_id: ID!) {
+              addProjectV2ItemById(input: { projectId: $project_id, contentId: $content_id }) {
+                item {
+                  id
+                }
+              }
+            }
+          project_id: ${{ env.INGEST_PROJECT_ID }}
+          content_id: ${{ github.event.issue.node_id }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.PROJECT_ASSIGNER_TOKEN }}
+      - uses: octokit/graphql-action@v2.x
+        id: set_fleet_server_area
+        if: github.event.label.name == env.FLEET_LABEL
+        with:
+          query: |
+            mutation updateIngestArea($item_id: ID!, $project_id: ID!, $area_field_id: ID!, $area_id: String) {
+              updateProjectV2ItemFieldValue(
+                input: { itemId: $item_id, projectId: $project_id, fieldId: $area_field_id, value: { singleSelectOptionId: $area_id } }) {
+                clientMutationId
+              }
+            }
+          item_id: ${{ fromJSON(steps.add_to_project.outputs.data).addProjectV2ItemById.item.id }}
+          project_id: ${{ env.INGEST_PROJECT_ID }}
+          area_field_id: ${{ env.AREA_FIELD_ID }}
+          area_id: ${{ env.FLEET_SERVER_OPTION_ID }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.PROJECT_ASSIGNER_TOKEN }}
diff --git a/.go-version b/.go-version
index 8e8b0a933..d6f3a382b 100644
--- a/.go-version
+++ b/.go-version
@@ -1 +1 @@
-1.18.5
+1.18.7
diff --git a/.mergify.yml b/.mergify.yml
index 4828585ea..3de01e069 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -144,3 +144,16 @@ pull_request_rules:
         labels:
           - "backport"
         title: "[{{ destination_branch }}](backport #{{ number }}) {{ title }}"
+  - name: backport patches to 8.5 branch
+    conditions:
+      - merged
+      - label=backport-v8.5.0
+    actions:
+      backport:
+        assignees:
+          - "{{ author }}"
+        branches:
+          - "8.5"
+        labels:
+          - "backport"
+        title: "[{{ destination_branch }}](backport #{{ number }}) {{ title }}"
diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index b0fde4951..67546b1da 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -1,8 +1,22 @@
+==== Breaking Changes
+
+- Upgrade to Go 1.18. Certificates signed with SHA-1 are now rejected. See the Go 1.18 https://tip.golang.org/doc/go1.18#sha1[release notes] for details. {pull}1709[1709]
+
 ==== Bugfixes
 
 - Return a better error on enrolling and the Elasticsearch version is incompatible. {pull}1211[1211]
 - Give a grace period when starting the unenroll monitor. {issue}1500[1500]
 - Fixes a race condition between the unenroller goroutine and the main goroutine for the coordinator monitor. {issues}1738[1738]
+- Remove events from agent checkin body. {issue}1774[1774]
+- Improve authc debug logging. {pull}1870[1870]
+- Add error detail to catch-all HTTP error response. {pull}1854[1854]
+- Fix issue where errors written to Elasticsearch were being ignored. {pull}1896[1896]
+- Update apikey.cache_hit log field name to match convention. {pull}1900[1900]
+- LoadServerLimits will no longer overwrite specified limits when loading default or agent-count derived values. {issue}1841[1841] {pull}1912[1912]
+- Use separate rate limiters for internal and external API listeners. {issue}1859[1859] {pull}1904[1904]
+- Fix fleet.migration.total log key overlap. {pull}1951[1951]
+- Remove POLICY_CHANGE actions from the list retrieved from the actions index before sending actions to the agent on checkin. {issue}1773[1773] {pull}1963[1963]
+- Add "active: true" filter to enrollment key queries. {issue}2029[2029] {pull}2044[2044]
 
 ==== New Features
 
@@ -12,4 +26,6 @@
 - Add start_time and minimum_execution_duration to actions to allow fleet-server to schedule agent actions. {pull}1381[1381]
 - Fleet Server now allows setting global labels on APM instrumentation. {pull}1649[1649]
 - Fleet Server now allows setting transaction sample rate on APM instrumentation {pull}1681[1681]
-- Log redacted config when config updates. {issue}1626[1626] {pull}1668[1668]
\ No newline at end of file
+- Log redacted config when config updates. {issue}1626[1626] {pull}1668[1668]
+- Store the checkin message in last_checkin_message. {pull}1932[1932]
+- Allow upgrade actions to signal that they will be retried. {pull}1887[1887]
diff --git a/NOTICE.txt b/NOTICE.txt
index dc18b37dd..58b0d9d69 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -227,11 +227,11 @@ Contents of probable licence file $GOMODCACHE/github.com/dgraph-io/ristretto@v0.
 
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent
-Version: v0.0.0-20220831162706-5f1e54f40d3e
+Version: v0.0.0-20221107053805-657f66dad4bd
 Licence type (autodetected): Elastic
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent@v0.0.0-20220831162706-5f1e54f40d3e/LICENSE.txt:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent@v0.0.0-20221107053805-657f66dad4bd/LICENSE.txt:
 
 Elastic License 2.0
 
@@ -563,11 +563,11 @@ SOFTWARE
 
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent-libs
-Version: v0.2.6
+Version: v0.2.14
 Licence type (autodetected): Apache-2.0
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.2.6/LICENSE:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-libs@v0.2.14/LICENSE:
 
 Apache License
 Version 2.0, January 2004
 
@@ -9133,11 +9133,11 @@ you may not use this file except in compliance with the Elastic License.
 
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent-autodiscover
-Version: v0.0.0-20220404145827-89887023c1ab
+Version: v0.2.1
 Licence type (autodetected): Apache-2.0
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.0.0-20220404145827-89887023c1ab/LICENSE:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.2.1/LICENSE:
 
 Apache License
 Version 2.0, January 2004
 
@@ -9556,11 +9556,11 @@ Contents of probable licence file $GOMODCACHE/github.com/elastic/go-licenser@v0.
-------------------------------------------------------------------------------- Dependency : github.com/elastic/go-structform -Version: v0.0.9 +Version: v0.0.10 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/go-structform@v0.0.9/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/go-structform@v0.0.10/LICENSE: Apache License Version 2.0, January 2004 @@ -14217,6 +14217,43 @@ Contents of probable licence file $GOMODCACHE/github.com/googleapis/gnostic@v0.5 +-------------------------------------------------------------------------------- +Dependency : github.com/gorilla/mux +Version: v1.8.0 +Licence type (autodetected): BSD-3-Clause +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/github.com/gorilla/mux@v1.8.0/LICENSE: + +Copyright (c) 2012-2018 The Gorilla Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + -------------------------------------------------------------------------------- Dependency : github.com/grpc-ecosystem/grpc-gateway Version: v1.16.0 @@ -25918,6 +25955,217 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +-------------------------------------------------------------------------------- +Dependency : go.elastic.co/apm/module/apmgorilla +Version: v1.15.0 +Licence type (autodetected): Apache-2.0 +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/go.elastic.co/apm/module/apmgorilla@v1.15.0/LICENSE: + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018 Elasticsearch BV + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+
+
 --------------------------------------------------------------------------------
 Dependency : go.elastic.co/apm/module/apmgrpc
 Version: v1.15.0
diff --git a/README.md b/README.md
index 1ca907f1a..d39be5dba 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[![Build Status](https://beats-ci.elastic.co/job/Ingest-manager/job/fleet-server/job/main/badge/icon)](https://beats-ci.elastic.co/job/Ingest-manager/job/fleet-server/job/main/)
+[![Build Status](https://fleet-ci.elastic.co/job/fleet-server/job/fleet-server-mbp/job/main/badge/icon)](https://fleet-ci.elastic.co/job/fleet-server/job/fleet-server-mbp/job/main/)
 
 # Fleet Server implementation
diff --git a/cmd/fleet/main.go b/cmd/fleet/main.go
index 61618eb12..1588112f7 100644
--- a/cmd/fleet/main.go
+++ b/cmd/fleet/main.go
@@ -8,61 +8,25 @@ package fleet
 import (
 	"context"
 	"errors"
-	"fmt"
-	"io"
-	"net/url"
 	"os"
-	"reflect"
-	"runtime/debug"
-	"sync"
-	"time"
 
 	"go.elastic.co/apm"
-	apmtransport "go.elastic.co/apm/transport"
 
-	"github.com/elastic/go-ucfg"
 	"github.com/elastic/go-ucfg/yaml"
 
-	"github.com/elastic/fleet-server/v7/internal/pkg/action"
-	"github.com/elastic/fleet-server/v7/internal/pkg/api"
 	"github.com/elastic/fleet-server/v7/internal/pkg/build"
-	"github.com/elastic/fleet-server/v7/internal/pkg/bulk"
-	"github.com/elastic/fleet-server/v7/internal/pkg/cache"
-	"github.com/elastic/fleet-server/v7/internal/pkg/checkin"
 	"github.com/elastic/fleet-server/v7/internal/pkg/config"
-	"github.com/elastic/fleet-server/v7/internal/pkg/coordinator"
-	"github.com/elastic/fleet-server/v7/internal/pkg/dl"
-	"github.com/elastic/fleet-server/v7/internal/pkg/es"
-	"github.com/elastic/fleet-server/v7/internal/pkg/gc"
 	"github.com/elastic/fleet-server/v7/internal/pkg/logger"
-	"github.com/elastic/fleet-server/v7/internal/pkg/monitor"
-	"github.com/elastic/fleet-server/v7/internal/pkg/policy"
-	"github.com/elastic/fleet-server/v7/internal/pkg/profile"
-	"github.com/elastic/fleet-server/v7/internal/pkg/reload"
-	"github.com/elastic/fleet-server/v7/internal/pkg/scheduler"
+	"github.com/elastic/fleet-server/v7/internal/pkg/server"
 	"github.com/elastic/fleet-server/v7/internal/pkg/signal"
-	"github.com/elastic/fleet-server/v7/internal/pkg/sleep"
 	"github.com/elastic/fleet-server/v7/internal/pkg/state"
-	"github.com/elastic/fleet-server/v7/internal/pkg/ver"
 
-	"github.com/hashicorp/go-version"
-	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"github.com/spf13/cobra"
-	"golang.org/x/sync/errgroup"
-
-	"github.com/elastic/elastic-agent-client/v7/pkg/client"
 )
 
 const (
-	kAgentMode                 = "agent-mode"
-	kAgentModeRestartLoopDelay = 2 * time.Second
-
-	kFleetServer   = "fleet-server"
-	kUAFleetServer = "Fleet-Server"
-	kElasticsearch = "elasticsearch"
-
-	kStopped = "Stopped"
+	kAgentMode = "agent-mode"
 )
 
 func init() {
@@ -75,26 +39,6 @@ func installSignalHandler() context.Context {
 	return signal.HandleInterrupt(rootCtx)
 }
 
-func makeCache(cfg *config.Config) (cache.Cache, error) {
-	cacheCfg := makeCacheConfig(cfg)
-	log.Info().Interface("cfg", cacheCfg).Msg("Setting cache config options")
-	return cache.New(cacheCfg)
-}
-
-func makeCacheConfig(cfg *config.Config) cache.Config {
-	ccfg := cfg.Inputs[0].Cache
-
-	return cache.Config{
-		NumCounters:  ccfg.NumCounters,
-		MaxCost:      ccfg.MaxCost,
-		ActionTTL:    ccfg.ActionTTL,
-		EnrollKeyTTL:
ccfg.EnrollKeyTTL, - ArtifactTTL: ccfg.ArtifactTTL, - APIKeyTTL: ccfg.APIKeyTTL, - APIKeyJitter: ccfg.APIKeyJitter, - } -} - func initLogger(cfg *config.Config, version, commit string) (*logger.Logger, error) { l, err := logger.Init(cfg, build.ServiceName) if err != nil { @@ -109,7 +53,7 @@ func initLogger(cfg *config.Config, version, commit string) (*logger.Logger, err Str("exe", os.Args[0]). Strs("args", os.Args[1:]). Msg("Boot fleet-server") - log.Debug().Strs("env", os.Environ()).Msg("environment") + log.Debug().Strs("env", os.Environ()).Msg("environment variables") return l, err } @@ -125,7 +69,6 @@ func getRunCommand(bi build.Info) func(cmd *cobra.Command, args []string) error } var l *logger.Logger - var runErr error if agentMode { cfg, err := config.FromConfig(cliCfg) if err != nil { @@ -136,12 +79,16 @@ func getRunCommand(bi build.Info) func(cmd *cobra.Command, args []string) error return err } - agent, err := NewAgentMode(cliCfg, os.Stdin, bi, l) + srv, err := server.NewAgent(cliCfg, os.Stdin, bi, l) if err != nil { return err } - runErr = agent.Run(installSignalHandler()) + if err := srv.Run(installSignalHandler()); err != nil && !errors.Is(err, context.Canceled) { + log.Error().Err(err).Msg("Exiting") + l.Sync() + return err + } } else { cfgPath, err := cmd.Flags().GetString("config") if err != nil { @@ -165,19 +112,18 @@ func getRunCommand(bi build.Info) func(cmd *cobra.Command, args []string) error return err } - srv, err := NewFleetServer(bi, state.NewLog()) + srv, err := server.NewFleet(bi, state.NewLog()) if err != nil { return err } - runErr = srv.Run(installSignalHandler(), cfg) + if err := srv.Run(installSignalHandler(), cfg); err != nil && !errors.Is(err, context.Canceled) { + log.Error().Err(err).Msg("Exiting") + l.Sync() + return err + } } - if runErr != nil && !errors.Is(runErr, context.Canceled) { - log.Error().Err(runErr).Msg("Exiting") - l.Sync() - return runErr - } l.Sync() return nil } @@ -194,946 +140,3 @@ func NewCommand(bi build.Info) *cobra.Command { cmd.Flags().VarP(config.NewFlag(), "E", "E", "Overwrite configuration value") return cmd } - -type AgentMode struct { - cliCfg *ucfg.Config - bi build.Info - reloadables []reload.Reloadable - - agent client.V2 - - outputUnit *client.Unit - inputUnit *client.Unit - - srv *FleetServer - srvCtx context.Context - srvCanceller context.CancelFunc - srvDone chan bool -} - -func NewAgentMode(cliCfg *ucfg.Config, reader io.Reader, bi build.Info, reloadables ...reload.Reloadable) (*AgentMode, error) { - var err error - - a := &AgentMode{ - cliCfg: cliCfg, - bi: bi, - reloadables: reloadables, - } - a.agent, _, err = client.NewV2FromReader(reader, client.VersionInfo{ - Name: kFleetServer, - Version: bi.Version, - Meta: map[string]string{ - "commit": bi.Commit, - "build_time": bi.BuildTime.String(), - }, - }) - if err != nil { - return nil, err - } - return a, nil -} - -func (a *AgentMode) Run(ctx context.Context) error { - subCtx, subCanceller := context.WithCancel(ctx) - defer subCanceller() - - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - - t := time.NewTicker(1 * time.Second) - defer t.Stop() - for { - select { - case <-subCtx.Done(): - return - case err := <-a.agent.Errors(): - if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.EOF) { - log.Error().Err(err) - } - case change := <-a.agent.UnitChanges(): - switch change.Type { - case client.UnitChangedAdded: - err := a.unitAdded(subCtx, change.Unit) - if err != nil { - log.Error().Str("unit", change.Unit.ID()).Err(err) - _ = 
change.Unit.UpdateState(client.UnitStateFailed, err.Error(), nil) - } - case client.UnitChangedModified: - err := a.unitModified(subCtx, change.Unit) - if err != nil { - log.Error().Str("unit", change.Unit.ID()).Err(err) - _ = change.Unit.UpdateState(client.UnitStateFailed, err.Error(), nil) - } - case client.UnitChangedRemoved: - a.unitRemoved(change.Unit) - } - case <-t.C: - // Fleet Server is the only component that gets started by Elastic Agent without an Agent ID. We loop - // here on interval waiting for the Elastic Agent to enroll so then the Agent ID is then set. - agentInfo := a.agent.AgentInfo() - if agentInfo != nil && agentInfo.ID != "" { - // Agent ID is not set for the component. - t.Stop() - err := a.reconfigure(subCtx) - if err != nil { - log.Error().Err(err) - } - } - } - } - }() - - log.Info().Msg("starting communication connection back to Elastic Agent") - err := a.agent.Start(subCtx) - if err != nil { - return err - } - - <-subCtx.Done() - wg.Wait() - - return nil -} - -// UpdateState updates the state of the message and payload. -func (a *AgentMode) UpdateState(state client.UnitState, message string, payload map[string]interface{}) error { - if a.inputUnit != nil { - _ = a.inputUnit.UpdateState(state, message, payload) - } - if a.outputUnit != nil { - _ = a.outputUnit.UpdateState(state, message, payload) - } - return nil -} - -func (a *AgentMode) unitAdded(ctx context.Context, unit *client.Unit) error { - if unit.Type() == client.UnitTypeInput { - _, _, cfg := unit.Expected() - if cfg.Type != kFleetServer { - // not support input type - _ = unit.UpdateState(client.UnitStateFailed, fmt.Sprintf("%s is an unsupported input type", cfg.Type), nil) - return nil - } - if a.inputUnit != nil { - // original input unit is being stopped; swapping in this unit as the new input unit - _ = a.inputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) - } - a.inputUnit = unit - if a.outputUnit == nil { - // waiting for output unit to really start Fleet Server - _ = unit.UpdateState(client.UnitStateStarting, "waiting for output unit", nil) - return nil - } - return a.start(ctx) - } - if unit.Type() == client.UnitTypeOutput { - _, _, cfg := unit.Expected() - if cfg.Type != kElasticsearch { - // not support output type - _ = unit.UpdateState(client.UnitStateFailed, fmt.Sprintf("%s is an unsupported output type", cfg.Type), nil) - return nil - } - if a.outputUnit != nil { - // original output unit is being stopped; swapping in this unit as the new output unit - _ = a.outputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) - } - a.outputUnit = unit - if a.inputUnit == nil { - // waiting for input unit to really start Fleet Server - _ = unit.UpdateState(client.UnitStateStarting, "waiting for input unit", nil) - return nil - } - return a.start(ctx) - } - return fmt.Errorf("unknown unit type %v", unit.Type()) -} - -func (a *AgentMode) unitModified(ctx context.Context, unit *client.Unit) error { - state, _, _ := unit.Expected() - if unit.Type() == client.UnitTypeInput { - if a.inputUnit != unit { - // not our input unit; would have been marked failed in unitAdded; do nothing - return nil - } - if state == client.UnitStateHealthy { - if a.outputUnit == nil { - // still no output unit; would have been marked starting already; do nothing - return nil - } - - // configuration modified (should still be running) - return a.reconfigure(ctx) - } else if state == client.UnitStateStopped { - // unit should be stopped - a.stop() - return nil - } - return fmt.Errorf("unknown unit state %v", 
state) - } - if unit.Type() == client.UnitTypeOutput { - if a.outputUnit != unit { - // not our output unit; would have been marked failed in unitAdded; do nothing - return nil - } - if state == client.UnitStateHealthy { - if a.inputUnit == nil { - // still no input unit; would have been marked starting already; do nothing - return nil - } - - // configuration modified (should still be running) - return a.reconfigure(ctx) - } else if state == client.UnitStateStopped { - // unit should be stopped - a.stop() - return nil - } - return fmt.Errorf("unknown unit state %v", state) - } - return fmt.Errorf("unknown unit type %v", unit.Type()) -} - -func (a *AgentMode) unitRemoved(unit *client.Unit) { - stop := false - if a.inputUnit == unit || a.outputUnit == unit { - stop = true - } - if stop { - a.stop() - } - if a.inputUnit == unit { - a.inputUnit = nil - } - if a.outputUnit == unit { - a.outputUnit = nil - } -} - -func (a *AgentMode) start(ctx context.Context) error { - if a.srv != nil { - return a.reconfigure(ctx) - } - - cfg, err := a.configFromUnits() - if err != nil { - return err - } - - // reload the generic reloadables - for _, r := range a.reloadables { - err = r.Reload(ctx, cfg) - if err != nil { - return err - } - } - - srvDone := make(chan bool) - srvCtx, srvCanceller := context.WithCancel(ctx) - srv, err := NewFleetServer(a.bi, state.NewChained(state.NewLog(), a)) - if err != nil { - close(srvDone) - srvCanceller() - return err - } - - go func() { - defer close(srvDone) - for { - err := srv.Run(srvCtx, cfg) - if err == nil || errors.Is(err, context.Canceled) { - return - } - // sleep some before calling Run again - _ = sleep.WithContext(srvCtx, kAgentModeRestartLoopDelay) - } - }() - - a.srv = srv - a.srvCtx = srvCtx - a.srvCanceller = srvCanceller - a.srvDone = srvDone - return nil -} - -func (a *AgentMode) reconfigure(ctx context.Context) error { - if a.srv == nil { - return a.start(ctx) - } - - cfg, err := a.configFromUnits() - if err != nil { - return err - } - - // reload the generic reloadables - for _, r := range a.reloadables { - err = r.Reload(ctx, cfg) - if err != nil { - return err - } - } - - return a.srv.Reload(ctx, cfg) -} - -func (a *AgentMode) stop() { - if a.srvCanceller == nil { - return - } - - canceller := a.srvCanceller - a.srvCanceller = nil - a.srvCtx = nil - a.srv = nil - canceller() - <-a.srvDone - a.srvDone = nil - - if a.inputUnit != nil { - _ = a.inputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) - } - if a.outputUnit != nil { - _ = a.outputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) - } -} - -// configFromUnits takes both inputUnit and outputUnit and creates a single configuration just like fleet server was -// being started from a configuration file. 
-func (a *AgentMode) configFromUnits() (*config.Config, error) { - agentID := "" - agentVersion := "" - agentInfo := a.agent.AgentInfo() - if agentInfo != nil { - agentID = agentInfo.ID - agentVersion = agentInfo.Version - } - _, inputLevel, inputCfg := a.inputUnit.Expected() - _, outputLevel, outputCfg := a.outputUnit.Expected() - logLevel := inputLevel - if outputLevel > logLevel { - logLevel = outputLevel - } - - cfgData, err := ucfg.NewFrom(map[string]interface{}{ - "fleet": map[string]interface{}{ - "agent": map[string]interface{}{ - "id": agentID, - "version": agentVersion, - "logging": map[string]interface{}{ - "level": logLevel.String(), - }, - }, - }, - "output": map[string]interface{}{ - "elasticsearch": outputCfg.Source.AsMap(), - }, - "inputs": []interface{}{ - inputCfg.Source.AsMap(), - }, - "logging": map[string]interface{}{ - "level": logLevel.String(), - }, - }) - if err != nil { - return nil, err - } - return config.FromConfig(cfgData) -} - -type FleetServer struct { - bi build.Info - verCon version.Constraints - - cfgCh chan *config.Config - cache cache.Cache - reporter state.Reporter -} - -// NewFleetServer creates the actual fleet server service. -func NewFleetServer(bi build.Info, reporter state.Reporter) (*FleetServer, error) { - verCon, err := api.BuildVersionConstraint(bi.Version) - if err != nil { - return nil, err - } - - return &FleetServer{ - bi: bi, - verCon: verCon, - cfgCh: make(chan *config.Config, 1), - reporter: reporter, - }, nil -} - -type runFunc func(context.Context) error - -type runFuncCfg func(context.Context, *config.Config) error - -// Run runs the fleet server -func (f *FleetServer) Run(ctx context.Context, initCfg *config.Config) error { - err := initCfg.LoadServerLimits() - if err != nil { - return fmt.Errorf("encountered error while loading server limits: %w", err) - } - cache, err := makeCache(initCfg) - if err != nil { - return err - } - f.cache = cache - - var curCfg *config.Config - newCfg := initCfg - - // Replace context with cancellable ctx - // in order to automatically cancel all the go routines - // that were started in the scope of this function on function exit - ctx, cn := context.WithCancel(ctx) - defer cn() - - stop := func(cn context.CancelFunc, g *errgroup.Group) { - if cn != nil { - cn() - } - if g != nil { - err := g.Wait() - if err != nil { - log.Error().Err(err).Msg("error encountered while stopping server") - } - } - } - - start := func(ctx context.Context, runfn runFuncCfg, cfg *config.Config, ech chan<- error) (*errgroup.Group, context.CancelFunc) { - ctx, cn = context.WithCancel(ctx) - g, ctx := errgroup.WithContext(ctx) - - g.Go(func() error { - err := runfn(ctx, cfg) - if err != nil { - ech <- err - } - return err - }) - return g, cn - } - - var ( - proCancel, srvCancel context.CancelFunc - proEg, srvEg *errgroup.Group - ) - - started := false - -LOOP: - for { - ech := make(chan error, 2) - if started { - f.reporter.UpdateState(client.UnitStateConfiguring, "Re-configuring", nil) //nolint:errcheck // unclear on what should we do if updating the status fails? - } else { - started = true - f.reporter.UpdateState(client.UnitStateStarting, "Starting", nil) //nolint:errcheck // unclear on what should we do if updating the status fails? 
- } - - err := newCfg.LoadServerLimits() - if err != nil { - return fmt.Errorf("encountered error while loading server limits: %w", err) - } - - // Create or recreate cache - if configCacheChanged(curCfg, newCfg) { - log.Info().Msg("reconfigure cache on configuration change") - cacheCfg := makeCacheConfig(newCfg) - err := f.cache.Reconfigure(cacheCfg) - log.Info().Err(err).Interface("cfg", cacheCfg).Msg("reconfigure cache complete") - if err != nil { - return err - } - } - - // Start or restart profiler - if configChangedProfiler(curCfg, newCfg) { - if proCancel != nil { - log.Info().Msg("stopping profiler on configuration change") - stop(proCancel, proEg) - } - proEg, proCancel = nil, nil - if newCfg.Inputs[0].Server.Profiler.Enabled { - log.Info().Msg("starting profiler on configuration change") - proEg, proCancel = start(ctx, func(ctx context.Context, cfg *config.Config) error { - return profile.RunProfiler(ctx, cfg.Inputs[0].Server.Profiler.Bind) - }, newCfg, ech) - } - } - - // Start or restart server - if configChangedServer(curCfg, newCfg) { - if srvCancel != nil { - log.Info().Msg("stopping server on configuration change") - stop(srvCancel, srvEg) - } - log.Info().Msg("starting server on configuration change") - srvEg, srvCancel = start(ctx, func(ctx context.Context, cfg *config.Config) error { - return f.runServer(ctx, cfg) - }, newCfg, ech) - } - - curCfg = newCfg - - select { - case newCfg = <-f.cfgCh: - log.Info().Msg("Server configuration update") - case err := <-ech: - f.reporter.UpdateState(client.UnitStateFailed, fmt.Sprintf("Error - %s", err), nil) //nolint:errcheck // unclear on what should we do if updating the status fails? - log.Error().Err(err).Msg("Fleet Server failed") - return err - case <-ctx.Done(): - f.reporter.UpdateState(client.UnitStateStopping, "Stopping", nil) //nolint:errcheck // unclear on what should we do if updating the status fails? - break LOOP - } - } - - // Server is coming down; wait for the server group to exit cleanly. - // Timeout if something is locked up. 
- err = safeWait(srvEg, time.Second) - - // Eat cancel error to minimize confusion in logs - if errors.Is(err, context.Canceled) { - err = nil - } - - log.Info().Err(err).Msg("Fleet Server exited") - return err -} - -func configChangedProfiler(curCfg, newCfg *config.Config) bool { - - changed := true - - switch { - case curCfg == nil: - case curCfg.Inputs[0].Server.Profiler.Enabled != newCfg.Inputs[0].Server.Profiler.Enabled: - case curCfg.Inputs[0].Server.Profiler.Bind != newCfg.Inputs[0].Server.Profiler.Bind: - default: - changed = false - } - - return changed -} - -func redactOutputCfg(cfg *config.Config) config.Output { - const kRedacted = "[redacted]" - redacted := cfg.Output - - if redacted.Elasticsearch.APIKey != "" { - redacted.Elasticsearch.APIKey = kRedacted - } - - if redacted.Elasticsearch.ServiceToken != "" { - redacted.Elasticsearch.ServiceToken = kRedacted - } - - if redacted.Elasticsearch.TLS != nil { - newTLS := *redacted.Elasticsearch.TLS - - if newTLS.Certificate.Key != "" { - newTLS.Certificate.Key = kRedacted - } - if newTLS.Certificate.Passphrase != "" { - newTLS.Certificate.Passphrase = kRedacted - } - - redacted.Elasticsearch.TLS = &newTLS - } - - return redacted -} - -func redactServerCfg(cfg *config.Config) config.Server { - const kRedacted = "[redacted]" - redacted := cfg.Inputs[0].Server - - if redacted.TLS != nil { - newTLS := *redacted.TLS - - if newTLS.Certificate.Key != "" { - newTLS.Certificate.Key = kRedacted - } - if newTLS.Certificate.Passphrase != "" { - newTLS.Certificate.Passphrase = kRedacted - } - - redacted.TLS = &newTLS - } - - return redacted -} - -func redactConfig(cfg *config.Config) *config.Config { - redacted := &config.Config{ - Fleet: cfg.Fleet, - Output: cfg.Output, - Inputs: make([]config.Input, 1), - Logging: cfg.Logging, - HTTP: cfg.HTTP, - } - redacted.Inputs[0].Server = redactServerCfg(cfg) - redacted.Output = redactOutputCfg(cfg) - return redacted -} - -func configChangedServer(curCfg, newCfg *config.Config) bool { - - zlog := log.With().Interface("new", redactConfig(newCfg)).Logger() - - changed := true - switch { - case curCfg == nil: - zlog.Info().Msg("initial server configuration") - case !reflect.DeepEqual(curCfg.Fleet, newCfg.Fleet): - zlog.Info(). - Interface("old", redactConfig(curCfg)). - Msg("fleet configuration has changed") - case !reflect.DeepEqual(curCfg.Output, newCfg.Output): - zlog.Info(). - Interface("old", redactConfig(curCfg)). - Msg("output configuration has changed") - case !reflect.DeepEqual(curCfg.Inputs[0].Server, newCfg.Inputs[0].Server): - zlog.Info(). - Interface("old", redactConfig(curCfg)). 
- Msg("server configuration has changed") - default: - changed = false - } - - return changed -} - -func configCacheChanged(curCfg, newCfg *config.Config) bool { - if curCfg == nil { - return false - } - return curCfg.Inputs[0].Cache != newCfg.Inputs[0].Cache -} - -func safeWait(g *errgroup.Group, to time.Duration) error { - var err error - waitCh := make(chan error) - go func() { - waitCh <- g.Wait() - }() - - select { - case err = <-waitCh: - case <-time.After(to): - log.Warn().Msg("deadlock: goroutine locked up on errgroup.Wait()") - err = errors.New("group wait timeout") - } - - return err -} - -func loggedRunFunc(ctx context.Context, tag string, runfn runFunc) func() error { - return func() error { - - log.Debug().Msg(tag + " started") - - err := runfn(ctx) - - lvl := zerolog.DebugLevel - switch { - case err == nil: - case errors.Is(err, context.Canceled): - err = nil - default: - lvl = zerolog.ErrorLevel - } - - log.WithLevel(lvl).Err(err).Msg(tag + " exited") - return err - } -} - -func initRuntime(cfg *config.Config) { - gcPercent := cfg.Inputs[0].Server.Runtime.GCPercent - if gcPercent != 0 { - old := debug.SetGCPercent(gcPercent) - - log.Info(). - Int("old", old). - Int("new", gcPercent). - Msg("SetGCPercent") - } -} - -func (f *FleetServer) initBulker(ctx context.Context, tracer *apm.Tracer, cfg *config.Config) (*bulk.Bulker, error) { - es, err := es.NewClient(ctx, cfg, false, elasticsearchOptions( - cfg.Inputs[0].Server.Instrumentation.Enabled, f.bi, - )...) - if err != nil { - return nil, err - } - - blk := bulk.NewBulker(es, tracer, bulk.BulkOptsFromCfg(cfg)...) - return blk, nil -} - -func (f *FleetServer) runServer(ctx context.Context, cfg *config.Config) (err error) { - initRuntime(cfg) - - // The metricsServer is only enabled if http.enabled is set in the config - metricsServer, err := api.InitMetrics(ctx, cfg, f.bi) - switch { - case err != nil: - return err - case metricsServer != nil: - defer func() { - _ = metricsServer.Stop() - }() - } - - // Bulker is started in its own context and managed in the scope of this function. This is done so - // when the `ctx` is cancelled, the bulker will remain executing until this function exits. - // This allows the child subsystems to continue to write to the data store while tearing down. - bulkCtx, bulkCancel := context.WithCancel(context.Background()) - defer bulkCancel() - - // Create the APM tracer. - tracer, err := f.initTracer(cfg.Inputs[0].Server.Instrumentation) - if err != nil { - return err - } - - // Create the bulker subsystem - bulker, err := f.initBulker(bulkCtx, tracer, cfg) - if err != nil { - return err - } - - // Execute the bulker engine in a goroutine with its orphaned context. - // Create an error channel for the case where the bulker exits - // unexpectedly (ie. not cancelled by the bulkCancel context). - errCh := make(chan error) - - go func() { - runFunc := loggedRunFunc(bulkCtx, "Bulker", bulker.Run) - - // Emit the error from bulker.Run to the local error channel. - // The error group will be listening for it. (see comments below) - errCh <- runFunc() - }() - - // Wrap context with an error group context to manage the lifecycle - // of the subsystems. An error from any subsystem, or if the - // parent context is cancelled, will cancel the group. - // see https://pkg.go.dev/golang.org/x/sync/errgroup#Group.Go - g, ctx := errgroup.WithContext(ctx) - - // Stub a function for inclusion in the errgroup that exits when - // the bulker exits. 
If the bulker exits before the error group, - // this will tear down the error group and g.Wait() will return. - // Otherwise it will be a noop. - g.Go(func() (err error) { - select { - case err = <-errCh: - case <-ctx.Done(): - err = ctx.Err() - } - return - }) - - if tracer != nil { - go func() { - <-ctx.Done() - log.Info().Msg("flushing instrumentation tracer...") - tracer.Flush(nil) - tracer.Close() - }() - } - - if err = f.runSubsystems(ctx, cfg, g, bulker, tracer); err != nil { - return err - } - - return g.Wait() -} - -func (f *FleetServer) runSubsystems(ctx context.Context, cfg *config.Config, g *errgroup.Group, bulker bulk.Bulk, tracer *apm.Tracer) (err error) { - esCli := bulker.Client() - - // Check version compatibility with Elasticsearch - remoteVersion, err := ver.CheckCompatibility(ctx, esCli, f.bi.Version) - if err != nil { - if len(remoteVersion) != 0 { - return fmt.Errorf("failed version compatibility check with elasticsearch (Agent: %s, Elasticsearch: %s): %w", f.bi.Version, remoteVersion, err) - } - return fmt.Errorf("failed version compatibility check with elasticsearch: %w", err) - } - - // Run migrations; current safe to do in background. That may change in the future. - g.Go(loggedRunFunc(ctx, "Migrations", func(ctx context.Context) error { - return dl.Migrate(ctx, bulker) - })) - - // Run schduler for periodic GC/cleanup - gcCfg := cfg.Inputs[0].Server.GC - sched, err := scheduler.New(gc.Schedules(bulker, gcCfg.ScheduleInterval, gcCfg.CleanupAfterExpiredInterval)) - if err != nil { - return fmt.Errorf("failed to create elasticsearch GC: %w", err) - } - g.Go(loggedRunFunc(ctx, "Elasticsearch GC", sched.Run)) - - // Monitoring es client, longer timeout, no retries - monCli, err := es.NewClient(ctx, cfg, true, elasticsearchOptions( - cfg.Inputs[0].Server.Instrumentation.Enabled, f.bi, - )...) 
- if err != nil { - return err - } - - // Coordinator policy monitor - pim, err := monitor.New(dl.FleetPolicies, esCli, monCli, - monitor.WithFetchSize(cfg.Inputs[0].Monitor.FetchSize), - monitor.WithPollTimeout(cfg.Inputs[0].Monitor.PollTimeout), - ) - if err != nil { - return err - } - - g.Go(loggedRunFunc(ctx, "Policy index monitor", pim.Run)) - cord := coordinator.NewMonitor(cfg.Fleet, f.bi.Version, bulker, pim, coordinator.NewCoordinatorZero) - g.Go(loggedRunFunc(ctx, "Coordinator policy monitor", cord.Run)) - - // Policy monitor - pm := policy.NewMonitor(bulker, pim, cfg.Inputs[0].Server.Limits.PolicyThrottle) - g.Go(loggedRunFunc(ctx, "Policy monitor", pm.Run)) - - // Policy self monitor - sm := policy.NewSelfMonitor(cfg.Fleet, bulker, pim, cfg.Inputs[0].Policy.ID, f.reporter) - g.Go(loggedRunFunc(ctx, "Policy self monitor", sm.Run)) - - // Actions monitoring - var am monitor.SimpleMonitor - var ad *action.Dispatcher - var tr *action.TokenResolver - - am, err = monitor.NewSimple(dl.FleetActions, esCli, monCli, - monitor.WithExpiration(true), - monitor.WithFetchSize(cfg.Inputs[0].Monitor.FetchSize), - monitor.WithPollTimeout(cfg.Inputs[0].Monitor.PollTimeout), - ) - if err != nil { - return err - } - g.Go(loggedRunFunc(ctx, "Revision monitor", am.Run)) - - ad = action.NewDispatcher(am) - g.Go(loggedRunFunc(ctx, "Revision dispatcher", ad.Run)) - tr, err = action.NewTokenResolver(bulker) - if err != nil { - return err - } - - bc := checkin.NewBulk(bulker) - g.Go(loggedRunFunc(ctx, "Bulk checkin", bc.Run)) - - ct := api.NewCheckinT(f.verCon, &cfg.Inputs[0].Server, f.cache, bc, pm, am, ad, tr, bulker) - et, err := api.NewEnrollerT(f.verCon, &cfg.Inputs[0].Server, bulker, f.cache) - if err != nil { - return err - } - - at := api.NewArtifactT(&cfg.Inputs[0].Server, bulker, f.cache) - ack := api.NewAckT(&cfg.Inputs[0].Server, bulker, f.cache) - st := api.NewStatusT(&cfg.Inputs[0].Server, bulker, f.cache) - - router := api.NewRouter(ctx, bulker, ct, et, at, ack, st, sm, tracer, f.bi) - - g.Go(loggedRunFunc(ctx, "Http server", func(ctx context.Context) error { - return api.Run(ctx, router, &cfg.Inputs[0].Server) - })) - - return err -} - -// Reload reloads the fleet server with the latest configuration. -func (f *FleetServer) Reload(ctx context.Context, cfg *config.Config) error { - select { - case f.cfgCh <- cfg: - case <-ctx.Done(): - } - return nil -} - -func (f *FleetServer) initTracer(cfg config.Instrumentation) (*apm.Tracer, error) { - if !cfg.Enabled { - return nil, nil - } - - log.Info().Msg("fleet-server instrumentation is enabled") - - // TODO(marclop): Ideally, we'd use apmtransport.NewHTTPTransportOptions() - // but it doesn't exist today. Update this code once we have something - // available via the APM Go agent. 
- const ( - envVerifyServerCert = "ELASTIC_APM_VERIFY_SERVER_CERT" - envServerCert = "ELASTIC_APM_SERVER_CERT" - envCACert = "ELASTIC_APM_SERVER_CA_CERT_FILE" - envGlobalLabels = "ELASTIC_APM_GLOBAL_LABELS" - envTransactionSampleRate = "ELASTIC_APM_TRANSACTION_SAMPLE_RATE" - ) - if cfg.TLS.SkipVerify { - os.Setenv(envVerifyServerCert, "false") - defer os.Unsetenv(envVerifyServerCert) - } - if cfg.TLS.ServerCertificate != "" { - os.Setenv(envServerCert, cfg.TLS.ServerCertificate) - defer os.Unsetenv(envServerCert) - } - if cfg.TLS.ServerCA != "" { - os.Setenv(envCACert, cfg.TLS.ServerCA) - defer os.Unsetenv(envCACert) - } - if cfg.GlobalLabels != "" { - os.Setenv(envGlobalLabels, cfg.GlobalLabels) - defer os.Unsetenv(envGlobalLabels) - } - if cfg.TransactionSampleRate != "" { - os.Setenv(envTransactionSampleRate, cfg.TransactionSampleRate) - defer os.Unsetenv(envTransactionSampleRate) - } - transport, err := apmtransport.NewHTTPTransport() - if err != nil { - return nil, err - } - - if len(cfg.Hosts) > 0 { - hosts := make([]*url.URL, 0, len(cfg.Hosts)) - for _, host := range cfg.Hosts { - u, err := url.Parse(host) - if err != nil { - return nil, fmt.Errorf("failed parsing %s: %w", host, err) - } - hosts = append(hosts, u) - } - transport.SetServerURL(hosts...) - } - if cfg.APIKey != "" { - transport.SetAPIKey(cfg.APIKey) - } else { - transport.SetSecretToken(cfg.SecretToken) - } - return apm.NewTracerOptions(apm.TracerOptions{ - ServiceName: "fleet-server", - ServiceVersion: f.bi.Version, - ServiceEnvironment: cfg.Environment, - Transport: transport, - }) -} - -func elasticsearchOptions(instumented bool, bi build.Info) []es.ConfigOption { - options := []es.ConfigOption{es.WithUserAgent(kUAFleetServer, bi)} - if instumented { - options = append(options, es.InstrumentRoundTripper()) - } - return options -} diff --git a/dev-tools/integration/.env b/dev-tools/integration/.env index 090840c33..4b07d45bc 100644 --- a/dev-tools/integration/.env +++ b/dev-tools/integration/.env @@ -1,4 +1,4 @@ -ELASTICSEARCH_VERSION=8.5.0-60a4c029-SNAPSHOT +ELASTICSEARCH_VERSION=8.6.0-f20b7179-SNAPSHOT ELASTICSEARCH_USERNAME=elastic ELASTICSEARCH_PASSWORD=changeme TEST_ELASTICSEARCH_HOSTS=localhost:9200 \ No newline at end of file diff --git a/go.mod b/go.mod index 946f6fb44..a1ff9ad48 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/dgraph-io/ristretto v0.1.0 github.com/elastic/elastic-agent-client/v7 v7.0.0-20221102171927-bc376a4e0f9f - github.com/elastic/elastic-agent-libs v0.2.6 + github.com/elastic/elastic-agent-libs v0.2.14 github.com/elastic/elastic-agent-system-metrics v0.3.0 github.com/elastic/go-elasticsearch/v7 v7.16.0 github.com/elastic/go-ucfg v0.8.5 @@ -30,6 +30,7 @@ require ( go.uber.org/zap v1.21.0 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac + google.golang.org/grpc v1.46.0 google.golang.org/protobuf v1.28.0 ) @@ -40,52 +41,51 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/dustin/go-humanize v1.0.0 // indirect github.com/elastic/go-licenser v0.4.0 // indirect - github.com/elastic/go-structform v0.0.9 // indirect + github.com/elastic/go-structform v0.0.10 // indirect github.com/elastic/go-sysinfo v1.7.1 // indirect github.com/elastic/go-windows v1.0.1 // indirect github.com/elastic/gosigar v0.14.2 // indirect + 
github.com/fatih/color v1.13.0 // indirect github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b // indirect github.com/golang/protobuf v1.5.2 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/joeshaw/multierror v0.0.0-20140124173710-69b34d4ec901 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/kr/pretty v0.2.1 // indirect github.com/magefile/mage v1.14.0 // indirect + github.com/mattn/go-colorable v0.1.12 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/procfs v0.7.3 // indirect github.com/santhosh-tekuri/jsonschema v1.2.4 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.2.0 // indirect go.elastic.co/apm/module/apmhttp v1.15.0 // indirect go.elastic.co/apm/module/apmhttp/v2 v2.0.0 // indirect go.elastic.co/apm/v2 v2.0.0 // indirect + go.elastic.co/ecszap v1.0.1 // indirect go.elastic.co/fastjson v1.1.0 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/multierr v1.8.0 // indirect golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 // indirect golang.org/x/mod v0.5.1 // indirect + golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 // indirect + golang.org/x/sys v0.0.0-20220422013727-9388b58f7150 // indirect golang.org/x/text v0.3.7 // indirect + golang.org/x/tools v0.1.9 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect + google.golang.org/genproto v0.0.0-20220426171045-31bebdecfb46 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + howett.net/plist v1.0.0 // indirect ) require ( - github.com/elastic/elastic-agent v0.0.0-20220831162706-5f1e54f40d3e - github.com/fatih/color v1.13.0 // indirect - github.com/hashicorp/errwrap v1.1.0 // indirect - github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/elastic/elastic-agent v0.0.0-20221107053805-657f66dad4bd github.com/jcchavezs/porto v0.4.0 // indirect - github.com/mattn/go-colorable v0.1.12 // indirect - github.com/prometheus/procfs v0.7.3 // indirect go.elastic.co/apm/module/apmhttprouter v1.14.0 - go.elastic.co/ecszap v1.0.1 // indirect - go.uber.org/atomic v1.9.0 // indirect - go.uber.org/multierr v1.8.0 // indirect - golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4 // indirect - golang.org/x/sys v0.0.0-20220422013727-9388b58f7150 // indirect - golang.org/x/tools v0.1.9 // indirect - google.golang.org/genproto v0.0.0-20220426171045-31bebdecfb46 // indirect - google.golang.org/grpc v1.46.0 - howett.net/plist v1.0.0 // indirect ) replace ( diff --git a/go.sum b/go.sum index 51d7b1846..1c4474890 100644 --- a/go.sum +++ b/go.sum @@ -424,17 +424,22 @@ github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/elastic/e2e-testing v1.99.2-0.20220117192005-d3365c99b9c4 h1:uYT+Krd8dsvnhnLK9pe/JHZkYtXEGPfbV4Wt1JPPol0= github.com/elastic/e2e-testing v1.99.2-0.20220117192005-d3365c99b9c4/go.mod h1:UcNuf4pX/qDVNQr0zybm1NL2YoWik+jKBaINZqQCA40= -github.com/elastic/elastic-agent v0.0.0-20220831162706-5f1e54f40d3e 
h1:uGDp9HesS9m3T7YwgM0ATE/YP5FXcxxAAKHQDgP/GS0= github.com/elastic/elastic-agent v0.0.0-20220831162706-5f1e54f40d3e/go.mod h1:sE+a99VTBCfrgogKL2j0n9Jf0NocEOi32GNP1OtdnG4= -github.com/elastic/elastic-agent-autodiscover v0.0.0-20220404145827-89887023c1ab h1:Jk6Mfk5BF8gtfE7X0bNCiDGBtwJVxRI79b4wLCAsP+A= +github.com/elastic/elastic-agent v0.0.0-20221107053805-657f66dad4bd h1:4Vo4x8bMyIy09L6tNANIaWoVxnmBnxmc6Z9EYGq9y2o= +github.com/elastic/elastic-agent v0.0.0-20221107053805-657f66dad4bd/go.mod h1:ZlpbMaxcRpuN2BGe78Qp/QylMU+j897Cfyc888caLsk= github.com/elastic/elastic-agent-autodiscover v0.0.0-20220404145827-89887023c1ab/go.mod h1:Gg1fsQI+rVms9FJ2DefBSojfPIzgkV8xlyG8fPG0DE8= +github.com/elastic/elastic-agent-autodiscover v0.2.1 h1:Nbeayh3vq2FNm6xaFo34mhUdOu0EVlpj53CqCsbU0E4= +github.com/elastic/elastic-agent-autodiscover v0.2.1/go.mod h1:gPnzzfdYNdgznAb+iG9eyyXaQXBbAMHa+Y6Z8hXfcGY= +github.com/elastic/elastic-agent-client/v7 v7.0.0-20210727140539-f0905d9377f6/go.mod h1:uh/Gj9a0XEbYoM4NYz4LvaBVARz3QXLmlNjsrKY9fTc= github.com/elastic/elastic-agent-client/v7 v7.0.0-20220804181728-b0328d2fe484/go.mod h1:fkvyUfFwyAG5OnMF0h+FV9sC0Xn9YLITwQpSuwungQs= github.com/elastic/elastic-agent-client/v7 v7.0.0-20221102171927-bc376a4e0f9f h1:hZv3vpGoAIvt/r6iPgKdfTCKgI8Eyk5Bb8HHP2FtzTY= github.com/elastic/elastic-agent-client/v7 v7.0.0-20221102171927-bc376a4e0f9f/go.mod h1:FEXUbFMfaV62S0CtJgD+FFHGY7+4o4fXkDicyONPSH8= github.com/elastic/elastic-agent-libs v0.0.0-20220303160015-5b4e674da3dd/go.mod h1://82M1l73IHx0wDbS2Tzkq6Fx9fkmytS1KgkIyzvNTM= github.com/elastic/elastic-agent-libs v0.2.2/go.mod h1:1xDLBhIqBIjhJ7lr2s+xRFFkQHpitSp8q2zzv1Dqg+s= -github.com/elastic/elastic-agent-libs v0.2.6 h1:DpcUcCVYZ7lNtHLUlyT1u/GtGAh49wpL15DTH7+8O5o= +github.com/elastic/elastic-agent-libs v0.2.5/go.mod h1:chO3rtcLyGlKi9S0iGVZhYCzDfdDsAQYBc+ui588AFE= github.com/elastic/elastic-agent-libs v0.2.6/go.mod h1:chO3rtcLyGlKi9S0iGVZhYCzDfdDsAQYBc+ui588AFE= +github.com/elastic/elastic-agent-libs v0.2.14 h1:o1agY/37TKl5kjhv3ur5M9d127wzQPRxwA4Xoh0jUEo= +github.com/elastic/elastic-agent-libs v0.2.14/go.mod h1:0J9lzJh+BjttIiVjYDLncKYCEWUUHiiqnuI64y6C6ss= github.com/elastic/elastic-agent-system-metrics v0.3.0 h1:W8L0E8lWJmdguH+oIR7OzuFgopvw8ucZAE9w6iqVlpE= github.com/elastic/elastic-agent-system-metrics v0.3.0/go.mod h1:RIYhJOS7mUeyIthfOSqmmbEILYSzaDWLi5zQ70bQo+o= github.com/elastic/elastic-package v0.32.1/go.mod h1:l1fEnF52XRBL6a5h6uAemtdViz2bjtjUtgdQcuRhEAY= @@ -444,8 +449,9 @@ github.com/elastic/go-elasticsearch/v8 v8.0.0-20210317102009-a9d74cec0186/go.mod github.com/elastic/go-licenser v0.3.1/go.mod h1:D8eNQk70FOCVBl3smCGQt/lv7meBeQno2eI1S5apiHQ= github.com/elastic/go-licenser v0.4.0 h1:jLq6A5SilDS/Iz1ABRkO6BHy91B9jBora8FwGRsDqUI= github.com/elastic/go-licenser v0.4.0/go.mod h1:V56wHMpmdURfibNBggaSBfqgPxyT1Tldns1i87iTEvU= -github.com/elastic/go-structform v0.0.9 h1:HpcS7xljL4kSyUfDJ8cXTJC6rU5ChL1wYb6cx3HLD+o= github.com/elastic/go-structform v0.0.9/go.mod h1:CZWf9aIRYY5SuKSmOhtXScE5uQiLZNqAFnwKR4OrIM4= +github.com/elastic/go-structform v0.0.10 h1:oy08o/Ih2hHTkNcRY/1HhaYvIp5z6t8si8gnCJPDo1w= +github.com/elastic/go-structform v0.0.10/go.mod h1:CZWf9aIRYY5SuKSmOhtXScE5uQiLZNqAFnwKR4OrIM4= github.com/elastic/go-sysinfo v1.1.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= 
github.com/elastic/go-sysinfo v1.7.1 h1:Wx4DSARcKLllpKT2TnFVdSUJOsybqMYCNQZq1/wO+s0= github.com/elastic/go-sysinfo v1.7.1/go.mod h1:i1ZYdU10oLNfRzq4vq62BEwD2fH8KaWh6eh0ikPT9F0= @@ -713,9 +719,12 @@ github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2c github.com/googleapis/gnostic v0.5.5 h1:9fHAtK0uDfpveeqqo1hkEZJcFvYXAiCN3UutL8F9xHw= github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/handlers v0.0.0-20150720190736-60c7bfde3e33/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q= +github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.7.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= +github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= @@ -1335,6 +1344,8 @@ go.elastic.co/apm v1.15.0/go.mod h1:dylGv2HKR0tiCV+wliJz1KHtDyuD8SPe69oV7VyK6WY= go.elastic.co/apm/module/apmelasticsearch v1.10.0/go.mod h1:lwoaGDfZzfb9e6TXd3h8/KNmLAONOas7o5NLVNmv8Xk= go.elastic.co/apm/module/apmelasticsearch v1.14.0 h1:tT9JSUImykCY5Kqs8A/taclNR5lkfZb6T+F5WrnVfgQ= go.elastic.co/apm/module/apmelasticsearch v1.14.0/go.mod h1:WJRQvN5v8YkigaqT2ss4zQUTe9k8kceyr13V2d2S4H0= +go.elastic.co/apm/module/apmgorilla v1.15.0 h1:1yTAksffgaFXYEIwlLRiQnxLfy3p3RtpDw8HDupIJfY= +go.elastic.co/apm/module/apmgorilla v1.15.0/go.mod h1:+23mZudYvZ9VgxCQjseLo9EF5gkKEr0KSQBupw+rzP8= go.elastic.co/apm/module/apmgrpc v1.15.0 h1:Z7h58uuMJUoYXK6INFunlcGEXZQ18QKAhPh6NFYDNHE= go.elastic.co/apm/module/apmgrpc v1.15.0/go.mod h1:IEbTGJzY5Xx737PkHDT3bbzh9syovK+IfAlckJsUgPE= go.elastic.co/apm/module/apmhttp v1.10.0/go.mod h1:Y4timwcJ8sQWbWpcw3Y7Mat1OssNpGhpwyfUnpqIDew= diff --git a/internal/pkg/api/auth.go b/internal/pkg/api/auth.go index d83c7d8ae..d7ed90156 100644 --- a/internal/pkg/api/auth.go +++ b/internal/pkg/api/auth.go @@ -33,6 +33,8 @@ var ( func authAPIKey(r *http.Request, bulker bulk.Bulk, c cache.Cache) (*apikey.APIKey, error) { span, ctx := apm.StartSpan(r.Context(), "authAPIKey", "auth") defer span.End() + start := time.Now() + reqID := r.Header.Get(logger.HeaderRequestID) key, err := apikey.ExtractAPIKey(r) if err != nil { @@ -41,15 +43,17 @@ func authAPIKey(r *http.Request, bulker bulk.Bulk, c cache.Cache) (*apikey.APIKe if c.ValidAPIKey(*key) { span.Context.SetLabel("api_key_cache_hit", true) + log.Debug(). + Str("id", key.ID). + Str(ECSHTTPRequestID, reqID). + Int64(ECSEventDuration, time.Since(start).Nanoseconds()). + Bool("fleet.apikey.cache_hit", true). 
+ Msg("ApiKey authenticated") return key, nil } else { span.Context.SetLabel("api_key_cache_hit", false) } - reqID := r.Header.Get(logger.HeaderRequestID) - - start := time.Now() - info, err := bulker.APIKeyAuth(ctx, *key) if err != nil { @@ -62,7 +66,7 @@ func authAPIKey(r *http.Request, bulker bulk.Bulk, c cache.Cache) (*apikey.APIKe return nil, err } - log.Trace(). + log.Debug(). Str("id", key.ID). Str(ECSHTTPRequestID, reqID). Int64(ECSEventDuration, time.Since(start).Nanoseconds()). @@ -70,6 +74,7 @@ func authAPIKey(r *http.Request, bulker bulk.Bulk, c cache.Cache) (*apikey.APIKe Strs("roles", info.Roles). Bool("enabled", info.Enabled). RawJSON("meta", info.Metadata). + Bool("fleet.apikey.cache_hit", false). Msg("ApiKey authenticated") c.SetAPIKey(*key, info.Enabled) diff --git a/internal/pkg/api/error.go b/internal/pkg/api/error.go index e349adf17..45b90cd3c 100644 --- a/internal/pkg/api/error.go +++ b/internal/pkg/api/error.go @@ -12,7 +12,6 @@ import ( "strings" "github.com/elastic/fleet-server/v7/internal/pkg/dl" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/pkg/errors" @@ -43,7 +42,6 @@ type HTTPErrResp struct { // NewHTTPErrResp creates an ErrResp from a go error func NewHTTPErrResp(err error) HTTPErrResp { - errTable := []struct { target error meta HTTPErrResp @@ -57,24 +55,6 @@ func NewHTTPErrResp(err error) HTTPErrResp { zerolog.WarnLevel, }, }, - { - limit.ErrRateLimit, - HTTPErrResp{ - http.StatusTooManyRequests, - "RateLimit", - "exceeded the rate limit", - zerolog.DebugLevel, - }, - }, - { - limit.ErrMaxLimit, - HTTPErrResp{ - http.StatusTooManyRequests, - "MaxLimit", - "exceeded the max limit", - zerolog.DebugLevel, - }, - }, { ErrAPIKeyNotEnabled, HTTPErrResp{ @@ -138,6 +118,15 @@ func NewHTTPErrResp(err error) HTTPErrResp { zerolog.InfoLevel, }, }, + { + ErrUpdatingInactiveAgent, + HTTPErrResp{ + http.StatusUnauthorized, + "Unauthorized", + "Agent not active", + zerolog.InfoLevel, + }, + }, } for _, e := range errTable { @@ -161,6 +150,7 @@ func NewHTTPErrResp(err error) HTTPErrResp { return HTTPErrResp{ StatusCode: http.StatusBadRequest, Error: "BadRequest", + Message: err.Error(), Level: zerolog.InfoLevel, } } diff --git a/internal/pkg/api/handleAck.go b/internal/pkg/api/handleAck.go index 3f284b5da..304a1350d 100644 --- a/internal/pkg/api/handleAck.go +++ b/internal/pkg/api/handleAck.go @@ -15,20 +15,24 @@ import ( "strings" "time" + "github.com/julienschmidt/httprouter" + "github.com/pkg/errors" + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/dl" "github.com/elastic/fleet-server/v7/internal/pkg/es" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/elastic/fleet-server/v7/internal/pkg/model" "github.com/elastic/fleet-server/v7/internal/pkg/policy" - "github.com/pkg/errors" + "github.com/elastic/fleet-server/v7/internal/pkg/smap" +) - "github.com/julienschmidt/httprouter" - "github.com/rs/zerolog" - "github.com/rs/zerolog/log" +var ( + ErrUpdatingInactiveAgent = errors.New("updating inactive agent") ) type HTTPError 
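Note: with the rate-limit sentinels removed from error.go, NewHTTPErrResp reduces to a table walk over errors.Is targets, and the catch-all 400 now carries err.Error() as its message. A simplified, self-contained sketch of that mapping (names shortened from the real ones):

package main

import (
	"errors"
	"fmt"
	"net/http"
)

var ErrUpdatingInactiveAgent = errors.New("updating inactive agent")

type httpErrResp struct {
	StatusCode int
	Error      string
	Message    string
}

var errTable = []struct {
	target error
	meta   httpErrResp
}{
	{ErrUpdatingInactiveAgent, httpErrResp{http.StatusUnauthorized, "Unauthorized", "Agent not active"}},
}

func newHTTPErrResp(err error) httpErrResp {
	for _, e := range errTable {
		if errors.Is(err, e.target) {
			return e.meta
		}
	}
	// fallback: the 400 response now includes the error text
	return httpErrResp{http.StatusBadRequest, "BadRequest", err.Error()}
}

func main() {
	fmt.Printf("%+v\n", newHTTPErrResp(fmt.Errorf("ack: %w", ErrUpdatingInactiveAgent)))
	fmt.Printf("%+v\n", newHTTPErrResp(errors.New("unmapped failure")))
}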
struct { @@ -41,27 +45,21 @@ func (e *HTTPError) Error() string { type AckT struct { cfg *config.Server - limit *limit.Limiter bulk bulk.Bulk cache cache.Cache } func NewAckT(cfg *config.Server, bulker bulk.Bulk, cache cache.Cache) *AckT { - log.Info(). - Interface("limits", cfg.Limits.AckLimit). - Msg("Setting config ack_limits") - return &AckT{ cfg: cfg, bulk: bulker, cache: cache, - limit: limit.NewLimiter(&cfg.Limits.AckLimit), } } -func (rt Router) handleAcks(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { +//nolint:dupl // function body calls different internal handler than handleCheckin +func (rt *Router) handleAcks(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { start := time.Now() - id := ps.ByName("id") reqID := r.Header.Get(logger.HeaderRequestID) @@ -90,12 +88,6 @@ } func (ack *AckT) handleAcks(zlog *zerolog.Logger, w http.ResponseWriter, r *http.Request, id string) error { - limitF, err := ack.limit.Acquire() - if err != nil { - return err - } - defer limitF() - agent, err := authAgent(r, &id, ack.bulk, ack.cache) if err != nil { return err } @@ -106,10 +98,6 @@ return ctx.Str(LogAccessAPIKeyID, agent.AccessAPIKeyID) }) - // Metrics; serenity now. - dfunc := cntAcks.IncStart() - defer dfunc() - return ack.processRequest(*zlog, w, r, agent) } @@ -277,17 +265,17 @@ func (ack *AckT) handleAckEvents(ctx context.Context, zlog zerolog.Logger, agent // The unenroll and upgrade acks might overwrite it later setResult(n, http.StatusOK) - if ev.Error == "" { - if action.Type == TypeUnenroll { - unenrollIdxs = append(unenrollIdxs, n) - } else if action.Type == TypeUpgrade { - if err := ack.handleUpgrade(ctx, zlog, agent); err != nil { - setError(n, err) - log.Error().Err(err).Msg("handle upgrade event") - continue - } + if action.Type == TypeUpgrade { + if err := ack.handleUpgrade(ctx, zlog, agent, ev); err != nil { + setError(n, err) + log.Error().Err(err).Msg("handle upgrade event") + continue } } + + if ev.Error == "" && action.Type == TypeUnenroll { + unenrollIdxs = append(unenrollIdxs, n) + } } // Process policy acks @@ -337,8 +325,9 @@ func (ack *AckT) handlePolicyChange(ctx context.Context, zlog zerolog.Logger, ag Int64("rev.coordinatorIdx", rev.CoordinatorIdx).
Msg("ack policy revision") - if ok && rev.PolicyID == agent.PolicyID && (rev.RevisionIdx > currRev || - (rev.RevisionIdx == currRev && rev.CoordinatorIdx > currCoord)) { + if ok && rev.PolicyID == agent.PolicyID && + (rev.RevisionIdx > currRev || + (rev.RevisionIdx == currRev && rev.CoordinatorIdx > currCoord)) { found = true currRev = rev.RevisionIdx currCoord = rev.CoordinatorIdx @@ -349,20 +338,77 @@ func (ack *AckT) handlePolicyChange(ctx context.Context, zlog zerolog.Logger, ag return nil } - sz := len(agent.DefaultAPIKeyHistory) - if sz > 0 { - ids := make([]string, sz) - for i := 0; i < sz; i++ { - ids[i] = agent.DefaultAPIKeyHistory[i].ID + for _, output := range agent.Outputs { + if output.Type != policy.OutputTypeElasticsearch { + continue } - log.Info().Strs("ids", ids).Msg("Invalidate old API keys") - if err := ack.bulk.APIKeyInvalidate(ctx, ids...); err != nil { - log.Info().Err(err).Strs("ids", ids).Msg("Failed to invalidate API keys") + + err := ack.updateAPIKey(ctx, + zlog, + agent.Id, + currRev, currCoord, + agent.PolicyID, + output.APIKeyID, output.PermissionsHash, output.ToRetireAPIKeyIds) + if err != nil { + return err } } + return nil + +} + +func (ack *AckT) updateAPIKey(ctx context.Context, + zlog zerolog.Logger, + agentID string, + currRev, currCoord int64, + policyID, apiKeyID, permissionHash string, + toRetireAPIKeyIDs []model.ToRetireAPIKeyIdsItems) error { + + if apiKeyID != "" { + res, err := ack.bulk.APIKeyRead(ctx, apiKeyID, true) + if err != nil { + if isAgentActive(ctx, zlog, ack.bulk, agentID) { + zlog.Error(). + Err(err). + Str(LogAPIKeyID, apiKeyID). + Msg("Failed to read API Key roles") + } else { + // race when API key was invalidated before acking + zlog.Info(). + Err(err). + Str(LogAPIKeyID, apiKeyID). + Msg("Failed to read invalidated API Key roles") + + // prevents future checks + return ErrUpdatingInactiveAgent + } + } else { + clean, removedRolesCount, err := cleanRoles(res.RoleDescriptors) + if err != nil { + zlog.Error(). + Err(err). + RawJSON("roles", res.RoleDescriptors). + Str(LogAPIKeyID, apiKeyID). + Msg("Failed to cleanup roles") + } else if removedRolesCount > 0 { + if err := ack.bulk.APIKeyUpdate(ctx, apiKeyID, permissionHash, clean); err != nil { + zlog.Error().Err(err).RawJSON("roles", clean).Str(LogAPIKeyID, apiKeyID).Msg("Failed to update API Key") + } else { + zlog.Debug(). + Str("hash.sha256", permissionHash). + Str(LogAPIKeyID, apiKeyID). + RawJSON("roles", clean). + Int("removedRoles", removedRolesCount). + Msg("Updating agent record to pick up reduced roles.") + } + } + } + ack.invalidateAPIKeys(ctx, toRetireAPIKeyIDs, apiKeyID) + } + body := makeUpdatePolicyBody( - agent.PolicyID, + policyID, currRev, currCoord, ) @@ -370,14 +416,14 @@ func (ack *AckT) handlePolicyChange(ctx context.Context, zlog zerolog.Logger, ag err := ack.bulk.Update( ctx, dl.FleetAgents, - agent.Id, + agentID, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3), ) - zlog.Info().Err(err). - Str(LogPolicyID, agent.PolicyID). + zlog.Err(err). + Str(LogPolicyID, policyID). Int64("policyRevision", currRev). Int64("policyCoordinator", currCoord). 
Msg("ack policy") @@ -385,8 +431,50 @@ func (ack *AckT) handlePolicyChange(ctx context.Context, zlog zerolog.Logger, ag return errors.Wrap(err, "handlePolicyChange update") } +func cleanRoles(roles json.RawMessage) (json.RawMessage, int, error) { + rr := smap.Map{} + if err := json.Unmarshal(roles, &rr); err != nil { + return nil, 0, errors.Wrap(err, "failed to unmarshal provided roles") + } + + keys := make([]string, 0, len(rr)) + for k := range rr { + if strings.HasSuffix(k, "-rdstale") { + keys = append(keys, k) + } + } + + if len(keys) == 0 { + return roles, 0, nil + } + + for _, k := range keys { + delete(rr, k) + } + + r, err := json.Marshal(rr) + return r, len(keys), errors.Wrap(err, "failed to marshal resulting role definition") +} + +func (ack *AckT) invalidateAPIKeys(ctx context.Context, toRetireAPIKeyIDs []model.ToRetireAPIKeyIdsItems, skip string) { + ids := make([]string, 0, len(toRetireAPIKeyIDs)) + for _, k := range toRetireAPIKeyIDs { + if k.ID == skip || k.ID == "" { + continue + } + ids = append(ids, k.ID) + } + + if len(ids) > 0 { + log.Info().Strs("fleet.policy.apiKeyIDsToRetire", ids).Msg("Invalidate old API keys") + if err := ack.bulk.APIKeyInvalidate(ctx, ids...); err != nil { + log.Info().Err(err).Strs("ids", ids).Msg("Failed to invalidate API keys") + } + } +} + func (ack *AckT) handleUnenroll(ctx context.Context, zlog zerolog.Logger, agent *model.Agent) error { - apiKeys := _getAPIKeyIDs(agent) + apiKeys := agent.APIKeyIDs() if len(apiKeys) > 0 { zlog = zlog.With().Strs(LogAPIKeyID, apiKeys).Logger() @@ -407,7 +495,7 @@ func (ack *AckT) handleUnenroll(ctx context.Context, zlog zerolog.Logger, agent return errors.Wrap(err, "handleUnenroll marshal") } - if err = ack.bulk.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh()); err != nil { + if err = ack.bulk.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)); err != nil { return errors.Wrap(err, "handleUnenroll update") } @@ -415,12 +503,37 @@ func (ack *AckT) handleUnenroll(ctx context.Context, zlog zerolog.Logger, agent return nil } -func (ack *AckT) handleUpgrade(ctx context.Context, zlog zerolog.Logger, agent *model.Agent) error { - +func (ack *AckT) handleUpgrade(ctx context.Context, zlog zerolog.Logger, agent *model.Agent, event Event) error { now := time.Now().UTC().Format(time.RFC3339) - doc := bulk.UpdateFields{ - dl.FieldUpgradeStartedAt: nil, - dl.FieldUpgradedAt: now, + doc := bulk.UpdateFields{} + if event.Error != "" { + // unmarshal event payload + var pl struct { + Retry bool `json:"retry"` + Attempt int `json:"retry_attempt"` + } + err := json.Unmarshal(event.Payload, &pl) + if err != nil { + zlog.Error().Err(err).Msg("unable to unmarshal upgrade event payload") + } + + // if the payload indicates a retry, mark change the upgrade status to retrying. 
+ if pl.Retry { + zlog.Info().Int("retry_attempt", pl.Attempt).Msg("marking agent upgrade as retrying") + doc[dl.FieldUpgradeStatus] = "retrying" // Keep FieldUpgradeStartedAt and FieldUpgradedAt at their original values + } else { + zlog.Info().Int("retry_attempt", pl.Attempt).Msg("marking agent upgrade as failed, agent logs contain failure message") + doc = bulk.UpdateFields{ + dl.FieldUpgradeStartedAt: nil, + dl.FieldUpgradeStatus: "failed", + } + } + } else { + doc = bulk.UpdateFields{ + dl.FieldUpgradeStartedAt: nil, + dl.FieldUpgradeStatus: nil, + dl.FieldUpgradedAt: now, + } } body, err := doc.Marshal() @@ -428,7 +541,7 @@ return errors.Wrap(err, "handleUpgrade marshal") } - if err = ack.bulk.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh()); err != nil { + if err = ack.bulk.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)); err != nil { return errors.Wrap(err, "handleUpgrade update") } @@ -440,15 +553,16 @@ return nil } -func _getAPIKeyIDs(agent *model.Agent) []string { - keys := make([]string, 0, 1) - if agent.AccessAPIKeyID != "" { - keys = append(keys, agent.AccessAPIKeyID) - } - if agent.DefaultAPIKeyID != "" { - keys = append(keys, agent.DefaultAPIKeyID) +func isAgentActive(ctx context.Context, zlog zerolog.Logger, bulk bulk.Bulk, agentID string) bool { + agent, err := dl.FindAgent(ctx, bulk, dl.QueryAgentByID, dl.FieldID, agentID) + if err != nil { + zlog.Error(). + Err(err). + Msg("failed to find agent by ID") + return true } - return keys + + return agent.Active // it is a valid error in case agent is active (was not invalidated) } // Generate an update script that validates that the policy_id diff --git a/internal/pkg/api/handleAck_test.go b/internal/pkg/api/handleAck_test.go index 90c961456..d2adc8f9b 100644 --- a/internal/pkg/api/handleAck_test.go +++ b/internal/pkg/api/handleAck_test.go @@ -15,13 +15,14 @@ import ( "net/http" "testing" + "github.com/google/go-cmp/cmp" + "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/es" "github.com/elastic/fleet-server/v7/internal/pkg/model" ftesting "github.com/elastic/fleet-server/v7/internal/pkg/testing" testlog "github.com/elastic/fleet-server/v7/internal/pkg/testing/log" - "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -405,12 +406,67 @@ func TestHandleAckEvents(t *testing.T) { }, err: &HTTPError{Status: http.StatusNotFound}, }, + { + name: "upgrade action failed", + events: []Event{ + { + ActionID: "ab12dcd8-bde0-4045-92dc-c4b27668d73a", + Type: "UPGRADE", + Error: "Error with no payload", + }, + }, + res: newAckResponse(false, []AckResponseItem{ + { + Status: http.StatusOK, + Message: http.StatusText(http.StatusOK), + }, + }), + bulker: func(t *testing.T) *ftesting.MockBulk { + m := ftesting.NewMockBulk() + m.On("Search", mock.Anything, mock.Anything, mock.MatchedBy(matchAction(t, "ab12dcd8-bde0-4045-92dc-c4b27668d73a")), mock.Anything).Return(&es.ResultT{HitsT: es.HitsT{ + Hits: []es.HitT{{ + Source: []byte(`{"action_id":"ab12dcd8-bde0-4045-92dc-c4b27668d73a","type":"UPGRADE"}`), + }}, + }}, nil).Once() + m.On("Create", mock.Anything, mock.Anything, mock.Anything,
mock.Anything, mock.Anything).Return("", nil).Once() + m.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + return m + }, + }, + { + name: "upgrade action retrying", + events: []Event{ + { + ActionID: "ab12dcd8-bde0-4045-92dc-c4b27668d73a", + Type: "UPGRADE", + Error: "Error with payload", + Payload: json.RawMessage(`{"retry":true,"retry_attempt":1}`), + }, + }, + res: newAckResponse(false, []AckResponseItem{ + { + Status: http.StatusOK, + Message: http.StatusText(http.StatusOK), + }, + }), + bulker: func(t *testing.T) *ftesting.MockBulk { + m := ftesting.NewMockBulk() + m.On("Search", mock.Anything, mock.Anything, mock.MatchedBy(matchAction(t, "ab12dcd8-bde0-4045-92dc-c4b27668d73a")), mock.Anything).Return(&es.ResultT{HitsT: es.HitsT{ + Hits: []es.HitT{{ + Source: []byte(`{"action_id":"ab12dcd8-bde0-4045-92dc-c4b27668d73a","type":"UPGRADE"}`), + }}, + }}, nil).Once() + m.On("Create", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return("", nil).Once() + m.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + return m + }, + }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { logger := testlog.SetLogger(t) - cache, err := cache.New(cache.Config{NumCounters: 100, MaxCost: 100000}) + cache, err := cache.New(config.Cache{NumCounters: 100, MaxCost: 100000}) if err != nil { t.Fatal(err) } @@ -439,3 +495,136 @@ func TestHandleAckEvents(t *testing.T) { }) } } + +func TestInvalidateAPIKeys(t *testing.T) { + toRetire1 := []model.ToRetireAPIKeyIdsItems{{ + ID: "toRetire1", + }} + toRetire2 := []model.ToRetireAPIKeyIdsItems{{ + ID: "toRetire2_0", + }, { + ID: "toRetire2_1", + }} + var toRetire3 []model.ToRetireAPIKeyIdsItems + + skips := map[string]string{ + "1": "toRetire1", + "2": "toRetire2_0", + "3": "", + } + wants := map[string][]string{ + "1": {}, + "2": {"toRetire2_1"}, + "3": {}, + } + + agent := model.Agent{ + Outputs: map[string]*model.PolicyOutput{ + "1": {ToRetireAPIKeyIds: toRetire1}, + "2": {ToRetireAPIKeyIds: toRetire2}, + "3": {ToRetireAPIKeyIds: toRetire3}, + }, + } + + for i, out := range agent.Outputs { + skip := skips[i] + want := wants[i] + + bulker := ftesting.NewMockBulk() + if len(want) > 0 { + bulker.On("APIKeyInvalidate", + context.Background(), mock.MatchedBy(func(ids []string) bool { + // if A contains B and B contains A => A = B + return assert.Subset(t, ids, want) && + assert.Subset(t, want, ids) + })). 
+ Return(nil) + } + + ack := &AckT{bulk: bulker} + ack.invalidateAPIKeys(context.Background(), out.ToRetireAPIKeyIds, skip) + + bulker.AssertExpectations(t) + } +} + +func TestAckHandleUpgrade(t *testing.T) { + tests := []struct { + name string + event Event + bulker func(t *testing.T) *ftesting.MockBulk + }{{ + name: "ok", + event: Event{}, + bulker: func(t *testing.T) *ftesting.MockBulk { + m := ftesting.NewMockBulk() + m.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + return m + }, + }, { + name: "retry signaled", + event: Event{ + Error: "upgrade error", + Payload: json.RawMessage(`{"retry":true,"retry_attempt":1}`), + }, + bulker: func(t *testing.T) *ftesting.MockBulk { + m := ftesting.NewMockBulk() + m.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.MatchedBy(func(p []byte) bool { + var body struct { + Doc struct { + Status string `json:"upgrade_status"` + } `json:"doc"` + } + if err := json.Unmarshal(p, &body); err != nil { + t.Fatal(err) + } + return body.Doc.Status == "retrying" + }), mock.Anything).Return(nil).Once() + return m + }, + }, { + name: "no more retries", + event: Event{ + Error: "upgrade error", + Payload: json.RawMessage(`{"retry":false}`), + }, + bulker: func(t *testing.T) *ftesting.MockBulk { + m := ftesting.NewMockBulk() + m.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.MatchedBy(func(p []byte) bool { + var body struct { + Doc struct { + Status string `json:"upgrade_status"` + } `json:"doc"` + } + if err := json.Unmarshal(p, &body); err != nil { + t.Fatal(err) + } + return body.Doc.Status == "failed" + }), mock.Anything).Return(nil).Once() + return m + }, + }} + cfg := &config.Server{ + Limits: config.ServerLimits{}, + } + agent := &model.Agent{ + ESDocument: model.ESDocument{Id: "ab12dcd8-bde0-4045-92dc-c4b27668d735"}, + Agent: &model.AgentMetadata{Version: "8.0.0"}, + } + ctx := context.Background() + cache, err := cache.New(config.Cache{NumCounters: 100, MaxCost: 100000}) + if err != nil { + t.Fatal(err) + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + logger := testlog.SetLogger(t) + bulker := tc.bulker(t) + ack := NewAckT(cfg, bulker, cache) + + err := ack.handleUpgrade(ctx, logger, agent, tc.event) + assert.NoError(t, err) + bulker.AssertExpectations(t) + }) + } +} diff --git a/internal/pkg/api/handleArtifacts.go b/internal/pkg/api/handleArtifacts.go index 13c0879c9..a5a362b3a 100644 --- a/internal/pkg/api/handleArtifacts.go +++ b/internal/pkg/api/handleArtifacts.go @@ -19,7 +19,6 @@ import ( "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/dl" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/elastic/fleet-server/v7/internal/pkg/model" "github.com/elastic/fleet-server/v7/internal/pkg/throttle" @@ -46,24 +45,17 @@ type ArtifactT struct { bulker bulk.Bulk cache cache.Cache esThrottle *throttle.Throttle - limit *limit.Limiter } func NewArtifactT(cfg *config.Server, bulker bulk.Bulk, cache cache.Cache) *ArtifactT { - log.Info(). - Interface("limits", cfg.Limits.ArtifactLimit). - Int("maxParallel", defaultMaxParallel). 
- Msg("Artifact install limits") - return &ArtifactT{ bulker: bulker, cache: cache, - limit: limit.NewLimiter(&cfg.Limits.ArtifactLimit), esThrottle: throttle.NewThrottle(defaultMaxParallel), } } -func (rt Router) handleArtifacts(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { +func (rt *Router) handleArtifacts(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { start := time.Now() var ( @@ -112,12 +104,6 @@ func (rt Router) handleArtifacts(w http.ResponseWriter, r *http.Request, ps http } func (at ArtifactT) handleArtifacts(zlog *zerolog.Logger, r *http.Request, id, sha2 string) (io.Reader, error) { - limitF, err := at.limit.Acquire() - if err != nil { - return nil, err - } - defer limitF() - // Authenticate the APIKey; retrieve agent record. // Note: This is going to be a bit slow even if we hit the cache on the api key. // In order to validate that the agent still has that api key, we fetch the agent record from elastic. @@ -131,10 +117,6 @@ func (at ArtifactT) handleArtifacts(zlog *zerolog.Logger, r *http.Request, id, s return ctx.Str(LogAccessAPIKeyID, agent.AccessAPIKeyID) }) - // Metrics; serenity now. - dfunc := cntArtifacts.IncStart() - defer dfunc() - return at.processRequest(r.Context(), *zlog, agent, id, sha2) } diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index d3c1323e4..6bbbbb8af 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -10,6 +10,7 @@ import ( "compress/gzip" "context" "encoding/json" + "fmt" "math/rand" "net/http" "reflect" @@ -21,7 +22,6 @@ import ( "github.com/elastic/fleet-server/v7/internal/pkg/checkin" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/dl" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/elastic/fleet-server/v7/internal/pkg/model" "github.com/elastic/fleet-server/v7/internal/pkg/monitor" @@ -47,7 +47,8 @@ const ( kEncodingGzip = "gzip" ) -func (rt Router) handleCheckin(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { +//nolint:dupl // function body calls different internal hander then handleAck +func (rt *Router) handleCheckin(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { start := time.Now() id := ps.ByName("id") @@ -60,17 +61,10 @@ func (rt Router) handleCheckin(w http.ResponseWriter, r *http.Request, ps httpro Logger() err := rt.ct.handleCheckin(&zlog, w, r, id) - if err != nil { cntCheckin.IncError(err) resp := NewHTTPErrResp(err) - // Log this as warn for visibility that limit has been reached. - // This allows customers to tune the configuration on detection of threshold. - if errors.Is(err, limit.ErrMaxLimit) { - resp.Level = zerolog.WarnLevel - } - zlog.WithLevel(resp.Level). Err(err). Int(ECSHTTPResponseCode, resp.StatusCode). @@ -93,7 +87,6 @@ type CheckinT struct { ad *action.Dispatcher tr *action.TokenResolver bulker bulk.Bulk - limit *limit.Limiter } func NewCheckinT( @@ -107,14 +100,6 @@ func NewCheckinT( tr *action.TokenResolver, bulker bulk.Bulk, ) *CheckinT { - - log.Info(). - Interface("limits", cfg.Limits.CheckinLimit). - Dur("long_poll_timeout", cfg.Timeouts.CheckinLongPoll). - Dur("long_poll_timestamp", cfg.Timeouts.CheckinTimestamp). - Dur("long_poll_jitter", cfg.Timeouts.CheckinJitter). 
- Msg("Checkin install limits") - ct := &CheckinT{ verCon: verCon, cfg: cfg, @@ -124,7 +109,6 @@ func NewCheckinT( gcp: gcp, ad: ad, tr: tr, - limit: limit.NewLimiter(&cfg.Limits.CheckinLimit), bulker: bulker, } @@ -132,15 +116,8 @@ func NewCheckinT( } func (ct *CheckinT) handleCheckin(zlog *zerolog.Logger, w http.ResponseWriter, r *http.Request, id string) error { - start := time.Now() - limitF, err := ct.limit.Acquire() - if err != nil { - return err - } - defer limitF() - agent, err := authAgent(r, &id, ct.bulker, ct.cache) if err != nil { return err @@ -158,11 +135,6 @@ func (ct *CheckinT) handleCheckin(zlog *zerolog.Logger, w http.ResponseWriter, r // Safely check if the agent version is different, return empty string otherwise newVer := agent.CheckDifferentVersion(ver) - - // Metrics; serenity now. - dfunc := cntCheckin.IncStart() - defer dfunc() - return ct.processRequest(*zlog, w, r, start, agent, newVer) } @@ -259,6 +231,7 @@ func (ct *CheckinT) processRequest(zlog zerolog.Logger, w http.ResponseWriter, r if err != nil { return err } + pendingActions = filterActions(agent.Id, pendingActions) actions, ackToken = convertActions(agent.Id, pendingActions) if len(actions) == 0 { @@ -269,6 +242,7 @@ func (ct *CheckinT) processRequest(zlog zerolog.Logger, w http.ResponseWriter, r return ctx.Err() case acdocs := <-actCh: var acs []ActionResp + acdocs = filterActions(agent.Id, acdocs) acs, ackToken = convertActions(agent.Id, acdocs) actions = append(actions, acs...) break LOOP @@ -404,6 +378,22 @@ func (ct *CheckinT) fetchAgentPendingActions(ctx context.Context, seqno sqn.SeqN return actions, err } +// filterActions removes the POLICY_CHANGE action from the passed list. +// The source of this list are documents from the fleet actions index. +// The POLICY_CHANGE action that the agent receives are generated by the fleet-server when it detects a different policy in processRequest() +func filterActions(agentID string, actions []model.Action) []model.Action { + resp := make([]model.Action, 0, len(actions)) + for _, action := range actions { + if action.Type == TypePolicyChange { + log.Info().Str("agent_id", agentID).Str("action_id", action.ActionID).Msg("Removing POLICY_CHANGE action found in index from check in response") + continue + } + resp = append(resp, action) + } + return resp + +} + func convertActions(agentID string, actions []model.Action) ([]ActionResp, string) { var ackToken string sz := len(actions) @@ -436,13 +426,13 @@ func convertActions(agentID string, actions []model.Action) ([]ActionResp, strin // func processPolicy(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, agentID string, pp *policy.ParsedPolicy) (*ActionResp, error) { zlog = zlog.With(). - Str("ctx", "processPolicy"). - Int64("policyRevision", pp.Policy.RevisionIdx). - Int64("policyCoordinator", pp.Policy.CoordinatorIdx). + Str("fleet.ctx", "processPolicy"). + Int64("fleet.policyRevision", pp.Policy.RevisionIdx). + Int64("fleet.policyCoordinator", pp.Policy.CoordinatorIdx). Str(LogPolicyID, pp.Policy.PolicyID). Logger() - // Repull and decode the agent object. Do not trust the cache. + // Repull and decode the agent object. Do not trust the cache. 
agent, err := dl.FindAgent(ctx, bulker, dl.QueryAgentByID, dl.FieldID, agentID) if err != nil { zlog.Error().Err(err).Msg("fail find agent record") @@ -452,7 +442,6 @@ func processPolicy(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, a // Parse the outputs maps in order to prepare the outputs const outputsProperty = "outputs" outputs, err := smap.Parse(pp.Fields[outputsProperty]) - if err != nil { return nil, err } @@ -464,9 +453,9 @@ func processPolicy(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, a // Iterate through the policy outputs and prepare them for _, policyOutput := range pp.Outputs { err = policyOutput.Prepare(ctx, zlog, bulker, &agent, outputs) - if err != nil { - return nil, err + return nil, fmt.Errorf("failed to prepare output %q: %w", + policyOutput.Name, err) } } diff --git a/internal/pkg/api/handleCheckin_test.go b/internal/pkg/api/handleCheckin_test.go new file mode 100644 index 000000000..569963cbb --- /dev/null +++ b/internal/pkg/api/handleCheckin_test.go @@ -0,0 +1,103 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build !integration +// +build !integration + +package api + +import ( + "encoding/json" + "testing" + + "github.com/elastic/fleet-server/v7/internal/pkg/model" + "github.com/stretchr/testify/assert" +) + +func TestConvertActions(t *testing.T) { + tests := []struct { + name string + actions []model.Action + resp []ActionResp + token string + }{{ + name: "empty actions", + actions: nil, + resp: []ActionResp{}, + token: "", + }, { + name: "single action", + actions: []model.Action{{ActionID: "1234"}}, + resp: []ActionResp{{ + AgentID: "agent-id", + ID: "1234", + Data: json.RawMessage(nil), + }}, + token: "", + }, { + name: "multiple actions", + actions: []model.Action{ + {ActionID: "1234"}, + {ActionID: "5678"}, + }, + resp: []ActionResp{{ + AgentID: "agent-id", + ID: "1234", + Data: json.RawMessage(nil), + }, { + AgentID: "agent-id", + ID: "5678", + Data: json.RawMessage(nil), + }}, + token: "", + }} + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + resp, token := convertActions("agent-id", tc.actions) + assert.Equal(t, tc.resp, resp) + assert.Equal(t, tc.token, token) + }) + } +} + +func TestFilterActions(t *testing.T) { + tests := []struct { + name string + actions []model.Action + resp []model.Action + }{{ + name: "empty list", + actions: []model.Action{}, + resp: []model.Action{}, + }, { + name: "nothing filtered", + actions: []model.Action{{ + ActionID: "1234", + }, { + ActionID: "5678", + }}, + resp: []model.Action{{ + ActionID: "1234", + }, { + ActionID: "5678", + }}, + }, { + name: "filter POLICY_CHANGE action", + actions: []model.Action{{ + ActionID: "1234", + Type: TypePolicyChange, + }, { + ActionID: "5678", + }}, + resp: []model.Action{{ + ActionID: "5678", + }}, + }} + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + resp := filterActions("agent-id", tc.actions) + assert.Equal(t, tc.resp, resp) + }) + } +} diff --git a/internal/pkg/api/handleChecking_test.go b/internal/pkg/api/handleChecking_test.go deleted file mode 100644 index 151f56055..000000000 --- a/internal/pkg/api/handleChecking_test.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. 
Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -//go:build !integration -// +build !integration - -package api - -import ( - "encoding/json" - "testing" - - "github.com/elastic/fleet-server/v7/internal/pkg/model" - "github.com/stretchr/testify/assert" -) - -func TestConvertActionsEmpty(t *testing.T) { - resp, token := convertActions("1234", nil) - assert.Equal(t, resp, []ActionResp{}) - assert.Equal(t, token, "") -} - -func TestConvertActions(t *testing.T) { - actions := []model.Action{ - { - ActionID: "1234", - }, - } - resp, token := convertActions("agent-id", actions) - assert.Equal(t, resp, []ActionResp{ - { - AgentID: "agent-id", - ID: "1234", - Data: json.RawMessage(nil), - }, - }) - assert.Equal(t, token, "") -} diff --git a/internal/pkg/api/handleEnroll.go b/internal/pkg/api/handleEnroll.go index a3c2f9833..f08e6d770 100644 --- a/internal/pkg/api/handleEnroll.go +++ b/internal/pkg/api/handleEnroll.go @@ -16,7 +16,6 @@ import ( "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/dl" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/elastic/fleet-server/v7/internal/pkg/model" "github.com/elastic/fleet-server/v7/internal/pkg/rollback" @@ -49,26 +48,19 @@ type EnrollerT struct { cfg *config.Server bulker bulk.Bulk cache cache.Cache - limit *limit.Limiter } func NewEnrollerT(verCon version.Constraints, cfg *config.Server, bulker bulk.Bulk, c cache.Cache) (*EnrollerT, error) { - - log.Info(). - Interface("limits", cfg.Limits.EnrollLimit). - Msg("Setting config enroll_limit") - return &EnrollerT{ verCon: verCon, cfg: cfg, - limit: limit.NewLimiter(&cfg.Limits.EnrollLimit), bulker: bulker, cache: c, }, nil } -func (rt Router) handleEnroll(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { +func (rt *Router) handleEnroll(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { start := time.Now() // Work around wonky router rule @@ -130,13 +122,6 @@ func (rt Router) handleEnroll(w http.ResponseWriter, r *http.Request, ps httprou } func (et *EnrollerT) handleEnroll(rb *rollback.Rollback, zlog *zerolog.Logger, w http.ResponseWriter, r *http.Request) (*EnrollResponse, error) { - - limitF, err := et.limit.Acquire() - if err != nil { - return nil, err - } - defer limitF() - key, err := authAPIKey(r, et.bulker, et.cache) if err != nil { return nil, err @@ -152,10 +137,6 @@ func (et *EnrollerT) handleEnroll(rb *rollback.Rollback, zlog *zerolog.Logger, w return nil, err } - // Metrics; serenity now. 
- dfunc := cntEnroll.IncStart() - defer dfunc() - return et.processRequest(rb, *zlog, w, r, key.ID, ver) } @@ -187,7 +168,13 @@ func (et *EnrollerT) processRequest(rb *rollback.Rollback, zlog zerolog.Logger, return et._enroll(r.Context(), rb, zlog, req, erec.PolicyID, ver) } -func (et *EnrollerT) _enroll(ctx context.Context, rb *rollback.Rollback, zlog zerolog.Logger, req *EnrollRequest, policyID, ver string) (*EnrollResponse, error) { +func (et *EnrollerT) _enroll( + ctx context.Context, + rb *rollback.Rollback, + zlog zerolog.Logger, + req *EnrollRequest, + policyID, + ver string) (*EnrollResponse, error) { if req.SharedID != "" { // TODO: Support pre-existing install @@ -293,7 +280,7 @@ func invalidateAPIKey(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk LOOP: for { - _, err := bulker.APIKeyRead(ctx, apikeyID) + _, err := bulker.APIKeyRead(ctx, apikeyID, false) switch { case err == nil: @@ -427,7 +414,7 @@ func generateAccessAPIKey(ctx context.Context, bulk bulk.Bulk, agentID string) ( agentID, "", []byte(kFleetAccessRolesJSON), - apikey.NewMetadata(agentID, apikey.TypeAccess), + apikey.NewMetadata(agentID, "", apikey.TypeAccess), ) } diff --git a/internal/pkg/api/handleStatus.go b/internal/pkg/api/handleStatus.go index 917eca05c..922a8251a 100644 --- a/internal/pkg/api/handleStatus.go +++ b/internal/pkg/api/handleStatus.go @@ -17,7 +17,6 @@ import ( "github.com/elastic/fleet-server/v7/internal/pkg/bulk" "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/config" - "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/julienschmidt/httprouter" @@ -33,7 +32,6 @@ type AuthFunc func(*http.Request) (*apikey.APIKey, error) type StatusT struct { cfg *config.Server - limit *limit.Limiter bulk bulk.Bulk cache cache.Cache authfn AuthFunc @@ -42,15 +40,10 @@ type StatusT struct { type OptFunc func(*StatusT) func NewStatusT(cfg *config.Server, bulker bulk.Bulk, cache cache.Cache, opts ...OptFunc) *StatusT { - log.Info(). - Interface("limits", cfg.Limits.StatusLimit). - Msg("Setting config status_limits") - st := &StatusT{ cfg: cfg, bulk: bulker, cache: cache, - limit: limit.NewLimiter(&cfg.Limits.StatusLimit), } st.authfn = st.authenticate @@ -68,14 +61,7 @@ func (st StatusT) authenticate(r *http.Request) (*apikey.APIKey, error) { return authAPIKey(r, st.bulk, st.cache) } -func (st StatusT) handleStatus(_ *zerolog.Logger, r *http.Request, rt *Router) (resp StatusResponse, state client.UnitState, err error) { - limitF, err := st.limit.Acquire() - // When failing to acquire a limiter send an error response. - if err != nil { - return - } - defer limitF() - +func (st StatusT) handleStatus(_ *zerolog.Logger, r *http.Request, rt *Router) (resp StatusResponse, state client.UnitState) { authed := true if _, aerr := st.authfn(r); aerr != nil { log.Debug().Err(aerr).Msg("unauthenticated status request, return short status response only") @@ -96,16 +82,11 @@ func (st StatusT) handleStatus(_ *zerolog.Logger, r *http.Request, rt *Router) ( } } - return resp, state, nil - + return resp, state } -func (rt Router) handleStatus(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { +func (rt *Router) handleStatus(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { start := time.Now() - - dfunc := cntStatus.IncStart() - defer dfunc() - reqID := r.Header.Get(logger.HeaderRequestID) zlog := log.With(). 
@@ -113,22 +94,7 @@ func (rt Router) handleStatus(w http.ResponseWriter, r *http.Request, _ httprout Str("mod", kStatusMod). Logger() - resp, state, err := rt.st.handleStatus(&zlog, r, &rt) - if err != nil { - cntStatus.IncError(err) - resp := NewHTTPErrResp(err) - - zlog.WithLevel(resp.Level). - Err(err). - Int(ECSHTTPResponseCode, resp.StatusCode). - Int64(ECSEventDuration, time.Since(start).Nanoseconds()). - Msg("fail status") - - if rerr := resp.Write(w); rerr != nil { - zlog.Error().Err(rerr).Msg("fail writing error response") - } - return - } + resp, state := rt.st.handleStatus(&zlog, r, rt) data, err := json.Marshal(&resp) if err != nil { diff --git a/internal/pkg/api/handleStatus_test.go b/internal/pkg/api/handleStatus_test.go index 19359cd79..fd58cb824 100644 --- a/internal/pkg/api/handleStatus_test.go +++ b/internal/pkg/api/handleStatus_test.go @@ -52,7 +52,7 @@ func TestHandleStatus(t *testing.T) { cfg := &config.Server{} cfg.InitDefaults() - c, err := cache.New(cache.Config{NumCounters: 100, MaxCost: 100000}) + c, err := cache.New(config.Cache{NumCounters: 100, MaxCost: 100000}) require.NoError(t, err) authfnOk := func(r *http.Request) (*apikey.APIKey, error) { diff --git a/internal/pkg/api/router.go b/internal/pkg/api/router.go index bfa34dc05..7d219cbdf 100644 --- a/internal/pkg/api/router.go +++ b/internal/pkg/api/router.go @@ -6,10 +6,16 @@ package api import ( "context" + "crypto/tls" + "errors" + "net" "net/http" + "github.com/elastic/elastic-agent-libs/transport/tlscommon" "github.com/elastic/fleet-server/v7/internal/pkg/build" "github.com/elastic/fleet-server/v7/internal/pkg/bulk" + "github.com/elastic/fleet-server/v7/internal/pkg/config" + "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" "github.com/elastic/fleet-server/v7/internal/pkg/policy" "github.com/julienschmidt/httprouter" @@ -27,7 +33,8 @@ const ( ) type Router struct { - ctx context.Context + ctx context.Context // used only by handleEnroll, set at start of Run func + cfg *config.Server bulker bulk.Bulk ct *CheckinT et *EnrollerT @@ -35,12 +42,13 @@ type Router struct { ack *AckT st *StatusT sm policy.SelfMonitor + tracer *apm.Tracer bi build.Info } -func NewRouter(ctx context.Context, bulker bulk.Bulk, ct *CheckinT, et *EnrollerT, at *ArtifactT, ack *AckT, st *StatusT, sm policy.SelfMonitor, tracer *apm.Tracer, bi build.Info) *httprouter.Router { - r := Router{ - ctx: ctx, +func NewRouter(cfg *config.Server, bulker bulk.Bulk, ct *CheckinT, et *EnrollerT, at *ArtifactT, ack *AckT, st *StatusT, sm policy.SelfMonitor, tracer *apm.Tracer, bi build.Info) *Router { + rt := &Router{ + cfg: cfg, bulker: bulker, ct: ct, et: et, @@ -48,9 +56,18 @@ func NewRouter(ctx context.Context, bulker bulk.Bulk, ct *CheckinT, et *Enroller at: at, ack: ack, st: st, + tracer: tracer, bi: bi, } + return rt +} + +// Create a new httprouter, the passed addr is only added as a label in log messages +func (rt *Router) newHTTPRouter(addr string) *httprouter.Router { + log.Info().Str("addr", addr).Interface("limits", rt.cfg.Limits).Msg("fleet-server creating new limiter") + limiter := limit.NewHTTPWrapper(addr, &rt.cfg.Limits) + routes := []struct { method string path string @@ -59,43 +76,43 @@ func NewRouter(ctx context.Context, bulker bulk.Bulk, ct *CheckinT, et *Enroller { http.MethodGet, RouteStatus, - r.handleStatus, + limiter.WrapStatus(rt.handleStatus, &cntStatus), }, { http.MethodPost, RouteEnroll, - 
r.handleEnroll, + limiter.WrapEnroll(rt.handleEnroll, &cntEnroll), }, { http.MethodPost, RouteCheckin, - r.handleCheckin, + limiter.WrapCheckin(rt.handleCheckin, &cntCheckin), }, { http.MethodPost, RouteAcks, - r.handleAcks, + limiter.WrapAck(rt.handleAcks, &cntAcks), }, { http.MethodGet, RouteArtifacts, - r.handleArtifacts, + limiter.WrapArtifact(rt.handleArtifacts, &cntArtifacts), }, } router := httprouter.New() - // Install routes for _, rte := range routes { log.Info(). + Str("addr", addr). Str("method", rte.method). Str("path", rte.path). Msg("fleet-server route added") handler := rte.handler - if tracer != nil { + if rt.tracer != nil { handler = apmhttprouter.Wrap( - rte.handler, rte.path, apmhttprouter.WithTracer(tracer), + rte.handler, rte.path, apmhttprouter.WithTracer(rt.tracer), ) } router.Handle( @@ -104,8 +121,120 @@ logger.HTTPHandler(handler), ) } + log.Info().Str("addr", addr).Msg("fleet-server routes set up") + return router +} - log.Info().Msg("fleet-server routes set up") +// Run starts the api server on the listeners configured in the config. +// Each listener has a unique limit.Limiter to allow for non-global rate limits. +func (rt *Router) Run(ctx context.Context) error { + rt.ctx = ctx - return router + listeners := rt.cfg.BindEndpoints() + rdto := rt.cfg.Timeouts.Read + wrto := rt.cfg.Timeouts.Write + idle := rt.cfg.Timeouts.Idle + rdhr := rt.cfg.Timeouts.ReadHeader + mhbz := rt.cfg.Limits.MaxHeaderByteSize + bctx := func(net.Listener) context.Context { return ctx } + + errChan := make(chan error) + baseCtx, cancel := context.WithCancel(ctx) + defer cancel() + + for _, addr := range listeners { + log.Info(). + Str("bind", addr). + Dur("rdTimeout", rdto). + Dur("wrTimeout", wrto). + Msg("server listening") + + server := http.Server{ + Addr: addr, + ReadTimeout: rdto, + WriteTimeout: wrto, + IdleTimeout: idle, + ReadHeaderTimeout: rdhr, + Handler: rt.newHTTPRouter(addr), // Note that we use a different router for each listener instead of wrapping one router with different middleware instances, as this is cleaner + BaseContext: bctx, + ConnState: diagConn, + MaxHeaderBytes: mhbz, + ErrorLog: errLogger(), + } + + forceCh := make(chan struct{}) + defer close(forceCh) + + // handler to close server + go func() { + select { + case <-ctx.Done(): + log.Debug().Msg("force server close on ctx.Done()") + err := server.Close() + if err != nil { + log.Error().Err(err).Msg("error while closing server") + } + case <-forceCh: + log.Debug().Msg("go routine forced closed on exit") + } + }() + + var listenCfg net.ListenConfig + + ln, err := listenCfg.Listen(ctx, "tcp", addr) + if err != nil { + return err + } + + // Bind the deferred Close() to the stack variable to handle case where 'ln' is wrapped + defer func() { + err := ln.Close() + if err != nil { + log.Error().Err(err).Msg("error while closing listener.") + } + }() + + // Conn Limiter must be before the TLS handshake in the stack; + // The server should not eat the cost of the handshake if there + // is no capacity to service the connection. + // Also, it appears the HTTP2 implementation depends on the tls.Listener + // being at the top of the stack.
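Note: the listener stack is assembled bottom-up, with connection limiting wrapped around the raw TCP listener before TLS so surplus connections are refused without paying for a handshake. A sketch of that ordering; netutil.LimitListener stands in for the project's own connection limiter.

package main

import (
	"crypto/tls"
	"net"

	"golang.org/x/net/netutil"
)

// buildListener mirrors the ordering in Run: limit first, TLS second.
func buildListener(addr string, maxConns int, tlsCfg *tls.Config) (net.Listener, error) {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return nil, err
	}
	ln = netutil.LimitListener(ln, maxConns) // refuse cheaply, pre-handshake
	if tlsCfg != nil {
		// HTTP/2 must be opted into explicitly when wrapping manually.
		tlsCfg.NextProtos = []string{"h2", "http/1.1"}
		ln = tls.NewListener(ln, tlsCfg)
	}
	return ln, nil
}

func main() {
	ln, err := buildListener("127.0.0.1:0", 128, nil)
	if err != nil {
		panic(err)
	}
	defer ln.Close()
}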
+ ln = wrapConnLimitter(ctx, ln, rt.cfg) + + if rt.cfg.TLS != nil && rt.cfg.TLS.IsEnabled() { + commonTLSCfg, err := tlscommon.LoadTLSServerConfig(rt.cfg.TLS) + if err != nil { + return err + } + server.TLSConfig = commonTLSCfg.BuildServerConfig(rt.cfg.Host) + + // Must enable http/2 in the configuration explicitly. + // (see https://golang.org/pkg/net/http/#Server.Serve) + server.TLSConfig.NextProtos = []string{"h2", "http/1.1"} + + ln = tls.NewListener(ln, server.TLSConfig) + + } else { + log.Warn().Msg("Exposed over insecure HTTP; enablement of TLS is strongly recommended") + } + + log.Debug().Msgf("Listening on %s", addr) + + go func(_ context.Context, errChan chan error, ln net.Listener) { + if err := server.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) { + errChan <- err + } + }(baseCtx, errChan, ln) + + } + + select { + case err := <-errChan: + if !errors.Is(err, context.Canceled) { + return err + } + case <-baseCtx.Done(): + } + + return nil } diff --git a/internal/pkg/api/server_test.go b/internal/pkg/api/router_test.go similarity index 91% rename from internal/pkg/api/server_test.go rename to internal/pkg/api/router_test.go index 44a223c6c..9844c0ec9 100644 --- a/internal/pkg/api/server_test.go +++ b/internal/pkg/api/router_test.go @@ -38,7 +38,7 @@ func TestRun(t *testing.T) { cfg.Port = port verCon := mustBuildConstraints("8.0.0") - c, err := cache.New(cache.Config{NumCounters: 100, MaxCost: 100000}) + c, err := cache.New(config.Cache{NumCounters: 100, MaxCost: 100000}) require.NoError(t, err) bulker := ftesting.NewMockBulk() pim := mock.NewMockMonitor() @@ -48,13 +48,13 @@ func TestRun(t *testing.T) { et, err := NewEnrollerT(verCon, cfg, nil, c) require.NoError(t, err) - router := NewRouter(ctx, bulker, ct, et, nil, nil, nil, nil, nil, fbuild.Info{}) + router := NewRouter(cfg, bulker, ct, et, nil, nil, nil, nil, nil, fbuild.Info{}) errCh := make(chan error) var wg sync.WaitGroup wg.Add(1) go func() { - err = Run(ctx, router, cfg) + err = router.Run(ctx) wg.Done() }() var errFromChan error diff --git a/internal/pkg/api/schema.go b/internal/pkg/api/schema.go index 5d04d563e..03d65966c 100644 --- a/internal/pkg/api/schema.go +++ b/internal/pkg/api/schema.go @@ -79,11 +79,10 @@ type EnrollResponse struct { type CheckinRequest struct { Status string `json:"status"` + Message string `json:"message"` AckToken string `json:"ack_token,omitempty"` - Events []Event `json:"events"` LocalMeta json.RawMessage `json:"local_metadata"` - Message string `json:"message"` // V2 Agent message - Components json.RawMessage `json:"components,omitempty"` // V2 Agent components + Components json.RawMessage `json:"components,omitempty"` } type CheckinResponse struct { diff --git a/internal/pkg/api/server.go b/internal/pkg/api/server.go index 8787b2c34..32ab05358 100644 --- a/internal/pkg/api/server.go +++ b/internal/pkg/api/server.go @@ -6,13 +6,10 @@ package api import ( "context" - "crypto/tls" - "errors" slog "log" "net" "net/http" - "github.com/elastic/elastic-agent-libs/transport/tlscommon" "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/limit" "github.com/elastic/fleet-server/v7/internal/pkg/logger" @@ -39,117 +36,6 @@ func diagConn(c net.Conn, s http.ConnState) { } } -// Run runs the passed router with the config. 
-func Run(ctx context.Context, router http.Handler, cfg *config.Server) error { - listeners := cfg.BindEndpoints() - rdto := cfg.Timeouts.Read - wrto := cfg.Timeouts.Write - idle := cfg.Timeouts.Idle - rdhr := cfg.Timeouts.ReadHeader - mhbz := cfg.Limits.MaxHeaderByteSize - bctx := func(net.Listener) context.Context { return ctx } - - errChan := make(chan error) - cancelCtx, cancel := context.WithCancel(ctx) - defer cancel() - - for _, addr := range listeners { - log.Info(). - Str("bind", addr). - Dur("rdTimeout", rdto). - Dur("wrTimeout", wrto). - Msg("server listening") - - server := http.Server{ - Addr: addr, - ReadTimeout: rdto, - WriteTimeout: wrto, - IdleTimeout: idle, - ReadHeaderTimeout: rdhr, - Handler: router, - BaseContext: bctx, - ConnState: diagConn, - MaxHeaderBytes: mhbz, - ErrorLog: errLogger(), - } - - forceCh := make(chan struct{}) - defer close(forceCh) - - // handler to close server - go func() { - select { - case <-ctx.Done(): - log.Debug().Msg("force server close on ctx.Done()") - err := server.Close() - if err != nil { - log.Error().Err(err).Msg("error while closing server") - } - case <-forceCh: - log.Debug().Msg("go routine forced closed on exit") - } - }() - - var listenCfg net.ListenConfig - - ln, err := listenCfg.Listen(ctx, "tcp", addr) - if err != nil { - return err - } - - // Bind the deferred Close() to the stack variable to handle case where 'ln' is wrapped - defer func() { - err := ln.Close() - if err != nil { - log.Error().Err(err).Msg("error while closing listener.") - } - }() - - // Conn Limiter must be before the TLS handshake in the stack; - // The server should not eat the cost of the handshake if there - // is no capacity to service the connection. - // Also, it appears the HTTP2 implementation depends on the tls.Listener - // being at the top of the stack. - ln = wrapConnLimitter(ctx, ln, cfg) - - if cfg.TLS != nil && cfg.TLS.IsEnabled() { - commonTLSCfg, err := tlscommon.LoadTLSServerConfig(cfg.TLS) - if err != nil { - return err - } - server.TLSConfig = commonTLSCfg.BuildServerConfig(cfg.Host) - - // Must enable http/2 in the configuration explicitly. 
- // (see https://golang.org/pkg/net/http/#Server.Serve) - server.TLSConfig.NextProtos = []string{"h2", "http/1.1"} - - ln = tls.NewListener(ln, server.TLSConfig) - - } else { - log.Warn().Msg("Exposed over insecure HTTP; enablement of TLS is strongly recommended") - } - - log.Debug().Msgf("Listening on %s", addr) - - go func(_ context.Context, errChan chan error, ln net.Listener) { - if err := server.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) { - errChan <- err - } - }(cancelCtx, errChan, ln) - - } - - select { - case err := <-errChan: - if !errors.Is(err, context.Canceled) { - return err - } - case <-cancelCtx.Done(): - } - - return nil -} - func wrapConnLimitter(_ context.Context, ln net.Listener, cfg *config.Server) net.Listener { hardLimit := cfg.Limits.MaxConnections diff --git a/internal/pkg/apikey/apikey.go b/internal/pkg/apikey/apikey.go index 4924a647b..05551f272 100644 --- a/internal/pkg/apikey/apikey.go +++ b/internal/pkg/apikey/apikey.go @@ -6,12 +6,17 @@ package apikey import ( + "context" "encoding/base64" - "errors" + "encoding/json" "fmt" "net/http" "strings" "unicode/utf8" + + "github.com/elastic/go-elasticsearch/v7" + "github.com/elastic/go-elasticsearch/v7/esapi" + "github.com/pkg/errors" ) const ( @@ -28,6 +33,66 @@ var ( var AuthKey = http.CanonicalHeaderKey("Authorization") +// APIKeyMetadata tracks Metadata associated with an APIKey. +type APIKeyMetadata struct { + ID string + Metadata Metadata + RoleDescriptors json.RawMessage +} + +// Read gathers APIKeyMetadata from Elasticsearch using the given client. +func Read(ctx context.Context, client *elasticsearch.Client, id string, withOwner bool) (*APIKeyMetadata, error) { + + opts := []func(*esapi.SecurityGetAPIKeyRequest){ + client.Security.GetAPIKey.WithContext(ctx), + client.Security.GetAPIKey.WithID(id), + } + if withOwner { + opts = append(opts, client.Security.GetAPIKey.WithOwner(true)) + } + + res, err := client.Security.GetAPIKey( + opts..., + ) + + if err != nil { + return nil, fmt.Errorf("request to elasticsearch failed: %w", err) + } + defer res.Body.Close() + + if res.IsError() { + return nil, fmt.Errorf("%s: %w", res.String(), ErrAPIKeyNotFound) + } + + type APIKeyResponse struct { + ID string `json:"id"` + Metadata Metadata `json:"metadata"` + RoleDescriptors json.RawMessage `json:"role_descriptors"` + } + type GetAPIKeyResponse struct { + APIKeys []APIKeyResponse `json:"api_keys"` + } + + var resp GetAPIKeyResponse + d := json.NewDecoder(res.Body) + if err = d.Decode(&resp); err != nil { + return nil, fmt.Errorf( + "could not decode elasticsearch GetAPIKeyResponse: %w", err) + } + + if len(resp.APIKeys) == 0 { + return nil, ErrAPIKeyNotFound + } + + first := resp.APIKeys[0] + + return &APIKeyMetadata{ + ID: first.ID, + Metadata: first.Metadata, + RoleDescriptors: first.RoleDescriptors, + }, nil +} + // APIKey is used to represent an Elasticsearch API Key. 
type APIKey struct { ID string diff --git a/internal/pkg/apikey/apikey_integration_test.go b/internal/pkg/apikey/apikey_integration_test.go index 5c4e3b97c..ce4529254 100644 --- a/internal/pkg/apikey/apikey_integration_test.go +++ b/internal/pkg/apikey/apikey_integration_test.go @@ -30,7 +30,7 @@ const testFleetRoles = ` } ` -func TestCreateAPIKeyWithMetadata(t *testing.T) { +func TestRead_existingKey(t *testing.T) { ctx, cn := context.WithCancel(context.Background()) defer cn() @@ -48,13 +48,13 @@ func TestCreateAPIKeyWithMetadata(t *testing.T) { agentID := uuid.Must(uuid.NewV4()).String() name := uuid.Must(uuid.NewV4()).String() akey, err := Create(ctx, es, name, "", "true", []byte(testFleetRoles), - NewMetadata(agentID, TypeAccess)) + NewMetadata(agentID, "", TypeAccess)) if err != nil { t.Fatal(err) } // Get the key and verify that metadata was saved correctly - aKeyMeta, err := Read(ctx, es, akey.ID) + aKeyMeta, err := Read(ctx, es, akey.ID, false) if err != nil { t.Fatal(err) } @@ -79,9 +79,99 @@ func TestCreateAPIKeyWithMetadata(t *testing.T) { t.Error(diff) } - // Try to get the key that doesn't exists, expect ErrApiKeyNotFound - _, err = Read(ctx, es, "0000000000000") +} + +func TestRead_noKey(t *testing.T) { + ctx, cn := context.WithCancel(context.Background()) + defer cn() + + cfg := elasticsearch.Config{ + Username: "elastic", + Password: "changeme", + } + + es, err := elasticsearch.NewClient(cfg) + if err != nil { + t.Fatal(err) + } + + // Try to get the key that doesn't exist, expect ErrApiKeyNotFound + _, err = Read(ctx, es, "0000000000000", false) if !errors.Is(err, ErrAPIKeyNotFound) { - t.Errorf("Unexpected error type: %v", err) + t.Errorf("Unexpected error: %v", err) + } +} + +func TestCreateAPIKeyWithMetadata(t *testing.T) { + tts := []struct { + name string + outputName string + }{ + {name: "with metadata.output_name", outputName: "a_output_name"}, + {name: "without metadata.output_name"}, + } + + for _, tt := range tts { + t.Run(tt.name, func(t *testing.T) { + ctx, cn := context.WithCancel(context.Background()) + defer cn() + + cfg := elasticsearch.Config{ + Username: "elastic", + Password: "changeme", + } + + es, err := elasticsearch.NewClient(cfg) + if err != nil { + t.Fatal(err) + } + + // Create the API key + agentID := uuid.Must(uuid.NewV4()).String() + name := uuid.Must(uuid.NewV4()).String() + outputName := tt.outputName + apiKey, err := Create( + ctx, + es, + name, + "", + "true", + []byte(testFleetRoles), + NewMetadata(agentID, outputName, TypeAccess)) + if err != nil { + t.Fatal(err) + } + + // Get the API key and verify that the metadata was saved correctly + aKeyMeta, err := Read(ctx, es, apiKey.ID, false) + if err != nil { + t.Fatal(err) + } + + diff := cmp.Diff(ManagedByFleetServer, aKeyMeta.Metadata.ManagedBy) + if diff != "" { + t.Error(diff) + } + + diff = cmp.Diff(true, aKeyMeta.Metadata.Managed) + if diff != "" { + t.Error(diff) + } + + diff = cmp.Diff(agentID, aKeyMeta.Metadata.AgentID) + if diff != "" { + t.Error(diff) + } + + diff = cmp.Diff(outputName, aKeyMeta.Metadata.OutputName) + if diff != "" { + t.Error(diff) + } + + diff = cmp.Diff(TypeAccess.String(), aKeyMeta.Metadata.Type) + if diff != "" { + t.Error(diff) + } + }) } } diff --git a/internal/pkg/apikey/create.go b/internal/pkg/apikey/create.go index f3cee99f8..de61390c3 100644 --- a/internal/pkg/apikey/create.go +++ b/internal/pkg/apikey/create.go @@ -42,7 +42,6 @@ func Create(ctx context.Context, client *elasticsearch.Client, name, ttl, refres bytes.NewReader(body), opts..., ) - if err 
!= nil { return nil, err } diff --git a/internal/pkg/apikey/get.go b/internal/pkg/apikey/get.go deleted file mode 100644 index 5d931c670..000000000 --- a/internal/pkg/apikey/get.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package apikey - -import ( - "context" - "encoding/json" - - "github.com/elastic/go-elasticsearch/v7" - "github.com/elastic/go-elasticsearch/v7/esapi" - "github.com/pkg/errors" -) - -// APIKetMetadata tracks Metadata associated with an APIKey. -type APIKeyMetadata struct { - ID string - Metadata Metadata -} - -// Read gathers APIKeyMetadata from Elasticsearch using the given client. -func Read(ctx context.Context, client *elasticsearch.Client, id string) (*APIKeyMetadata, error) { - - opts := []func(*esapi.SecurityGetAPIKeyRequest){ - client.Security.GetAPIKey.WithContext(ctx), - client.Security.GetAPIKey.WithID(id), - } - - res, err := client.Security.GetAPIKey( - opts..., - ) - - if err != nil { - return nil, err - } - defer res.Body.Close() - - if res.IsError() { - err = errors.Wrap(ErrAPIKeyNotFound, res.String()) - return nil, err - } - - type APIKeyResponse struct { - ID string `json:"id"` - Metadata Metadata `json:"metadata"` - } - type GetAPIKeyResponse struct { - APIKeys []APIKeyResponse `json:"api_keys"` - } - - var resp GetAPIKeyResponse - d := json.NewDecoder(res.Body) - if err = d.Decode(&resp); err != nil { - return nil, err - } - - if len(resp.APIKeys) == 0 { - return nil, ErrAPIKeyNotFound - } - - first := resp.APIKeys[0] - - return &APIKeyMetadata{ - ID: first.ID, - Metadata: first.Metadata, - }, nil -} diff --git a/internal/pkg/apikey/invalidate.go b/internal/pkg/apikey/invalidate.go index 421662388..6c5d5d304 100644 --- a/internal/pkg/apikey/invalidate.go +++ b/internal/pkg/apikey/invalidate.go @@ -38,7 +38,6 @@ func Invalidate(ctx context.Context, client *elasticsearch.Client, ids ...string bytes.NewReader(body), opts..., ) - if err != nil { return fmt.Errorf("InvalidateAPIKey: %w", err) } diff --git a/internal/pkg/apikey/metadata.go b/internal/pkg/apikey/metadata.go index c80997c7b..d00380c01 100644 --- a/internal/pkg/apikey/metadata.go +++ b/internal/pkg/apikey/metadata.go @@ -19,18 +19,20 @@ func (t Type) String() string { // Metadata is additional information associated with an APIKey. type Metadata struct { - AgentID string `json:"agent_id,omitempty"` - Managed bool `json:"managed,omitempty"` - ManagedBy string `json:"managed_by,omitempty"` - Type string `json:"type,omitempty"` + AgentID string `json:"agent_id,omitempty"` + Managed bool `json:"managed,omitempty"` + ManagedBy string `json:"managed_by,omitempty"` + OutputName string `json:"output_name,omitempty"` + Type string `json:"type,omitempty"` } // NewMetadata returns Metadata for the given agentID. 
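+// The outputName, when non-empty, is stored as metadata.output_name so the
+// key can be traced back to the policy output it was created for (see the
+// new OutputName field above).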
-func NewMetadata(agentID string, typ Type) Metadata { +func NewMetadata(agentID string, outputName string, typ Type) Metadata { return Metadata{ - AgentID: agentID, - Managed: true, - ManagedBy: ManagedByFleetServer, - Type: typ.String(), + AgentID: agentID, + Managed: true, + ManagedBy: ManagedByFleetServer, + OutputName: outputName, + Type: typ.String(), } } diff --git a/internal/pkg/bulk/block.go b/internal/pkg/bulk/block.go index 28c80927e..c2535172e 100644 --- a/internal/pkg/bulk/block.go +++ b/internal/pkg/bulk/block.go @@ -43,6 +43,7 @@ const ( ActionDelete ActionIndex ActionUpdate + ActionUpdateAPIKey ActionRead ActionSearch ActionFleetSearch @@ -53,6 +54,7 @@ var actionStrings = []string{ "delete", "index", "update", + "update_api_key", "read", "search", "fleet_search", diff --git a/internal/pkg/bulk/bulk_integration_test.go b/internal/pkg/bulk/bulk_integration_test.go index cc08642c9..05bb6202d 100644 --- a/internal/pkg/bulk/bulk_integration_test.go +++ b/internal/pkg/bulk/bulk_integration_test.go @@ -265,7 +265,7 @@ func TestBulkSearch(t *testing.T) { } if len(res.Hits) != 1 { - t.Fatal(fmt.Sprintf("hit mismatch: %d", len(res.Hits))) + t.Fatalf("hit mismatch: %d", len(res.Hits)) } var dst3 testT diff --git a/internal/pkg/bulk/engine.go b/internal/pkg/bulk/engine.go index 93420c780..68840729c 100644 --- a/internal/pkg/bulk/engine.go +++ b/internal/pkg/bulk/engine.go @@ -55,9 +55,10 @@ type Bulk interface { // APIKey operations APIKeyCreate(ctx context.Context, name, ttl string, roles []byte, meta interface{}) (*APIKey, error) - APIKeyRead(ctx context.Context, id string) (*APIKeyMetadata, error) + APIKeyRead(ctx context.Context, id string, withOwner bool) (*APIKeyMetadata, error) APIKeyAuth(ctx context.Context, key APIKey) (*SecurityInfo, error) APIKeyInvalidate(ctx context.Context, ids ...string) error + APIKeyUpdate(ctx context.Context, id, outputPolicyHash string, roles []byte) error // Accessor used to talk to elastic search direcly bypassing bulk engine Client() *elasticsearch.Client @@ -81,6 +82,7 @@ const ( defaultMaxPending = 32 defaultBlockQueueSz = 32 // Small capacity to allow multiOp to spin fast defaultAPIKeyMaxParallel = 32 + defaultApikeyMaxReqSize = 100 * 1024 * 1024 ) func NewBulker(es esapi.Transport, tracer *apm.Tracer, opts ...BulkOpt) *Bulker { @@ -136,6 +138,8 @@ func blkToQueueType(blk *bulkT) queueType { } else { queueIdx = kQueueRead } + case ActionUpdateAPIKey: + queueIdx = kQueueAPIKeyUpdate default: if forceRefresh { queueIdx = kQueueRefreshBulk @@ -288,6 +292,8 @@ func (b *Bulker) flushQueue(ctx context.Context, w *semaphore.Weighted, queue qu err = b.flushRead(ctx, queue) case kQueueSearch, kQueueFleetSearch: err = b.flushSearch(ctx, queue) + case kQueueAPIKeyUpdate: + err = b.flushUpdateAPIKey(ctx, queue) default: err = b.flushBulk(ctx, queue) } diff --git a/internal/pkg/bulk/opApiKey.go b/internal/pkg/bulk/opApiKey.go index 049c0ce17..099a7d291 100644 --- a/internal/pkg/bulk/opApiKey.go +++ b/internal/pkg/bulk/opApiKey.go @@ -5,15 +5,36 @@ package bulk import ( + "bytes" "context" + "encoding/json" + "math" "github.com/elastic/fleet-server/v7/internal/pkg/apikey" + "github.com/elastic/fleet-server/v7/internal/pkg/es" + "github.com/rs/zerolog/log" +) + +const ( + envelopeSize = 64 // 64B + safeBuffer = 0.9 ) // The ApiKey API's are not yet bulk enabled. Stub the calls in the bulker // and limit parallel access to prevent many requests from overloading // the connection pool in the elastic search client. 
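The stub-and-limit pattern described in the comment above is worth a standalone sketch. The following is illustrative only, assuming golang.org/x/sync/semaphore (the same package behind the bulker's apikeyLimit); all names here are invented:

```go
package apikeylimit

import (
	"context"

	"golang.org/x/sync/semaphore"
)

// keyCalls caps how many API key requests may be in flight at once so the
// stubbed (non-bulk) calls cannot exhaust the Elasticsearch client's
// connection pool.
type keyCalls struct {
	sem *semaphore.Weighted
}

func newKeyCalls(maxParallel int64) *keyCalls {
	return &keyCalls{sem: semaphore.NewWeighted(maxParallel)}
}

// do runs fn only while holding a slot; excess callers wait in line (or
// observe ctx cancellation) instead of piling requests onto the transport.
func (k *keyCalls) do(ctx context.Context, fn func(context.Context) error) error {
	if err := k.sem.Acquire(ctx, 1); err != nil {
		return err // ctx was cancelled while waiting for a slot
	}
	defer k.sem.Release(1)
	return fn(ctx)
}
```

Each stubbed call would then run as k.do(ctx, func(ctx context.Context) error { ... }), so at most maxParallel requests hold connections at any moment.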
+type apiKeyUpdateRequest struct {
+	ID        string          `json:"id,omitempty"`
+	Roles     json.RawMessage `json:"role_descriptors,omitempty"`
+	RolesHash string          `json:"role_hash,omitempty"`
+}
+
+type esAPIKeyBulkUpdateRequest struct {
+	IDs   []string        `json:"ids,omitempty"`
+	Roles json.RawMessage `json:"role_descriptors,omitempty"`
+}
+
 func (b *Bulker) APIKeyAuth(ctx context.Context, key APIKey) (*SecurityInfo, error) {
 	if err := b.apikeyLimit.Acquire(ctx, 1); err != nil {
 		return nil, err
@@ -32,13 +53,13 @@ func (b *Bulker) APIKeyCreate(ctx context.Context, name, ttl string, roles []byt
 	return apikey.Create(ctx, b.Client(), name, ttl, "false", roles, meta)
 }
 
-func (b *Bulker) APIKeyRead(ctx context.Context, id string) (*APIKeyMetadata, error) {
+func (b *Bulker) APIKeyRead(ctx context.Context, id string, withOwner bool) (*APIKeyMetadata, error) {
 	if err := b.apikeyLimit.Acquire(ctx, 1); err != nil {
 		return nil, err
 	}
 	defer b.apikeyLimit.Release(1)
 
-	return apikey.Read(ctx, b.Client(), id)
+	return apikey.Read(ctx, b.Client(), id, withOwner)
 }
 
 func (b *Bulker) APIKeyInvalidate(ctx context.Context, ids ...string) error {
@@ -49,3 +70,178 @@ func (b *Bulker) APIKeyInvalidate(ctx context.Context, ids ...string) error {
 	return apikey.Invalidate(ctx, b.Client(), ids...)
 }
+
+func (b *Bulker) APIKeyUpdate(ctx context.Context, id, outputPolicyHash string, roles []byte) error {
+	req := &apiKeyUpdateRequest{
+		ID:        id,
+		Roles:     roles,
+		RolesHash: outputPolicyHash,
+	}
+
+	body, err := json.Marshal(req)
+	if err != nil {
+		return err
+	}
+
+	_, err = b.waitBulkAction(ctx, ActionUpdateAPIKey, "", id, body)
+	return err
+}
+
+// flushUpdateAPIKey drains an update-API-key queue and groups the requests by the roles they apply.
+// Agent IDs are grouped per role hash so that each change (update) can be sent as a single, more
+// efficient request carrying the list of IDs.
+// Keep in mind that a single queue may hold both a policy change and an ack request with roles for
+// the same ID; the later occurrence wins, overwriting the policy change with the reduced set of
+// permissions. Even if the order were wrong, we would end up with a slightly broader permission
+// set, never a stricter one, so the agent never ends up with fewer permissions than it needs.
+func (b *Bulker) flushUpdateAPIKey(ctx context.Context, queue queueT) error {
+	idsPerRole := make(map[string][]string)
+	roles := make(map[string]json.RawMessage)
+	rolePerID := make(map[string]string)
+	responses := make(map[int]int)
+	idxToID := make(map[int32]string)
+	IDToResponse := make(map[string]int)
+	maxKeySize := 0
+
+	// merge ids
+	for n := queue.head; n != nil; n = n.next {
+		content := n.buf.Bytes()
+		metaMap := make(map[string]interface{})
+		dec := json.NewDecoder(bytes.NewReader(content))
+		if err := dec.Decode(&metaMap); err != nil {
+			log.Error().
+				Err(err).
+				Str("mod", kModBulk).
+				Msg("Failed to unmarshal api key update meta map")
+			return err
+		}
+
+		var req *apiKeyUpdateRequest
+		if err := dec.Decode(&req); err != nil {
+			log.Error().
+				Err(err).
+				Str("mod", kModBulk).
+				Str("request", string(content)).
+				Msg("Failed to unmarshal api key update request")
+			return err
+		}
+
+		if _, tracked := roles[req.RolesHash]; !tracked {
+			roles[req.RolesHash] = req.Roles
+		}
+
+		// last one wins; a policy change and an ack may be in the same queue
+		rolePerID[req.ID] = req.RolesHash
+		idxToID[n.idx] = req.ID
+		if maxKeySize < len(req.ID) {
+			maxKeySize = len(req.ID)
+		}
+	}
+
+	for id, roleHash := range rolePerID {
+		delete(rolePerID, id)
+		idsPerRole[roleHash] = append(idsPerRole[roleHash], id)
+	}
+
+	responseIdx := 0
+	for hash, role := range roles {
+		idsPerBatch := b.getIDsCountPerBatch(len(role), maxKeySize)
+		ids := idsPerRole[hash]
+		if idsPerBatch <= 0 {
+			log.Error().Str("err", "request too large").Msg("No API Key ID could fit request size for bulk update")
+			log.Debug().
+				RawJSON("role", role).
+				Strs("ids", ids).
+				Msg("IDs could not fit into a message")
+
+			// idsPerRole for specific role no longer needed
+			delete(idsPerRole, hash)
+			continue
+		}
+
+		batches := int(math.Ceil(float64(len(ids)) / float64(idsPerBatch)))
+
+		// batch ids into batches of meaningful size
+		for batch := 0; batch < batches; batch++ {
+			// guard against indexing out of range
+			to := (batch + 1) * idsPerBatch
+			if to > len(ids) {
+				to = len(ids)
+			}
+
+			// handle the ids in this batch: put them into a single request
+			// and assign a response index to each id so we can notify the callers
+			idsInBatch := ids[batch*idsPerBatch : to]
+			bulkReq := &esAPIKeyBulkUpdateRequest{
+				IDs:   idsInBatch,
+				Roles: role,
+			}
+			delete(roles, hash)
+
+			payload, err := json.Marshal(bulkReq)
+			if err != nil {
+				return err
+			}
+
+			req := &es.UpdateApiKeyBulkRequest{
+				Body: bytes.NewReader(payload),
+			}
+
+			res, err := req.Do(ctx, b.es)
+			if err != nil {
+				log.Error().Err(err).Msg("Error sending bulk API Key update request to Elasticsearch")
+				return err
+			}
+			if res.Body != nil {
+				defer res.Body.Close()
+			}
+			if res.IsError() {
+				log.Error().Str("err", res.String()).Msg("Error in bulk API Key update result to Elasticsearch")
+				return parseError(res)
+			}
+
+			log.Debug().Strs("IDs", bulkReq.IDs).RawJSON("role", role).Msg("API Keys updated.")
+
+			responses[responseIdx] = res.StatusCode
+			for _, id := range idsInBatch {
+				IDToResponse[id] = responseIdx
+			}
+			responseIdx++
+		}
+
+		// idsPerRole for specific role no longer needed
+		delete(idsPerRole, hash)
+	}
+
+	// WARNING: Once we start pushing items to
+	// the queue, the node pointers are invalid.
+	// Do NOT return a non-nil error from this point on, or failQueue up the
+	// stack will operate on those invalidated nodes.
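+	// Note: the non-blocking send below assumes each waiter's response
+	// channel can accept one item immediately; the default branch panics to
+	// surface a broken invariant rather than deadlock the flusher.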
+	for n := queue.head; n != nil; n = n.next {
+		// 'n' is invalid immediately on channel send
+		responseIdx := IDToResponse[idxToID[n.idx]]
+		res := responses[responseIdx]
+		select {
+		case n.ch <- respT{
+			err: nil,
+			idx: n.idx,
+			data: &BulkIndexerResponseItem{
+				DocumentID: "",
+				Status:     res,
+			},
+		}:
+		default:
+			panic("Unexpected blocked response channel on flushUpdateAPIKey")
+		}
+	}
+	return nil
+}
+
+func (b *Bulker) getIDsCountPerBatch(roleSize, maxKeySize int) int {
+	spareSpace := b.opts.apikeyMaxReqSize - roleSize - envelopeSize
+	if spareSpace > maxKeySize {
+		return int(float64(spareSpace) * safeBuffer / float64(maxKeySize))
+	}
+	return 0
+}
diff --git a/internal/pkg/bulk/opBulk.go b/internal/pkg/bulk/opBulk.go
index 50b2c47e0..7ecb7c8a6 100644
--- a/internal/pkg/bulk/opBulk.go
+++ b/internal/pkg/bulk/opBulk.go
@@ -7,12 +7,15 @@ package bulk
 import (
 	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"time"
 
 	"github.com/elastic/go-elasticsearch/v7/esapi"
 	"github.com/mailru/easyjson"
 	"github.com/rs/zerolog/log"
+
+	"github.com/elastic/fleet-server/v7/internal/pkg/es"
 )
 
 func (b *Bulker) Create(ctx context.Context, index, id string, body []byte, opts ...Opt) (string, error) {
@@ -73,6 +76,9 @@ func (b *Bulker) waitBulkAction(ctx context.Context, action actionT, index, id s
 	if !ok {
 		return nil, fmt.Errorf("unable to cast to *BulkIndexerResponseItem, detected type %T", resp.data)
 	}
+	if err := es.TranslateError(r.Status, r.Error); err != nil {
+		return nil, err
+	}
 	return r, nil
 }
 
@@ -187,7 +193,6 @@ func (b *Bulker) flushBulk(ctx context.Context, queue queueT) error {
 	}
 
 	res, err := req.Do(ctx, b.es)
-
 	if err != nil {
 		log.Error().Err(err).Str("mod", kModBulk).Msg("Fail BulkRequest req.Do")
 		return err
 	}
@@ -217,12 +222,18 @@ func (b *Bulker) flushBulk(ctx context.Context, queue queueT) error {
 	var blk bulkIndexerResponse
 	blk.Items = make([]bulkStubItem, 0, queueCnt)
 
+	// TODO: We're losing information about the errors; we should find a way
+	// to return the full error ES returns
 	if err = easyjson.Unmarshal(buf.Bytes(), &blk); err != nil {
-		log.Error().
-			Err(err).
+		log.Err(err).
 			Str("mod", kModBulk).
-			Msg("Unmarshal error")
-		return err
+			Msg("flushBulk failed, could not unmarshal ES response")
+		return fmt.Errorf("flushBulk failed, could not unmarshal ES response: %w", err)
+	}
+	if blk.HasErrors {
+		// We lack information to properly correlate this error with what has failed.
+		// Thus, for now it'd be more noise than information outside an investigation.
+		log.Debug().Err(errors.New(buf.String())).Msg("Bulk call: ES returned an error")
 	}
 
 	log.Trace().
diff --git a/internal/pkg/bulk/opt.go b/internal/pkg/bulk/opt.go
index e0701823e..6eeb2fe21 100644
--- a/internal/pkg/bulk/opt.go
+++ b/internal/pkg/bulk/opt.go
@@ -62,6 +62,7 @@ type bulkOptT struct {
 	maxPending        int
 	blockQueueSz      int
 	apikeyMaxParallel int
+	apikeyMaxReqSize  int
 }
 
 type BulkOpt func(*bulkOptT)
@@ -108,6 +109,15 @@
 	}
 }
+
+// WithAPIKeyMaxRequestSize sets the maximum size of the request body.
Default 100MB +func WithAPIKeyMaxRequestSize(maxBytes int) BulkOpt { + return func(opt *bulkOptT) { + if opt.apikeyMaxReqSize > 0 { + opt.apikeyMaxReqSize = maxBytes + } + } +} + func parseBulkOpts(opts ...BulkOpt) bulkOptT { bopt := bulkOptT{ flushInterval: defaultFlushInterval, @@ -116,6 +126,7 @@ func parseBulkOpts(opts ...BulkOpt) bulkOptT { maxPending: defaultMaxPending, apikeyMaxParallel: defaultAPIKeyMaxParallel, blockQueueSz: defaultBlockQueueSz, + apikeyMaxReqSize: defaultApikeyMaxReqSize, } for _, f := range opts { @@ -132,6 +143,7 @@ func (o *bulkOptT) MarshalZerologObject(e *zerolog.Event) { e.Int("maxPending", o.maxPending) e.Int("blockQueueSz", o.blockQueueSz) e.Int("apikeyMaxParallel", o.apikeyMaxParallel) + e.Int("apikeyMaxReqSize", o.apikeyMaxReqSize) } // BulkOptsFromCfg transforms config to a slize of BulkOpt @@ -152,5 +164,6 @@ func BulkOptsFromCfg(cfg *config.Config) []BulkOpt { WithFlushThresholdSize(bulkCfg.FlushThresholdSize), WithMaxPending(bulkCfg.FlushMaxPending), WithAPIKeyMaxParallel(maxKeyParallel), + WithAPIKeyMaxRequestSize(cfg.Output.Elasticsearch.MaxContentLength), } } diff --git a/internal/pkg/bulk/queue.go b/internal/pkg/bulk/queue.go index dc7bde5d1..f2060212a 100644 --- a/internal/pkg/bulk/queue.go +++ b/internal/pkg/bulk/queue.go @@ -20,6 +20,7 @@ const ( kQueueFleetSearch kQueueRefreshBulk kQueueRefreshRead + kQueueAPIKeyUpdate kNumQueues ) @@ -37,6 +38,8 @@ func (q queueT) Type() string { return "refreshBulk" case kQueueRefreshRead: return "refreshRead" + case kQueueAPIKeyUpdate: + return "apiKeyUpdate" } panic("unknown") } diff --git a/internal/pkg/cache/cache.go b/internal/pkg/cache/cache.go index 67b2075e6..909988702 100644 --- a/internal/pkg/cache/cache.go +++ b/internal/pkg/cache/cache.go @@ -11,15 +11,15 @@ import ( "sync" "time" - "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/elastic/fleet-server/v7/internal/pkg/apikey" + "github.com/elastic/fleet-server/v7/internal/pkg/config" "github.com/elastic/fleet-server/v7/internal/pkg/model" ) type Cache interface { - Reconfigure(Config) error + Reconfigure(config.Cache) error SetAction(model.Action) GetAction(id string) (model.Action, bool) @@ -39,37 +39,17 @@ type SecurityInfo = apikey.SecurityInfo type CacheT struct { cache Cacher - cfg Config + cfg config.Cache mut sync.RWMutex } -type Config struct { - NumCounters int64 // number of keys to track frequency of - MaxCost int64 // maximum cost of cache in 'cost' units - ActionTTL time.Duration - APIKeyTTL time.Duration - EnrollKeyTTL time.Duration - ArtifactTTL time.Duration - APIKeyJitter time.Duration -} - -func (c *Config) MarshalZerologObject(e *zerolog.Event) { - e.Int64("numCounters", c.NumCounters) - e.Int64("maxCost", c.MaxCost) - e.Dur("actionTTL", c.ActionTTL) - e.Dur("enrollTTL", c.EnrollKeyTTL) - e.Dur("artifactTTL", c.ArtifactTTL) - e.Dur("apiKeyTTL", c.APIKeyTTL) - e.Dur("apiKeyJitter", c.APIKeyJitter) -} - type actionCache struct { actionID string actionType string } // New creates a new cache. 
-func New(cfg Config) (*CacheT, error) { +func New(cfg config.Cache) (*CacheT, error) { cache, err := newCache(cfg) if err != nil { return nil, err @@ -84,7 +64,7 @@ func New(cfg Config) (*CacheT, error) { } // Reconfigure will drop cache -func (c *CacheT) Reconfigure(cfg Config) error { +func (c *CacheT) Reconfigure(cfg config.Cache) error { c.mut.Lock() defer c.mut.Unlock() diff --git a/internal/pkg/cache/impl_integration.go b/internal/pkg/cache/impl_integration.go index 013b4a6f5..6c418745b 100644 --- a/internal/pkg/cache/impl_integration.go +++ b/internal/pkg/cache/impl_integration.go @@ -9,9 +9,11 @@ package cache import ( "time" + + "github.com/elastic/fleet-server/v7/internal/pkg/config" ) -func newCache(_ Config) (Cacher, error) { +func newCache(_ config.Cache) (Cacher, error) { return &NoCache{}, nil } diff --git a/internal/pkg/cache/impl_ristretto.go b/internal/pkg/cache/impl_ristretto.go index 582ba23e7..b8a38a018 100644 --- a/internal/pkg/cache/impl_ristretto.go +++ b/internal/pkg/cache/impl_ristretto.go @@ -9,9 +9,11 @@ package cache import ( "github.com/dgraph-io/ristretto" + + "github.com/elastic/fleet-server/v7/internal/pkg/config" ) -func newCache(cfg Config) (Cacher, error) { +func newCache(cfg config.Cache) (Cacher, error) { rcfg := &ristretto.Config{ NumCounters: cfg.NumCounters, MaxCost: cfg.MaxCost, diff --git a/internal/pkg/checkin/bulk.go b/internal/pkg/checkin/bulk.go index 8ece0547f..8eb67cba7 100644 --- a/internal/pkg/checkin/bulk.go +++ b/internal/pkg/checkin/bulk.go @@ -100,7 +100,7 @@ func (bc *Bulk) timestamp() string { // CheckIn will add the agent (identified by id) to the pending set. // The pending agents are sent to elasticsearch as a bulk update at each flush interval. // WARNING: Bulk will take ownership of fields, so do not use after passing in. -func (bc *Bulk) CheckIn(id string, status, message string, meta []byte, components []byte, seqno sqn.SeqNo, newVer string) error { +func (bc *Bulk) CheckIn(id string, status string, message string, meta []byte, components []byte, seqno sqn.SeqNo, newVer string) error { // Separate out the extra data to minimize // the memory footprint of the 90% case of just // updating the timestamp. 
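For orientation while reading the check-in changes above and the tests below, here is a rough sketch of the pending-set shape the CheckIn comments describe. All types are invented; the real Bulk also tracks sequence numbers and agent versions, and flushes through the bulker:

```go
package checkinsketch

import (
	"context"
	"sync"
	"time"
)

// pendingT is the minimal per-agent state; extra is nil in the common
// case of a timestamp-only update, keeping the map cheap.
type pendingT struct {
	ts      string
	status  string
	message string
	extra   []byte
}

// buffer collects check-ins and lets a single flusher drain them.
type buffer struct {
	mu      sync.Mutex
	pending map[string]pendingT // agent ID -> latest check-in wins
}

func newBuffer() *buffer {
	return &buffer{pending: make(map[string]pendingT)}
}

func (b *buffer) CheckIn(id string, p pendingT) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.pending[id] = p
}

// run swaps the pending map out under the lock and hands the batch to
// flush (e.g. one bulk update against Elasticsearch) every interval.
func (b *buffer) run(ctx context.Context, interval time.Duration, flush func(map[string]pendingT)) {
	tick := time.NewTicker(interval)
	defer tick.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			b.mu.Lock()
			batch := b.pending
			b.pending = make(map[string]pendingT, len(batch))
			b.mu.Unlock()
			if len(batch) > 0 {
				flush(batch)
			}
		}
	}
}
```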
diff --git a/internal/pkg/checkin/bulk_test.go b/internal/pkg/checkin/bulk_test.go index af18eefd8..8ca60c137 100644 --- a/internal/pkg/checkin/bulk_test.go +++ b/internal/pkg/checkin/bulk_test.go @@ -82,6 +82,7 @@ type bulkcase struct { desc string id string status string + message string meta []byte components []byte seqno sqn.SeqNo @@ -97,6 +98,7 @@ func TestBulkSimple(t *testing.T) { "Simple case", "simpleId", "online", + "message", nil, nil, nil, @@ -106,6 +108,7 @@ func TestBulkSimple(t *testing.T) { "Singled field case", "singleFieldId", "online", + "message", []byte(`{"hey":"now"}`), []byte(`[{"id":"winlog-default"}]`), nil, @@ -115,6 +118,7 @@ func TestBulkSimple(t *testing.T) { "Multi field case", "multiFieldId", "online", + "message", []byte(`{"hey":"now","brown":"cow"}`), []byte(`[{"id":"winlog-default","type":"winlog"}]`), nil, @@ -124,6 +128,7 @@ func TestBulkSimple(t *testing.T) { "Multi field nested case", "multiFieldNestedId", "online", + "message", []byte(`{"hey":"now","wee":{"little":"doggie"}}`), []byte(`[{"id":"winlog-default","type":"winlog"}]`), nil, @@ -133,6 +138,7 @@ func TestBulkSimple(t *testing.T) { "Simple case with seqNo", "simpleseqno", "online", + "message", nil, nil, sqn.SeqNo{1, 2, 3, 4}, @@ -142,6 +148,7 @@ func TestBulkSimple(t *testing.T) { "Field case with seqNo", "simpleseqno", "online", + "message", []byte(`{"uncle":"fester"}`), []byte(`[{"id":"log-default"}]`), sqn.SeqNo{5, 6, 7, 8}, @@ -151,6 +158,7 @@ func TestBulkSimple(t *testing.T) { "Unusual status", "singleFieldId", "unusual", + "message", nil, nil, nil, @@ -160,6 +168,7 @@ func TestBulkSimple(t *testing.T) { "Empty status", "singleFieldId", "", + "message", nil, nil, nil, @@ -174,7 +183,7 @@ func TestBulkSimple(t *testing.T) { mockBulk.On("MUpdate", mock.Anything, mock.MatchedBy(matchOp(t, c, start)), mock.Anything).Return([]bulk.BulkIndexerResponseItem{}, nil).Once() bc := NewBulk(mockBulk) - if err := bc.CheckIn(c.id, c.status, "", c.meta, c.components, c.seqno, c.ver); err != nil { + if err := bc.CheckIn(c.id, c.status, c.message, c.meta, c.components, c.seqno, c.ver); err != nil { t.Fatal(err) } diff --git a/internal/pkg/config/cache.go b/internal/pkg/config/cache.go index f6e3f84e7..738a74c85 100644 --- a/internal/pkg/config/cache.go +++ b/internal/pkg/config/cache.go @@ -6,6 +6,8 @@ package config import ( "time" + + "github.com/rs/zerolog" ) const ( @@ -30,14 +32,54 @@ func (c *Cache) InitDefaults() { c.LoadLimits(loadLimits(0)) } +// LoadLimits loads envLimits for any attribute that is not defined in Cache func (c *Cache) LoadLimits(limits *envLimits) { l := limits.Cache - c.NumCounters = l.NumCounters - c.MaxCost = l.MaxCost - c.ActionTTL = defaultActionTTL - c.EnrollKeyTTL = defaultEnrollKeyTTL - c.ArtifactTTL = defaultArtifactTTL - c.APIKeyTTL = defaultAPIKeyTTL - c.APIKeyJitter = defaultAPIKeyJitter + if c.NumCounters == 0 { + c.NumCounters = l.NumCounters + } + if c.MaxCost == 0 { + c.MaxCost = l.MaxCost + } + if c.ActionTTL == 0 { + c.ActionTTL = defaultActionTTL + } + if c.EnrollKeyTTL == 0 { + c.EnrollKeyTTL = defaultEnrollKeyTTL + } + if c.ArtifactTTL == 0 { + c.ArtifactTTL = defaultArtifactTTL + } + if c.APIKeyTTL == 0 { + c.APIKeyTTL = defaultAPIKeyTTL + } + if c.APIKeyJitter == 0 { + c.APIKeyJitter = defaultAPIKeyJitter + } +} + +// CopyCache returns a copy of the config's Cache settings +func CopyCache(cfg *Config) Cache { + ccfg := cfg.Inputs[0].Cache + return Cache{ + NumCounters: ccfg.NumCounters, + MaxCost: ccfg.MaxCost, + ActionTTL: ccfg.ActionTTL, + 
EnrollKeyTTL: ccfg.EnrollKeyTTL, + ArtifactTTL: ccfg.ArtifactTTL, + APIKeyTTL: ccfg.APIKeyTTL, + APIKeyJitter: ccfg.APIKeyJitter, + } +} + +// MarshalZerologObject turns the cache settings into a zerolog event +func (c *Cache) MarshalZerologObject(e *zerolog.Event) { + e.Int64("numCounters", c.NumCounters) + e.Int64("maxCost", c.MaxCost) + e.Dur("actionTTL", c.ActionTTL) + e.Dur("enrollTTL", c.EnrollKeyTTL) + e.Dur("artifactTTL", c.ArtifactTTL) + e.Dur("apiKeyTTL", c.APIKeyTTL) + e.Dur("apiKeyJitter", c.APIKeyJitter) } diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index ce175a853..827ade339 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -23,6 +23,8 @@ var DefaultOptions = []ucfg.Option{ ucfg.FieldReplaceValues("inputs"), } +const kRedacted = "[redacted]" + // Config is the global configuration. type Config struct { Fleet Fleet `config:"fleet"` @@ -106,6 +108,66 @@ func (c *Config) Merge(other *Config) (*Config, error) { return cfg, nil } +func redactOutput(cfg *Config) Output { + redacted := cfg.Output + + if redacted.Elasticsearch.APIKey != "" { + redacted.Elasticsearch.APIKey = kRedacted + } + + if redacted.Elasticsearch.ServiceToken != "" { + redacted.Elasticsearch.ServiceToken = kRedacted + } + + if redacted.Elasticsearch.TLS != nil { + newTLS := *redacted.Elasticsearch.TLS + + if newTLS.Certificate.Key != "" { + newTLS.Certificate.Key = kRedacted + } + if newTLS.Certificate.Passphrase != "" { + newTLS.Certificate.Passphrase = kRedacted + } + + redacted.Elasticsearch.TLS = &newTLS + } + + return redacted +} + +func redactServer(cfg *Config) Server { + redacted := cfg.Inputs[0].Server + + if redacted.TLS != nil { + newTLS := *redacted.TLS + + if newTLS.Certificate.Key != "" { + newTLS.Certificate.Key = kRedacted + } + if newTLS.Certificate.Passphrase != "" { + newTLS.Certificate.Passphrase = kRedacted + } + + redacted.TLS = &newTLS + } + + return redacted +} + +// Redact returns a copy of the config with all sensitive attributes redacted. 
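+// For example (values invented for illustration): api_key, service_token,
+// and TLS key/passphrase material become "[redacted]", while hosts, paths,
+// and timeouts are preserved.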
+func (c *Config) Redact() *Config { + redacted := &Config{ + Fleet: c.Fleet, + Output: c.Output, + Inputs: make([]Input, 1), + Logging: c.Logging, + HTTP: c.HTTP, + } + redacted.Inputs[0].Server = redactServer(c) + redacted.Output = redactOutput(c) + return redacted +} + func checkDeprecatedOptions(deprecatedOpts map[string]string, c *ucfg.Config) { for opt, message := range deprecatedOpts { if c.HasField(opt) { diff --git a/internal/pkg/config/config_test.go b/internal/pkg/config/config_test.go index 4e3d7ea61..eb18fd35a 100644 --- a/internal/pkg/config/config_test.go +++ b/internal/pkg/config/config_test.go @@ -181,6 +181,38 @@ func TestConfig(t *testing.T) { } } +func TestLoadServerLimits(t *testing.T) { + t.Run("empty loads limits", func(t *testing.T) { + c := &Config{Inputs: []Input{{}}} + err := c.LoadServerLimits() + assert.NoError(t, err) + assert.Equal(t, int64(defaultCheckinMaxBody), c.Inputs[0].Server.Limits.CheckinLimit.MaxBody) + assert.Equal(t, defaultActionTTL, c.Inputs[0].Cache.ActionTTL) + }) + t.Run("existing values are not overridden", func(t *testing.T) { + c := &Config{ + Inputs: []Input{{ + Server: Server{ + Limits: ServerLimits{ + CheckinLimit: Limit{ + MaxBody: 5 * defaultCheckinMaxBody, + }, + }, + }, + Cache: Cache{ + ActionTTL: time.Minute, + }, + }}, + } + err := c.LoadServerLimits() + assert.NoError(t, err) + assert.Equal(t, int64(5*defaultCheckinMaxBody), c.Inputs[0].Server.Limits.CheckinLimit.MaxBody) + assert.Equal(t, defaultCheckinBurst, c.Inputs[0].Server.Limits.CheckinLimit.Burst) + assert.Equal(t, time.Minute, c.Inputs[0].Cache.ActionTTL) + }) + +} + // Stub out the defaults so that the above is easier to maintain func defaultCache() Cache { @@ -237,12 +269,13 @@ func defaultFleet() Fleet { func defaultElastic() Elasticsearch { return Elasticsearch{ - Protocol: "http", - ServiceToken: "test-token", - Hosts: []string{"localhost:9200"}, - MaxRetries: 3, - MaxConnPerHost: 128, - Timeout: 90 * time.Second, + Protocol: "http", + ServiceToken: "test-token", + Hosts: []string{"localhost:9200"}, + MaxRetries: 3, + MaxConnPerHost: 128, + MaxContentLength: 104857600, + Timeout: 90 * time.Second, } } diff --git a/internal/pkg/config/defaults/gt10000_limits.yml b/internal/pkg/config/defaults/gt10000_limits.yml index 593ef713d..f9513f6e0 100644 --- a/internal/pkg/config/defaults/gt10000_limits.yml +++ b/internal/pkg/config/defaults/gt10000_limits.yml @@ -1,6 +1,6 @@ num_agents: min: 10000 - max: 12500 + max: 12499 recommended_min_ram: 8192 cache_limits: num_counters: 160000 @@ -23,4 +23,4 @@ server_limits: ack_limit: interval: 500us burst: 4000 - max: 8000 \ No newline at end of file + max: 8000 diff --git a/internal/pkg/config/defaults/gt12500_limits.yml b/internal/pkg/config/defaults/gt12500_limits.yml index 2b97f89b5..2032b7c0e 100644 --- a/internal/pkg/config/defaults/gt12500_limits.yml +++ b/internal/pkg/config/defaults/gt12500_limits.yml @@ -1,6 +1,6 @@ num_agents: min: 12500 - max: 30000 + max: 29999 recommended_min_ram: 16384 cache_limits: num_counters: 160000 diff --git a/internal/pkg/config/defaults/gt5000_limits.yml b/internal/pkg/config/defaults/gt5000_limits.yml index f6ca69b63..fc6092d5b 100644 --- a/internal/pkg/config/defaults/gt5000_limits.yml +++ b/internal/pkg/config/defaults/gt5000_limits.yml @@ -1,6 +1,6 @@ num_agents: min: 5000 - max: 7500 + max: 7499 recommended_min_ram: 2048 cache_limits: num_counters: 40000 @@ -23,4 +23,4 @@ server_limits: ack_limit: interval: 2ms burst: 1000 - max: 2000 \ No newline at end of file + max: 2000 diff --git 
a/internal/pkg/config/defaults/gt50_limits.yml b/internal/pkg/config/defaults/gt50_limits.yml index 748059659..e15e15a06 100644 --- a/internal/pkg/config/defaults/gt50_limits.yml +++ b/internal/pkg/config/defaults/gt50_limits.yml @@ -1,6 +1,6 @@ num_agents: min: 50 - max: 5000 + max: 4999 recommended_min_ram: 1024 cache_limits: num_counters: 20000 @@ -23,4 +23,4 @@ server_limits: ack_limit: interval: 4ms burst: 500 - max: 1000 \ No newline at end of file + max: 1000 diff --git a/internal/pkg/config/defaults/gt7500_limits.yml b/internal/pkg/config/defaults/gt7500_limits.yml index 829ca3d9a..88140adae 100644 --- a/internal/pkg/config/defaults/gt7500_limits.yml +++ b/internal/pkg/config/defaults/gt7500_limits.yml @@ -1,6 +1,6 @@ num_agents: min: 7500 - max: 10000 + max: 9999 recommended_min_ram: 4096 cache_limits: num_counters: 80000 @@ -23,4 +23,4 @@ server_limits: ack_limit: interval: 1ms burst: 2000 - max: 4000 \ No newline at end of file + max: 4000 diff --git a/internal/pkg/config/defaults/lte50_limits.yml b/internal/pkg/config/defaults/lte50_limits.yml index 74ebd135a..18381479a 100644 --- a/internal/pkg/config/defaults/lte50_limits.yml +++ b/internal/pkg/config/defaults/lte50_limits.yml @@ -1,5 +1,5 @@ num_agents: - max: 50 + max: 49 recommended_min_ram: 0 cache_limits: num_counters: 2000 @@ -22,4 +22,4 @@ server_limits: ack_limit: interval: 10ms burst: 20 - max: 20 \ No newline at end of file + max: 20 diff --git a/internal/pkg/config/env_defaults.go b/internal/pkg/config/env_defaults.go index a693d879d..0b4410f1a 100644 --- a/internal/pkg/config/env_defaults.go +++ b/internal/pkg/config/env_defaults.go @@ -154,7 +154,7 @@ func init() { // internal/pkg/config/defaults/gt7500_limits.yml // internal/pkg/config/defaults/lte50_limits.yml // internal/pkg/config/defaults/max_limits.yml - unpacked := packer.MustUnpack("eJzsl1+PqkYYxu/7Mfa66QKKjU3OxawIC5vBSIABbhoGFMHhT44gf5p+92ZArausQtrkJE0vTeSZd573N88788dLmOSb74lLXrN98OqlyTYMXv3N1i1IfngNcpZhGOZ3EsZhfviljsnLby84MXOLExMH8YzXpB/y4i130XQmC8sSCqBchCDASJyuQlDByCghyGoHqUeMWIJjjTjiPHcRv/ViMV+FYAr1fb2I1Z3N7YjFVRlG5Kxb+Ijd2tycxYlGvATSNTi6BgRpIEvsbiPOIzxRmVUIaijsJ1A3aiiAD48zaz826xu9EnNVZk/I1pd2NZ5oDJaMmSyYuVde6XF84SCVcdG88Jo0gI1M90W/j1zJjFyOFGfdVZAG8gIELuIZx1I4G5UzWTBK+G421Ad58Rb6ltL4Qhqo57oXN/6s6dpi7UsktpHKXLRDuiYIZIkUvkT3Ix5WIWCh8Ma29S5AYCdm7U3WM1lYn2oE3T6ENFhdfDILL54fsDg/uIjNaC2n2rLOV2eHhTSA+lt+r7u8rbk+ebGzuXyLJZK71nrWp6fqoPQteKpJYb2k7RNzZoRyY1vBTBYCWvu3l5+fwsjxI2CsaQFXMDYdOA9hrNQITlWQRTZSU8c8GfYO2w12a6lHv2t20/0fnOHIbWtHYW2bDSPAq82ShREoF4lKTSEe+aznSfS3ylsTLfXiOeNLFaEG49a0i16BYzPyJXLE7QGA9dlAemBsLs8wuTT2HwEDdaOEwrJtrpdomROLkf8EmpFNptoEJ8oRS9UFHhn0HaJlie91q7uao5MXSP3+Lx3KaVt78G0AkPyIcDSuOKF+eGwbVLEZYY7NHcTvHaRtMSKFNVF2WG/3xqyGs8h8EYzsJRhB1lA/HEs5ea81rVfvb0cskWhjzhn3XTn679rB0a8C4atg1MHIYFT6A+amp2ozOhTrL1i5YVD+wQwO2z/Vg2AYf8Ppu0pCtnUmUYjNzXOMzMKRzL3FsTRJahsxXVXR+sPmRNrZz7RIPItNSpfZdqAl4XkKjhzJRvk4Bb2R5Jk9zhu3J5/6UrWdtBTGRc7OnmhPCBmkeyHRQXyNuerwjGYosfdJpd+m9VDqtME1Pifu1xETuEu4v0/f6Xr4kLt1udLtwdy1V5w+7nTArARjoo6fvuxj7uRy5PStehKkvk+m1puR7Ik9fZXv0/QHXwMhume5S7hPeswQ/ki+GRJ5naXGE9TAuHh78OIYe8GDwvPBOuqC1xMXMDL642IwXsuyV/c+hkYN1N5oW99EUeu1GLns46PVv++7Yfp0kMZuNTTT4IAMg7Xa2NPhT9ruRv8fedK2r67/n7SnJ+2fP/0VAAD//1871n0=") + unpacked := 
packer.MustUnpack("eJzsl12Psjgbx8+fjzHHT3YAxY2bzEEVYXDSGglS4GRDYUSwvOQW5GWz331TfImjOED2TjbZ7OEk48XV//Xrr+0fL0Gcff6IHfqa7v1XN4m3gf/qfW6dnGaHVz/jOY7jfqdBFGSHX6qIvvz2QmIjMwU5trHIuXXyoc5nmYPHE1VaFFACxTwAPsHyeBWAEoZrcbVOKxujI8E8JZFGbXmaOVjcupGcrQIwhvq+mkdoZwk7agplSjC91M09zG8tYcqTWKNuDNk3BPYNCBJfVfjdpzwNyQhxqwBUUNqPoL6poAQ+XMGovMio7uoVRChTa0S3nrKryEjjiLKZqJKRucVNPUHMbYw4B09zt058WKtsXez3oaMYoSPQ/FJ35Se+Oge+g0XONpeChYuJKm0K+G7ULAd1Pgs8c1l7UuKjS9/z23w2BVyzb8uVp9DIwoi71g7YN4GvKjT3FLYe+bAKAA+lGd/0Owe+FRuVO1pPVGl97hGc1iEl/uqak5G70fRA5OnBwXzKejn3lp5ytXdESnyoz7LHuov7nqtzFjtLyLZEoZljridt9ZAOCs+E556WvBs3c+IujDBuLNOfqJLf9P7y/04YBXEAjBVr4AbGaqXvu2AsUQjHCKShhVFiG+fA3mGzwNO30NE7Dbs+/T+4wJFZ5o7B2gwbhkBE9YKHISjmMWKhUJd+recq7G8kmiMtcaMp5yklZX2SJrRrvZxERugp9EiaDQCrS4Bsw1hClhJ6HezfAgbqmwJKi2a4bqyldiSHXgc0Q4eszmeUxMsjUcorPCpo20SLgjzWLR96Ds9ZYPTjJ23KcdO7//bWDaQ4QI6bG05YHi630vcfbmSEROAzG4t7G2tbgmlujpY7ojdr41b9WeSeiJG/ihGkNcvDNpfn7LW6yep9diQKDT+NKee8L4/eu3aw9RshPBOjDgaKcdkumLuZonqwFKsnrNwxqP7DDPZbP6sHQdKLv/703ZiQkSfO4yW1hGlGsJHbirE3BZ6ZpLIwd+oqXH9Ygswm+5UWReSJwegymgk0JHRbcOCRvCm+t6A7kDyjJfnN/c7noQTKZpLmknOwvbNGWgchvepeSbSxWBGhPHTRDBX+0VT6va37Uqf177GX9X4dcAq7/Ffr7cUu6yEJiMjvbb3xE+uVUFqPke4KP/06GA61ntySv9py+oLBpy/Ej5yc7XE7W24gf2Ubfxfevti0yV0OHf7bG0jZYtHq0c7rXtdAmn320d65ZIfmwDDFffPqGHrJg1L34Troktc2snDTrozeelsUT1C4rzvoUG3V2/pORz3Ral/3w4HaiVXklH2dBk/P3G/BghWqrXH/Z+3pVv8vedY2L6//nrXNi+ft5c///RUAAP//y2rXXQ==") for f, v := range unpacked { cfg, err := yaml.NewConfig(v, DefaultOptions...) diff --git a/internal/pkg/config/env_defaults_test.go b/internal/pkg/config/env_defaults_test.go index f2606716f..40309ffc0 100644 --- a/internal/pkg/config/env_defaults_test.go +++ b/internal/pkg/config/env_defaults_test.go @@ -20,11 +20,11 @@ func TestLoadLimits(t *testing.T) { ConfiguredAgentLimit int ExpectedAgentLimit int }{ - {"few agents", 5, 50}, - {"512", 512, 5000}, - {"precise", 7500, 7500}, - {"10k", 10050, 12500}, - {"close to max", 13000, 30000}, + {"few agents", 5, 49}, + {"512", 512, 4999}, + {"precise", 7499, 7499}, + {"10k", 10050, 12499}, + {"close to max", 13000, 29999}, {"above max", 30001, int(getMaxInt())}, } diff --git a/internal/pkg/config/limits.go b/internal/pkg/config/limits.go index 1ee1ebe46..0a9b4cb18 100644 --- a/internal/pkg/config/limits.go +++ b/internal/pkg/config/limits.go @@ -36,38 +36,41 @@ func (c *ServerLimits) InitDefaults() { func (c *ServerLimits) LoadLimits(limits *envLimits) { l := limits.Server - c.MaxHeaderByteSize = 8192 // 8k - c.MaxConnections = l.MaxConnections - c.PolicyThrottle = l.PolicyThrottle + if c.MaxHeaderByteSize == 0 { + c.MaxHeaderByteSize = 8192 // 8k + } + if c.MaxConnections == 0 { + c.MaxConnections = l.MaxConnections + } + if c.PolicyThrottle == 0 { + c.PolicyThrottle = l.PolicyThrottle + } + + c.CheckinLimit = mergeEnvLimit(c.CheckinLimit, l.CheckinLimit) + c.ArtifactLimit = mergeEnvLimit(c.ArtifactLimit, l.ArtifactLimit) + c.EnrollLimit = mergeEnvLimit(c.EnrollLimit, l.EnrollLimit) + c.AckLimit = mergeEnvLimit(c.AckLimit, l.AckLimit) + c.StatusLimit = mergeEnvLimit(c.StatusLimit, l.StatusLimit) +} - c.CheckinLimit = Limit{ - Interval: l.CheckinLimit.Interval, - Burst: l.CheckinLimit.Burst, - Max: l.CheckinLimit.Max, - MaxBody: l.CheckinLimit.MaxBody, +func mergeEnvLimit(L Limit, l limit) Limit { + result := Limit{ + Interval: L.Interval, + Burst: L.Burst, + Max: L.Max, + MaxBody: L.MaxBody, } - c.ArtifactLimit = Limit{ - Interval: l.ArtifactLimit.Interval, - Burst: l.ArtifactLimit.Burst, - Max: l.ArtifactLimit.Max, - MaxBody: l.ArtifactLimit.MaxBody, + if result.Interval 
== 0 { + result.Interval = l.Interval } - c.EnrollLimit = Limit{ - Interval: l.EnrollLimit.Interval, - Burst: l.EnrollLimit.Burst, - Max: l.EnrollLimit.Max, - MaxBody: l.EnrollLimit.MaxBody, + if result.Burst == 0 { + result.Burst = l.Burst } - c.AckLimit = Limit{ - Interval: l.AckLimit.Interval, - Burst: l.AckLimit.Burst, - Max: l.AckLimit.Max, - MaxBody: l.AckLimit.MaxBody, + if result.Max == 0 { + result.Max = l.Max } - c.StatusLimit = Limit{ - Interval: l.StatusLimit.Interval, - Burst: l.StatusLimit.Burst, - Max: l.StatusLimit.Max, - MaxBody: l.StatusLimit.MaxBody, + if result.MaxBody == 0 { + result.MaxBody = l.MaxBody } + return result } diff --git a/internal/pkg/config/output.go b/internal/pkg/config/output.go index 5804d5858..8e8751d45 100644 --- a/internal/pkg/config/output.go +++ b/internal/pkg/config/output.go @@ -28,19 +28,20 @@ var hasScheme = regexp.MustCompile(`^([a-z][a-z0-9+\-.]*)://`) // Elasticsearch is the configuration for elasticsearch. type Elasticsearch struct { - Protocol string `config:"protocol"` - Hosts []string `config:"hosts"` - Path string `config:"path"` - Headers map[string]string `config:"headers"` - APIKey string `config:"api_key"` - ServiceToken string `config:"service_token"` - ProxyURL string `config:"proxy_url"` - ProxyDisable bool `config:"proxy_disable"` - ProxyHeaders map[string]string `config:"proxy_headers"` - TLS *tlscommon.Config `config:"ssl"` - MaxRetries int `config:"max_retries"` - MaxConnPerHost int `config:"max_conn_per_host"` - Timeout time.Duration `config:"timeout"` + Protocol string `config:"protocol"` + Hosts []string `config:"hosts"` + Path string `config:"path"` + Headers map[string]string `config:"headers"` + APIKey string `config:"api_key"` + ServiceToken string `config:"service_token"` + ProxyURL string `config:"proxy_url"` + ProxyDisable bool `config:"proxy_disable"` + ProxyHeaders map[string]string `config:"proxy_headers"` + TLS *tlscommon.Config `config:"ssl"` + MaxRetries int `config:"max_retries"` + MaxConnPerHost int `config:"max_conn_per_host"` + Timeout time.Duration `config:"timeout"` + MaxContentLength int `config:"max_content_length"` } // InitDefaults initializes the defaults for the configuration. @@ -50,6 +51,7 @@ func (c *Elasticsearch) InitDefaults() { c.Timeout = 90 * time.Second c.MaxRetries = 3 c.MaxConnPerHost = 128 + c.MaxContentLength = 100 * 1024 * 1024 } // Validate ensures that the configuration is valid. diff --git a/internal/pkg/coordinator/monitor.go b/internal/pkg/coordinator/monitor.go index 53870e58e..f203ab45a 100644 --- a/internal/pkg/coordinator/monitor.go +++ b/internal/pkg/coordinator/monitor.go @@ -508,7 +508,7 @@ func runUnenroller(ctx context.Context, bulker bulk.Bulk, policyID string, unenr func runUnenrollerWork(ctx context.Context, bulker bulk.Bulk, policyID string, unenrollTimeout time.Duration, zlog zerolog.Logger, agentsIndex string) error { agents, err := dl.FindOfflineAgents(ctx, bulker, policyID, unenrollTimeout, dl.WithIndexName(agentsIndex)) - if err != nil || len(agents) == 0 { + if err != nil { return err } @@ -540,11 +540,13 @@ func unenrollAgent(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, a dl.FieldUnenrolledReason: unenrolledReasonTimeout, dl.FieldUpdatedAt: now, } + body, err := fields.Marshal() if err != nil { return err } - apiKeys := getAPIKeyIDs(agent) + + apiKeys := agent.APIKeyIDs() zlog = zlog.With(). Str(logger.AgentID, agent.Id). 
@@ -560,24 +562,13 @@ func unenrollAgent(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, a return err } } - if err = bulker.Update(ctx, agentsIndex, agent.Id, body, bulk.WithRefresh()); err != nil { + if err = bulker.Update(ctx, agentsIndex, agent.Id, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)); err != nil { zlog.Error().Err(err).Msg("Fail unenrollAgent record update") } return err } -func getAPIKeyIDs(agent *model.Agent) []string { - keys := make([]string, 0, 1) - if agent.AccessAPIKeyID != "" { - keys = append(keys, agent.AccessAPIKeyID) - } - if agent.DefaultAPIKeyID != "" { - keys = append(keys, agent.DefaultAPIKeyID) - } - return keys -} - func waitWithContext(ctx context.Context, to time.Duration) error { t := time.NewTimer(to) defer t.Stop() diff --git a/internal/pkg/coordinator/monitor_integration_test.go b/internal/pkg/coordinator/monitor_integration_test.go index ffef699d1..defc4a9c7 100644 --- a/internal/pkg/coordinator/monitor_integration_test.go +++ b/internal/pkg/coordinator/monitor_integration_test.go @@ -159,7 +159,7 @@ func TestMonitorUnenroller(t *testing.T) { agentID, "", []byte(""), - apikey.NewMetadata(agentID, apikey.TypeAccess), + apikey.NewMetadata(agentID, "", apikey.TypeAccess), ) require.NoError(t, err) outputKey, err := bulker.APIKeyCreate( @@ -167,20 +167,21 @@ func TestMonitorUnenroller(t *testing.T) { agentID, "", []byte(""), - apikey.NewMetadata(agentID, apikey.TypeAccess), + apikey.NewMetadata(agentID, "default", apikey.TypeAccess), ) require.NoError(t, err) // add agent that should be unenrolled sixAgo := time.Now().UTC().Add(-6 * time.Minute) agentBody, err := json.Marshal(model.Agent{ - AccessAPIKeyID: accessKey.ID, - DefaultAPIKeyID: outputKey.ID, - Active: true, - EnrolledAt: sixAgo.Format(time.RFC3339), - LastCheckin: sixAgo.Format(time.RFC3339), - PolicyID: policy1Id, - UpdatedAt: sixAgo.Format(time.RFC3339), + AccessAPIKeyID: accessKey.ID, + Outputs: map[string]*model.PolicyOutput{ + "default": {APIKeyID: outputKey.ID}}, + Active: true, + EnrolledAt: sixAgo.Format(time.RFC3339), + LastCheckin: sixAgo.Format(time.RFC3339), + PolicyID: policy1Id, + UpdatedAt: sixAgo.Format(time.RFC3339), }) require.NoError(t, err) _, err = bulker.Create(ctx, agentsIndex, agentID, agentBody) @@ -306,7 +307,7 @@ func TestMonitorUnenrollerSetAndClear(t *testing.T) { agentID, "", []byte(""), - apikey.NewMetadata(agentID, apikey.TypeAccess), + apikey.NewMetadata(agentID, "", apikey.TypeAccess), ) require.NoError(t, err) outputKey, err := bulker.APIKeyCreate( @@ -314,7 +315,7 @@ func TestMonitorUnenrollerSetAndClear(t *testing.T) { agentID, "", []byte(""), - apikey.NewMetadata(agentID, apikey.TypeAccess), + apikey.NewMetadata(agentID, "default", apikey.TypeAccess), ) require.NoError(t, err) diff --git a/internal/pkg/dl/agent.go b/internal/pkg/dl/agent.go index 1d52082f7..a4871fa73 100644 --- a/internal/pkg/dl/agent.go +++ b/internal/pkg/dl/agent.go @@ -6,6 +6,7 @@ package dl import ( "context" + "fmt" "time" "github.com/elastic/fleet-server/v7/internal/pkg/bulk" @@ -48,19 +49,23 @@ func prepareOfflineAgentsByPolicyID() *dsl.Tmpl { return tmpl } -func FindAgent(ctx context.Context, bulker bulk.Bulk, tmpl *dsl.Tmpl, name string, v interface{}, opt ...Option) (agent model.Agent, err error) { +func FindAgent(ctx context.Context, bulker bulk.Bulk, tmpl *dsl.Tmpl, name string, v interface{}, opt ...Option) (model.Agent, error) { o := newOption(FleetAgents, opt...) 
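+	// A typical call, mirroring the integration test below:
+	//   FindAgent(ctx, bulker, QueryAgentByID, FieldID, agentID)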
res, err := SearchWithOneParam(ctx, bulker, tmpl, o.indexName, name, v) if err != nil { - return + return model.Agent{}, fmt.Errorf("failed searching for agent: %w", err) } if len(res.Hits) == 0 { - return agent, ErrNotFound + return model.Agent{}, ErrNotFound } - err = res.Hits[0].Unmarshal(&agent) - return agent, err + var agent model.Agent + if err = res.Hits[0].Unmarshal(&agent); err != nil { + return model.Agent{}, fmt.Errorf("could not unmarshal ES document into model.Agent: %w", err) + } + + return agent, nil } func FindOfflineAgents(ctx context.Context, bulker bulk.Bulk, policyID string, unenrollTimeout time.Duration, opt ...Option) ([]model.Agent, error) { @@ -71,18 +76,19 @@ func FindOfflineAgents(ctx context.Context, bulker bulk.Bulk, policyID string, u FieldLastCheckin: past, }) if err != nil { - return nil, err + return nil, fmt.Errorf("failed searching for agent: %w", err) } if len(res.Hits) == 0 { - return nil, nil + return nil, ErrNotFound } agents := make([]model.Agent, len(res.Hits)) for i, hit := range res.Hits { if err := hit.Unmarshal(&agents[i]); err != nil { - return nil, err + return nil, fmt.Errorf("could not unmarshal ES document into model.Agent: %w", err) } } + return agents, nil } diff --git a/internal/pkg/dl/agent_integration_test.go b/internal/pkg/dl/agent_integration_test.go index 4e65ddb94..3baab6c7e 100644 --- a/internal/pkg/dl/agent_integration_test.go +++ b/internal/pkg/dl/agent_integration_test.go @@ -108,3 +108,48 @@ func TestFindOfflineAgents(t *testing.T) { require.Len(t, agents, 2) assert.EqualValues(t, []string{twoDayOldID, threeDayOldID}, []string{agents[0].Id, agents[1].Id}) } + +func TestFindAgent_NewModel(t *testing.T) { + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetAgents) + + now := time.Now().UTC() + nowStr := now.Format(time.RFC3339) + + policyID := uuid.Must(uuid.NewV4()).String() + agentID := uuid.Must(uuid.NewV4()).String() + + wantOutputs := map[string]*model.PolicyOutput{ + "default": { + Type: "elasticsearch", + APIKey: "TestFindNewModelAgent_APIKey", + ToRetireAPIKeyIds: []model.ToRetireAPIKeyIdsItems{ + { + ID: "TestFindNewModelAgent_APIKeyID_invalidated", + RetiredAt: "TestFindNewModelAgent_APIKeyID_invalidated_at"}, + }, + APIKeyID: "TestFindNewModelAgent_APIKeyID", + PermissionsHash: "TestFindNewModelAgent_PermisPolicysionsHash", + }, + } + body, err := json.Marshal(model.Agent{ + PolicyID: policyID, + Active: true, + LastCheckin: nowStr, + LastCheckinStatus: "", + UpdatedAt: nowStr, + EnrolledAt: nowStr, + Outputs: wantOutputs, + }) + require.NoError(t, err) + + _, err = bulker.Create( + context.Background(), index, agentID, body, bulk.WithRefresh()) + require.NoError(t, err) + + agent, err := FindAgent( + context.Background(), bulker, QueryAgentByID, FieldID, agentID, WithIndexName(index)) + require.NoError(t, err) + + assert.Equal(t, agentID, agent.Id) + assert.Equal(t, wantOutputs, agent.Outputs) +} diff --git a/internal/pkg/dl/constants.go b/internal/pkg/dl/constants.go index c0f1f2c0b..7d1f90555 100644 --- a/internal/pkg/dl/constants.go +++ b/internal/pkg/dl/constants.go @@ -27,30 +27,32 @@ const ( FieldMaxSeqNo = "max_seq_no" FieldActionSeqNo = "action_seq_no" - FieldActionID = "action_id" - FieldPolicyID = "policy_id" - FieldRevisionIdx = "revision_idx" - FieldCoordinatorIdx = "coordinator_idx" - FieldLastCheckin = "last_checkin" - FieldLastCheckinStatus = "last_checkin_status" - FieldLastCheckinMessage = "last_checkin_message" - FieldLocalMetadata = "local_metadata" - FieldComponents = 
"components" - FieldPolicyRevisionIdx = "policy_revision_idx" - FieldPolicyCoordinatorIdx = "policy_coordinator_idx" - FieldDefaultAPIKey = "default_api_key" - FieldDefaultAPIKeyID = "default_api_key_id" //nolint:gosec // field name - FieldDefaultAPIKeyHistory = "default_api_key_history" //nolint:gosec // field name - FieldPolicyOutputPermissionsHash = "policy_output_permissions_hash" - FieldUnenrolledReason = "unenrolled_reason" - FieldAgentVersion = "version" - FieldAgent = "agent" + FieldActionID = "action_id" + FieldAgent = "agent" + FieldAgentVersion = "version" + FieldCoordinatorIdx = "coordinator_idx" + FieldLastCheckin = "last_checkin" + FieldLastCheckinStatus = "last_checkin_status" + FieldLastCheckinMessage = "last_checkin_message" + FieldLocalMetadata = "local_metadata" + FieldComponents = "components" + FieldPolicyCoordinatorIdx = "policy_coordinator_idx" + FieldPolicyID = "policy_id" + FieldPolicyOutputAPIKey = "api_key" + FieldPolicyOutputAPIKeyID = "api_key_id" + FieldPolicyOutputPermissionsHash = "permissions_hash" + FieldPolicyOutputToRetireAPIKeyIDs = "to_retire_api_key_ids" //nolint:gosec // false positive + FieldPolicyRevisionIdx = "policy_revision_idx" + FieldRevisionIdx = "revision_idx" + FieldUnenrolledReason = "unenrolled_reason" + FiledType = "type" FieldActive = "active" FieldUpdatedAt = "updated_at" FieldUnenrolledAt = "unenrolled_at" FieldUpgradedAt = "upgraded_at" FieldUpgradeStartedAt = "upgrade_started_at" + FieldUpgradeStatus = "upgrade_status" FieldDecodedSha256 = "decoded_sha256" FieldIdentifier = "identifier" diff --git a/internal/pkg/dl/enrollment_api_key.go b/internal/pkg/dl/enrollment_api_key.go index 0252beff0..e9b62e21a 100644 --- a/internal/pkg/dl/enrollment_api_key.go +++ b/internal/pkg/dl/enrollment_api_key.go @@ -19,25 +19,29 @@ const ( ) var ( - QueryEnrollmentAPIKeyByID = prepareFindEnrollmentAPIKeyByID() - QueryEnrollmentAPIKeyByPolicyID = prepareFindEnrollmentAPIKeyByPolicyID() + QueryEnrollmentAPIKeyByID = prepareFindActiveEnrollmentAPIKeyByID() + QueryEnrollmentAPIKeyByPolicyID = prepareFindActiveEnrollmentAPIKeyByPolicyID() ) -func prepareFindEnrollmentAPIKeyByID() *dsl.Tmpl { +func prepareFindActiveEnrollmentAPIKeyByID() *dsl.Tmpl { tmpl := dsl.NewTmpl() root := dsl.NewRoot() - root.Query().Bool().Filter().Term(FieldAPIKeyID, tmpl.Bind(FieldAPIKeyID), nil) + filter := root.Query().Bool().Filter() + filter.Term(FieldAPIKeyID, tmpl.Bind(FieldAPIKeyID), nil) + filter.Term(FieldActive, true, nil) tmpl.MustResolve(root) return tmpl } -func prepareFindEnrollmentAPIKeyByPolicyID() *dsl.Tmpl { +func prepareFindActiveEnrollmentAPIKeyByPolicyID() *dsl.Tmpl { tmpl := dsl.NewTmpl() root := dsl.NewRoot() - root.Query().Bool().Filter().Term(FieldPolicyID, tmpl.Bind(FieldPolicyID), nil) + filter := root.Query().Bool().Filter() + filter.Term(FieldPolicyID, tmpl.Bind(FieldPolicyID), nil) + filter.Term(FieldActive, true, nil) tmpl.MustResolve(root) return tmpl diff --git a/internal/pkg/dl/enrollment_api_key_integration_test.go b/internal/pkg/dl/enrollment_api_key_integration_test.go index 02883ee3c..4cd2de509 100644 --- a/internal/pkg/dl/enrollment_api_key_integration_test.go +++ b/internal/pkg/dl/enrollment_api_key_integration_test.go @@ -22,13 +22,13 @@ import ( ftesting "github.com/elastic/fleet-server/v7/internal/pkg/testing" ) -func createRandomEnrollmentAPIKey(policyID string) model.EnrollmentAPIKey { +func createRandomEnrollmentAPIKey(policyID string, active bool) model.EnrollmentAPIKey { now := time.Now().UTC() return 
model.EnrollmentAPIKey{ ESDocument: model.ESDocument{ Id: xid.New().String(), }, - Active: true, + Active: active, APIKey: "d2JndlFIWUJJUVVxWDVia2NJTV86X0d6ZmljZGNTc1d4R1otbklrZFFRZw==", APIKeyID: xid.New().String(), CreatedAt: now.Format(time.RFC3339), @@ -38,8 +38,8 @@ func createRandomEnrollmentAPIKey(policyID string) model.EnrollmentAPIKey { } -func storeRandomEnrollmentAPIKey(ctx context.Context, bulker bulk.Bulk, index string, policyID string) (rec model.EnrollmentAPIKey, err error) { - rec = createRandomEnrollmentAPIKey(policyID) +func storeRandomEnrollmentAPIKey(ctx context.Context, bulker bulk.Bulk, index string, policyID string, active bool) (rec model.EnrollmentAPIKey, err error) { + rec = createRandomEnrollmentAPIKey(policyID, active) body, err := json.Marshal(rec) if err != nil { @@ -58,7 +58,7 @@ func TestSearchEnrollmentAPIKeyByID(t *testing.T) { index, bulker := ftesting.SetupCleanIndex(ctx, t, FleetEnrollmentAPIKeys) - rec, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, uuid.Must(uuid.NewV4()).String()) + rec, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, uuid.Must(uuid.NewV4()).String(), true) if err != nil { t.Fatal(err) } @@ -91,15 +91,15 @@ func TestSearchEnrollmentAPIKeyByPolicyID(t *testing.T) { index, bulker := ftesting.SetupCleanIndex(ctx, t, FleetEnrollmentAPIKeys) policyID := uuid.Must(uuid.NewV4()).String() - rec1, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, policyID) + rec1, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, policyID, true) if err != nil { t.Fatal(err) } - rec2, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, policyID) + rec2, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, policyID, true) if err != nil { t.Fatal(err) } - _, err = storeRandomEnrollmentAPIKey(ctx, bulker, index, uuid.Must(uuid.NewV4()).String()) + _, err = storeRandomEnrollmentAPIKey(ctx, bulker, index, uuid.Must(uuid.NewV4()).String(), true) if err != nil { t.Fatal(err) } @@ -114,3 +114,32 @@ func TestSearchEnrollmentAPIKeyByPolicyID(t *testing.T) { t.Fatal(diff) } } + +func TestSearchEnrollmentAPIKeyByPolicyIDWithInactiveIDs(t *testing.T) { + ctx, cn := context.WithCancel(context.Background()) + defer cn() + + index, bulker := ftesting.SetupCleanIndex(ctx, t, FleetEnrollmentAPIKeys) + + policyID := uuid.Must(uuid.NewV4()).String() + rec, err := storeRandomEnrollmentAPIKey(ctx, bulker, index, policyID, true) + if err != nil { + t.Fatalf("unable to store enrollment key: %v", err) + } + for i := 0; i < 10; i++ { + _, err = storeRandomEnrollmentAPIKey(ctx, bulker, index, uuid.Must(uuid.NewV4()).String(), false) + if err != nil { + t.Fatalf("unable to store enrollment key: %v", err) + } + } + + foundRecs, err := findEnrollmentAPIKeys(ctx, bulker, index, QueryEnrollmentAPIKeyByPolicyID, FieldPolicyID, policyID) + if err != nil { + t.Fatalf("unable to find enrollment key: %v", err) + } + + diff := cmp.Diff([]model.EnrollmentAPIKey{rec}, foundRecs) + if diff != "" { + t.Fatalf("expected content does not match: %v", diff) + } +} diff --git a/internal/pkg/dl/migration.go b/internal/pkg/dl/migration.go index 4beb26741..a70918d61 100644 --- a/internal/pkg/dl/migration.go +++ b/internal/pkg/dl/migration.go @@ -12,59 +12,78 @@ import ( "net/http" "time" - "github.com/elastic/fleet-server/v7/internal/pkg/bulk" - "github.com/elastic/fleet-server/v7/internal/pkg/dsl" - "github.com/elastic/go-elasticsearch/v7/esapi" "github.com/pkg/errors" "github.com/rs/zerolog/log" -) -func Migrate(ctx context.Context, 
bulker bulk.Bulk) error { - return migrateAgentMetadata(ctx, bulker) -} - -// FleetServer 7.15 added a new *AgentMetadata field to the Agent record. -// This field was populated in new enrollments in 7.15 and later; however, the -// change was not backported to support 7.14. The security team is reliant on the -// existence of this field in 7.16, so the following migration was added to -// support upgrade from 7.14. -// -// It is currently safe to run this in the background; albeit with some -// concern on conflicts. The conflict risk exists regardless as N Fleet Servers -// can be run in parallel at the same time. -// -// As the update only occurs once, the 99.9% case is a noop. -func migrateAgentMetadata(ctx context.Context, bulker bulk.Bulk) error { + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" + "github.com/elastic/fleet-server/v7/internal/pkg/dsl" +) - root := dsl.NewRoot() - root.Query().Bool().MustNot().Exists("agent.id") +type ( + migrationFn func(context.Context, bulk.Bulk) error + migrationBodyFn func() (string, string, []byte, error) + migrationResponse struct { + Took int `json:"took"` + TimedOut bool `json:"timed_out"` + Total int `json:"total"` + Updated int `json:"updated"` + Deleted int `json:"deleted"` + Batches int `json:"batches"` + VersionConflicts int `json:"version_conflicts"` + Noops int `json:"noops"` + Retries struct { + Bulk int `json:"bulk"` + Search int `json:"search"` + } `json:"retries"` + Failures []json.RawMessage `json:"failures"` + } +) - painless := "ctx._source.agent = [:]; ctx._source.agent.id = ctx._id;" - root.Param("script", painless) +// timeNow is used to get the current time. It should be replaced for testing. +var timeNow = time.Now - body, err := root.MarshalJSON() - if err != nil { - return err +// Migrate applies, in sequence, the migration functions. Currently, each migration +// function is responsible to ensure it only applies the migration if needed, +// being a no-op otherwise. +func Migrate(ctx context.Context, bulker bulk.Bulk) error { + for _, fn := range []migrationFn{migrateTov7_15, migrateToV8_5} { + if err := fn(ctx, bulker); err != nil { + return err + } } -LOOP: + return nil +} + +func migrate(ctx context.Context, bulker bulk.Bulk, fn migrationBodyFn) (int, error) { + var updatedDocs int for { - nConflicts, err := updateAgentMetadata(ctx, bulker, body) + name, index, body, err := fn() if err != nil { - return err - } - if nConflicts == 0 { - break LOOP + return updatedDocs, + fmt.Errorf("failed to prepare request for migration %s: %w", + name, err) } - time.Sleep(time.Second) + resp, err := applyMigration(ctx, name, index, bulker, body) + if err != nil { + log.Err(err). + Bytes("http.request.body.content", body). + Msgf("migration %s failed", name) + return updatedDocs, fmt.Errorf("failed to apply migration %q: %w", + name, err) + } + updatedDocs += resp.Updated + if resp.VersionConflicts == 0 { + break + } } - return nil + return updatedDocs, nil } -func updateAgentMetadata(ctx context.Context, bulker bulk.Bulk, body []byte) (int, error) { +func applyMigration(ctx context.Context, name string, index string, bulker bulk.Bulk, body []byte) (migrationResponse, error) { start := time.Now() client := bulker.Client() @@ -78,59 +97,199 @@ func updateAgentMetadata(ctx context.Context, bulker bulk.Bulk, body []byte) (in client.UpdateByQuery.WithConflicts("proceed"), } - res, err := client.UpdateByQuery([]string{FleetAgents}, opts...) - + res, err := client.UpdateByQuery([]string{index}, opts...) 
if err != nil { - return 0, err + return migrationResponse{}, err } if res.IsError() { if res.StatusCode == http.StatusNotFound { // Ignore index not created yet; nothing to upgrade - return 0, nil + return migrationResponse{}, nil } - return 0, fmt.Errorf("Migrate UpdateByQuery %s", res.String()) + return migrationResponse{}, fmt.Errorf("migrate %s UpdateByQuery failed: %s", + name, res.String()) } - resp := struct { - Took int `json:"took"` - TimedOut bool `json:"timed_out"` - Total int `json:"total"` - Updated int `json:"updated"` - Deleted int `json:"deleted"` - Batches int `json:"batches"` - VersionConflicts int `json:"version_conflicts"` - Noops int `json:"noops"` - Retries struct { - Bulk int `json:"bulk"` - Search int `json:"search"` - } `json:"retries"` - Failures []json.RawMessage `json:"failures"` - }{} + resp := migrationResponse{} decoder := json.NewDecoder(res.Body) if err := decoder.Decode(&resp); err != nil { - return 0, errors.Wrap(err, "decode UpdateByQuery response") + return migrationResponse{}, errors.Wrap(err, "decode UpdateByQuery response") } log.Info(). - Int("took", resp.Took). - Bool("timed_out", resp.TimedOut). - Int("total", resp.Total). - Int("updated", resp.Updated). - Int("deleted", resp.Deleted). - Int("batches", resp.Batches). - Int("version_conflicts", resp.VersionConflicts). - Int("noops", resp.Noops). - Int("retries.bulk", resp.Retries.Bulk). - Int("retries.search", resp.Retries.Search). - Dur("rtt", time.Since(start)). - Msg("migrate agent records response") + Str("fleet.migration.name", name). + Int("fleet.migration.es.took", resp.Took). + Bool("fleet.migration.es.timed_out", resp.TimedOut). + Int("fleet.migration.updated", resp.Updated). + Int("fleet.migration.deleted", resp.Deleted). + Int("fleet.migration.batches", resp.Batches). + Int("fleet.migration.version_conflicts", resp.VersionConflicts). + Int("fleet.migration.noops", resp.Noops). + Int("fleet.migration.retries.bulk", resp.Retries.Bulk). + Int("fleet.migration.retries.search", resp.Retries.Search). + Dur("fleet.migration.total.duration", time.Since(start)). + Int("fleet.migration.total.count", resp.Total). + Msgf("migration %s done", name) for _, fail := range resp.Failures { - log.Error().RawJSON("failure", fail).Msg("migration failure") + log.Error().RawJSON("failure", fail).Msgf("failed applying %s migration", name) + } + + return resp, err +} + +// ============================== V7.15 migration ============================== +func migrateTov7_15(ctx context.Context, bulker bulk.Bulk) error { + log.Debug().Msg("applying migration to v7.15") + _, err := migrate(ctx, bulker, migrateAgentMetadata) + if err != nil { + return fmt.Errorf("v7.15.0 data migration failed: %w", err) + } + + return nil +} + +// FleetServer 7.15 added a new *AgentMetadata field to the Agent record. +// This field was populated in new enrollments in 7.15 and later; however, the +// change was not backported to support 7.14. The security team is reliant on the +// existence of this field in 7.16, so the following migration was added to +// support upgrade from 7.14. +// +// It is currently safe to run this in the background; albeit with some +// concern on conflicts. The conflict risk exists regardless as N Fleet Servers +// can be run in parallel at the same time. +// +// As the update only occurs once, the 99.9% case is a noop. 
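Taken together, migrate and applyMigration implement a small retry contract: the update by query runs with conflicts=proceed, so conflicting documents are skipped and counted rather than failing the request, and the body function is re-applied until a pass completes with zero version conflicts. A minimal sketch of that contract (illustrative only; the response is reduced to the single field the loop inspects):

// retryUntilNoConflicts mirrors the shape of the migrate loop above: run
// re-issues the update-by-query and reports the version conflict count;
// the loop stops once a pass completes conflict-free.
func retryUntilNoConflicts(run func() (versionConflicts int, err error)) error {
	for {
		conflicts, err := run()
		if err != nil {
			return err
		}
		if conflicts == 0 {
			return nil
		}
	}
}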
+func migrateAgentMetadata() (string, string, []byte, error) {
+	const migrationName = "AgentMetadata"
+	query := dsl.NewRoot()
+	query.Query().Bool().MustNot().Exists("agent.id")
+
+	painless := "ctx._source.agent = [:]; ctx._source.agent.id = ctx._id;"
+	query.Param("script", painless)
+
+	body, err := query.MarshalJSON()
+	if err != nil {
+		return migrationName, FleetAgents, nil, fmt.Errorf("could not marshal ES query: %w", err)
+	}
+
+	return migrationName, FleetAgents, body, nil
+}
+
+// ============================== V8.5.0 migration =============================
+// https://github.com/elastic/fleet-server/issues/1672
+
+func migrateToV8_5(ctx context.Context, bulker bulk.Bulk) error {
+	log.Debug().Msg("applying migration to v8.5.0")
+	migrated, err := migrate(ctx, bulker, migrateAgentOutputs)
+	if err != nil {
+		return fmt.Errorf("v8.5.0 data migration failed: %w", err)
+	}
+
+	// The migration was necessary and indeed ran, thus we need to regenerate
+	// the API keys for all agents. In order to do so, we increase the policy
+	// coordinator index to force a policy update.
+	if migrated > 0 {
+		_, err := migrate(ctx, bulker, migratePolicyCoordinatorIdx)
+		if err != nil {
+			return fmt.Errorf("v8.5.0 data migration failed: %w", err)
+		}
+	}
+
+	return nil
+}
+
+// migrateAgentOutputs performs the necessary changes on the Agent documents
+// to introduce the `Outputs` field.
+//
+// FleetServer 8.5.0 introduces a new field to the Agent document, Outputs, to
+// store the outputs credentials and data. The DefaultAPIKey, DefaultAPIKeyID,
+// DefaultAPIKeyHistory and PolicyOutputPermissionsHash are now deprecated in
+// favour of the new `Outputs` field, which maps the output name to its data.
+// This change fixes https://github.com/elastic/fleet-server/issues/1672.
+//
+// The change is backward compatible as the deprecated fields are just set to
+// their zero value and an older version of FleetServer can repopulate them.
+// However, reverting FleetServer to an older version might cause the very issue
+// this change fixes.
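On a single agent document, the migration defined below amounts to the following transformation (a sketch with placeholder values; the actual retired_at timestamp comes from timeNow):

// Before: credentials in the deprecated top-level fields.
before := model.Agent{
	DefaultAPIKey:               "keyID:keySecret",
	DefaultAPIKeyID:             "keyID",
	PolicyOutputPermissionsHash: "<sha256>",
}

// After: the same data moves under Outputs["default"]; the current key is
// emptied so fleet-server issues a fresh one, and the old key ID is queued
// for retirement.
after := model.Agent{
	Outputs: map[string]*model.PolicyOutput{
		"default": {
			Type:            "elasticsearch",
			APIKey:          "",
			APIKeyID:        "",
			PermissionsHash: "<sha256>",
			ToRetireAPIKeyIds: []model.ToRetireAPIKeyIdsItems{
				{ID: "keyID", RetiredAt: "<timeNow>"},
			},
		},
	},
}
_, _ = before, after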
+func migrateAgentOutputs() (string, string, []byte, error) {
+	const (
+		migrationName = "AgentOutputs"
+		fieldOutputs = "outputs"
+		fieldDefaultAPIKeyID = "default_api_key_id" // nolint:gosec,G101 // this is not a credential
+		fieldRetiredAt = "retiredAt"
+	)
+
+	query := dsl.NewRoot()
+	query.Query().Bool().Must().Exists(fieldDefaultAPIKeyID)
+
+	fields := map[string]interface{}{fieldRetiredAt: timeNow().UTC().Format(time.RFC3339)}
+	painless := `
+// set up the new fields
+ctx._source['` + fieldOutputs + `']=new HashMap();
+ctx._source['` + fieldOutputs + `']['default']=new HashMap();
+ctx._source['` + fieldOutputs + `']['default'].to_retire_api_key_ids=new ArrayList();
+
+// copy 'default_api_key_history' to new 'outputs' field
+ctx._source['` + fieldOutputs + `']['default'].type="elasticsearch";
+if (ctx._source.default_api_key_history != null && ctx._source.default_api_key_history.length > 0) {
+	ctx._source['` + fieldOutputs + `']['default'].to_retire_api_key_ids=ctx._source.default_api_key_history;
+}
+
+Map map = new HashMap();
+map.put("retired_at", params.` + fieldRetiredAt + `);
+map.put("id", ctx._source.default_api_key_id);
+
+// Make current API key empty, so fleet-server will generate a new one
+// Add the current API key to be retired
+if (ctx._source['` + fieldOutputs + `']['default'].to_retire_api_key_ids != null) {
+	ctx._source['` + fieldOutputs + `']['default'].to_retire_api_key_ids.add(map);
+}
+ctx._source['` + fieldOutputs + `']['default'].api_key="";
+ctx._source['` + fieldOutputs + `']['default'].api_key_id="";
+ctx._source['` + fieldOutputs + `']['default'].permissions_hash=ctx._source.policy_output_permissions_hash;
+
+// Erase deprecated fields
+ctx._source.default_api_key_history=null;
+ctx._source.default_api_key=null;
+ctx._source.default_api_key_id=null;
+ctx._source.policy_output_permissions_hash=null;
+`
+	query.Param("script", map[string]interface{}{
+		"lang": "painless",
+		"source": painless,
+		"params": fields,
+	})
+
+	body, err := query.MarshalJSON()
+	if err != nil {
+		return migrationName, FleetAgents, nil, fmt.Errorf("could not marshal ES query: %w", err)
+	}
+
+	return migrationName, FleetAgents, body, nil
+}
+
+// migratePolicyCoordinatorIdx increases the policy's CoordinatorIdx to force
+// a policy update ensuring the output data will be migrated to the new
+// Agent.Outputs field. See migrateAgentOutputs and https://github.com/elastic/fleet-server/issues/1672
+// for details.
+func migratePolicyCoordinatorIdx() (string, string, []byte, error) {
+	const migrationName = "PolicyCoordinatorIdx"
+
+	query := dsl.NewRoot()
+	query.Query().MatchAll()
+	painless := `ctx._source.coordinator_idx++;`
+	query.Param("script", painless)
+
+	body, err := query.MarshalJSON()
+	if err != nil {
+		log.Debug().Str("painlessScript", painless).
+			Msgf("%s: failed to marshal painless script", migrationName)
+		return migrationName, FleetPolicies, nil, fmt.Errorf("could not marshal ES query: %w", err)
+	}
-	return resp.VersionConflicts, err
+	return migrationName, FleetPolicies, body, nil
 }
diff --git a/internal/pkg/dl/migration_integration_test.go b/internal/pkg/dl/migration_integration_test.go
new file mode 100644
index 000000000..fdbfd8a7e
--- /dev/null
+++ b/internal/pkg/dl/migration_integration_test.go
@@ -0,0 +1,358 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+ +//go:build integration + +package dl + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/gofrs/uuid" + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" + "github.com/elastic/fleet-server/v7/internal/pkg/model" + ftesting "github.com/elastic/fleet-server/v7/internal/pkg/testing" +) + +const nowStr = "2022-08-12T16:50:05Z" + +func createSomeAgents(t *testing.T, n int, apiKey bulk.APIKey, index string, bulker bulk.Bulk) []string { + t.Helper() + + var createdAgents []string + + for i := 0; i < n; i++ { + outputAPIKey := bulk.APIKey{ + ID: fmt.Sprint(apiKey.ID, i), + Key: fmt.Sprint(apiKey.Key, i), + } + + agentID := uuid.Must(uuid.NewV4()).String() + policyID := uuid.Must(uuid.NewV4()).String() + + agentModel := model.Agent{ + PolicyID: policyID, + Active: true, + LastCheckin: nowStr, + LastCheckinStatus: "", + UpdatedAt: nowStr, + EnrolledAt: nowStr, + DefaultAPIKeyID: outputAPIKey.ID, + DefaultAPIKey: outputAPIKey.Agent(), + PolicyOutputPermissionsHash: fmt.Sprint("a_output_permission_SHA_", i), + DefaultAPIKeyHistory: []model.ToRetireAPIKeyIdsItems{ + { + ID: "old_" + outputAPIKey.ID, + RetiredAt: nowStr, + }, + }, + } + + body, err := json.Marshal(agentModel) + require.NoError(t, err) + + _, err = bulker.Create( + context.Background(), index, agentID, body, bulk.WithRefresh()) + require.NoError(t, err) + + createdAgents = append(createdAgents, agentID) + } + + return createdAgents +} + +func createSomePolicies(t *testing.T, n int, index string, bulker bulk.Bulk) []string { + t.Helper() + + var created []string + + for i := 0; i < n; i++ { + now := time.Now().UTC() + nowStr := now.Format(time.RFC3339) + + policyModel := model.Policy{ + ESDocument: model.ESDocument{}, + CoordinatorIdx: int64(i), + Data: nil, + DefaultFleetServer: false, + PolicyID: fmt.Sprint(i), + RevisionIdx: 1, + Timestamp: nowStr, + UnenrollTimeout: 0, + } + + body, err := json.Marshal(policyModel) + require.NoError(t, err) + + policyDocID, err := bulker.Create( + context.Background(), index, "", body, bulk.WithRefresh()) + require.NoError(t, err) + + created = append(created, policyDocID) + } + + return created +} + +func TestPolicyCoordinatorIdx(t *testing.T) { + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetPolicies) + + docIDs := createSomePolicies(t, 25, index, bulker) + + migrated, err := migrate(context.Background(), bulker, migratePolicyCoordinatorIdx) + require.NoError(t, err) + + require.Equal(t, len(docIDs), migrated) + + for i := range docIDs { + policies, err := QueryLatestPolicies( + context.Background(), bulker, WithIndexName(index)) + if err != nil { + assert.NoError(t, err, "failed to query latest policies") // we want to continue even if a single agent fails + continue + } + + var got model.Policy + for _, p := range policies { + if p.PolicyID == fmt.Sprint(i) { + got = p + } + } + + assert.Equal(t, int64(i+1), got.CoordinatorIdx) + } +} + +func TestMigrateOutputs_withDefaultAPIKeyHistory(t *testing.T) { + now, err := time.Parse(time.RFC3339, nowStr) + require.NoError(t, err, "could not parse time "+nowStr) + timeNow = func() time.Time { + return now + } + + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetAgents) + apiKey := bulk.APIKey{ + ID: "testAgent_", + Key: "testAgent_key_", + } + + agentIDs := createSomeAgents(t, 25, apiKey, index, 
bulker) + + migratedAgents, err := migrate(context.Background(), bulker, migrateAgentOutputs) + require.NoError(t, err) + + assert.Equal(t, len(agentIDs), migratedAgents) + + for i, id := range agentIDs { + wantOutputType := "elasticsearch" //nolint:goconst // test cases have some duplication + + res, err := SearchWithOneParam(context.Background(), bulker, QueryAgentByID, index, FieldID, id) + require.NoError(t, err) + require.Len(t, res.Hits, 1) + + var got model.Agent + err = res.Hits[0].Unmarshal(&got) + require.NoError(t, err, "could not unmarshal ES document into model.Agent") + + gotDeprecatedFields := struct { + // Deprecated. Use Outputs instead. API key the Elastic Agent uses to authenticate with elasticsearch + DefaultAPIKey *string `json:"default_api_key,omitempty"` + + // Deprecated. Use Outputs instead. Default API Key History + DefaultAPIKeyHistory []model.ToRetireAPIKeyIdsItems `json:"default_api_key_history,omitempty"` + + // Deprecated. Use Outputs instead. ID of the API key the Elastic Agent uses to authenticate with elasticsearch + DefaultAPIKeyID *string `json:"default_api_key_id,omitempty"` + + // Deprecated. Use Outputs instead. The policy output permissions hash + PolicyOutputPermissionsHash *string `json:"policy_output_permissions_hash,omitempty"` + }{} + err = res.Hits[0].Unmarshal(&gotDeprecatedFields) + require.NoError(t, err, "could not unmarshal ES document into gotDeprecatedFields") + + wantToRetireAPIKeyIds := []model.ToRetireAPIKeyIdsItems{ + { + // Current API should be marked to retire after the migration + ID: fmt.Sprintf("%s%d", apiKey.ID, i), + RetiredAt: timeNow().UTC().Format(time.RFC3339)}, + { + ID: fmt.Sprintf("old_%s%d", apiKey.ID, i), + RetiredAt: nowStr}, + } + + // Assert new fields + require.Len(t, got.Outputs, 1) + // Default API key is empty to force fleet-server to regenerate them. + assert.Empty(t, got.Outputs["default"].APIKey) + assert.Empty(t, got.Outputs["default"].APIKeyID) + + assert.Equal(t, wantOutputType, got.Outputs["default"].Type) + assert.Equal(t, + fmt.Sprint("a_output_permission_SHA_", i), + got.Outputs["default"].PermissionsHash) + + // Assert ToRetireAPIKeyIds contains the expected values, regardless of the order. 
+ for _, want := range wantToRetireAPIKeyIds { + var found bool + for _, got := range got.Outputs["default"].ToRetireAPIKeyIds { + found = found || cmp.Equal(want, got) + } + if !found { + t.Errorf("could not find %#v, in %#v", + want, got.Outputs["default"].ToRetireAPIKeyIds) + } + } + + // Assert deprecated fields + assert.Nil(t, gotDeprecatedFields.DefaultAPIKey) + assert.Nil(t, gotDeprecatedFields.DefaultAPIKeyID) + assert.Nil(t, gotDeprecatedFields.PolicyOutputPermissionsHash) + assert.Nil(t, gotDeprecatedFields.DefaultAPIKeyHistory) + } +} + +func TestMigrateOutputs_dontMigrateTwice(t *testing.T) { + now, err := time.Parse(time.RFC3339, nowStr) + require.NoError(t, err, "could not parse time "+nowStr) + timeNow = func() time.Time { + return now + } + + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetAgents) + apiKey := bulk.APIKey{ + ID: "testAgent_", + Key: "testAgent_key_", + } + + agentIDs := createSomeAgents(t, 25, apiKey, index, bulker) + + migratedAgents, err := migrate(context.Background(), bulker, migrateAgentOutputs) + require.NoError(t, err) + assert.Equal(t, len(agentIDs), migratedAgents) + + migratedAgents2, err := migrate(context.Background(), bulker, migrateAgentOutputs) + require.NoError(t, err) + + assert.Equal(t, 0, migratedAgents2) +} + +func TestMigrateOutputs_nil_DefaultAPIKeyHistory(t *testing.T) { + wantOutputType := "elasticsearch" + + now, err := time.Parse(time.RFC3339, nowStr) + require.NoError(t, err, "could not parse time "+nowStr) + timeNow = func() time.Time { + return now + } + + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetAgents) + apiKey := bulk.APIKey{ + ID: "testAgent_", + Key: "testAgent_key_", + } + + i := 0 + outputAPIKey := bulk.APIKey{ + ID: fmt.Sprint(apiKey.ID, i), + Key: fmt.Sprint(apiKey.Key, i), + } + + agentID := uuid.Must(uuid.NewV4()).String() + policyID := uuid.Must(uuid.NewV4()).String() + + agentModel := model.Agent{ + PolicyID: policyID, + Active: true, + LastCheckin: nowStr, + LastCheckinStatus: "", + UpdatedAt: nowStr, + EnrolledAt: nowStr, + DefaultAPIKeyID: outputAPIKey.ID, + DefaultAPIKey: outputAPIKey.Agent(), + PolicyOutputPermissionsHash: fmt.Sprint("a_output_permission_SHA_", i), + } + + body, err := json.Marshal(agentModel) + require.NoError(t, err) + + _, err = bulker.Create( + context.Background(), index, agentID, body, bulk.WithRefresh()) + require.NoError(t, err) + + migratedAgents, err := migrate(context.Background(), bulker, migrateAgentOutputs) + require.NoError(t, err) + + res, err := SearchWithOneParam( + context.Background(), bulker, QueryAgentByID, index, FieldID, agentID) + require.NoError(t, err, "failed to find agent ID %q", agentID) + require.Len(t, res.Hits, 1) + + var got model.Agent + err = res.Hits[0].Unmarshal(&got) + require.NoError(t, err, "could not unmarshal ES document into model.Agent") + + gotDeprecatedFields := struct { + // Deprecated. Use Outputs instead. API key the Elastic Agent uses to authenticate with elasticsearch + DefaultAPIKey *string `json:"default_api_key,omitempty"` + + // Deprecated. Use Outputs instead. Default API Key History + DefaultAPIKeyHistory []model.ToRetireAPIKeyIdsItems `json:"default_api_key_history,omitempty"` + + // Deprecated. Use Outputs instead. ID of the API key the Elastic Agent uses to authenticate with elasticsearch + DefaultAPIKeyID *string `json:"default_api_key_id,omitempty"` + + // Deprecated. Use Outputs instead. 
The policy output permissions hash + PolicyOutputPermissionsHash *string `json:"policy_output_permissions_hash,omitempty"` + }{} + err = res.Hits[0].Unmarshal(&gotDeprecatedFields) + require.NoError(t, err, "could not unmarshal ES document into gotDeprecatedFields") + + assert.Equal(t, 1, migratedAgents) + + // Assert new fields + require.Len(t, got.Outputs, 1) + // Default API key is empty to force fleet-server to regenerate them. + assert.Empty(t, got.Outputs["default"].APIKey) + assert.Empty(t, got.Outputs["default"].APIKeyID) + assert.Equal(t, wantOutputType, got.Outputs["default"].Type) + assert.Equal(t, + fmt.Sprint("a_output_permission_SHA_", i), + got.Outputs["default"].PermissionsHash) + + // Assert ToRetireAPIKeyIds contains the expected values, regardless of the order. + if assert.Len(t, got.Outputs["default"].ToRetireAPIKeyIds, 1) { + assert.Equal(t, + model.ToRetireAPIKeyIdsItems{ID: outputAPIKey.ID, RetiredAt: nowStr}, + got.Outputs["default"].ToRetireAPIKeyIds[0]) + } + + // Assert deprecated fields + assert.Nil(t, gotDeprecatedFields.DefaultAPIKey) + assert.Nil(t, gotDeprecatedFields.DefaultAPIKey) + assert.Nil(t, gotDeprecatedFields.PolicyOutputPermissionsHash) + assert.Nil(t, gotDeprecatedFields.DefaultAPIKeyHistory) +} + +func TestMigrateOutputs_no_agent_document(t *testing.T) { + now, err := time.Parse(time.RFC3339, nowStr) + require.NoError(t, err, "could not parse time "+nowStr) + timeNow = func() time.Time { + return now + } + + _, bulker := ftesting.SetupCleanIndex(context.Background(), t, FleetAgents) + + migratedAgents, err := migrate(context.Background(), bulker, migrateAgentOutputs) + require.NoError(t, err) + + assert.Equal(t, 0, migratedAgents) +} diff --git a/internal/pkg/dl/servers.go b/internal/pkg/dl/servers.go index 409654e60..2686bf560 100644 --- a/internal/pkg/dl/servers.go +++ b/internal/pkg/dl/servers.go @@ -55,5 +55,5 @@ func EnsureServer(ctx context.Context, bulker bulk.Bulk, version string, agent m if err != nil { return err } - return bulker.Update(ctx, o.indexName, agent.ID, data) + return bulker.Update(ctx, o.indexName, agent.ID, data, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)) } diff --git a/internal/pkg/es/bulk_update_api_key.go b/internal/pkg/es/bulk_update_api_key.go new file mode 100644 index 000000000..e75df2996 --- /dev/null +++ b/internal/pkg/es/bulk_update_api_key.go @@ -0,0 +1,109 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +// Code generated from specification version 7.x: DO NOT EDIT + +// This is a copy of api.search.go file from go-elasticsearch library +// It was modified for /_fleet/_fleet_search experimental API, +// implemented by the custom fleet plugin https://github.com/elastic/elasticsearch/pull/73134 +// This file can be removed and replaced with the official client library wrapper once it is available + +package es + +import ( + "context" + "io" + "net/http" + "strings" + + "github.com/elastic/go-elasticsearch/v7/esapi" +) + +const updateAPIKeyPath = "/_security/api_key/_bulk_update" + +type UpdateApiKeyBulk func(o ...func(*UpdateApiKeyBulkRequest)) (*Response, error) + +type UpdateApiKeyBulkRequest struct { + Body io.Reader + + Header http.Header + + ctx context.Context +} + +// Do executes the request and returns response or error. 
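A usage sketch for the request type defined below (the transport can be the client returned by bulker.Client(); the key IDs and the empty role_descriptors object are placeholders following the _bulk_update request schema):

// Illustrative only: update two API keys in one call.
body := strings.NewReader(`{"ids":["api-key-id-1","api-key-id-2"],"role_descriptors":{}}`)
req := UpdateApiKeyBulkRequest{Body: body}
res, err := req.Do(ctx, bulker.Client())
if err == nil {
	defer res.Body.Close()
}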
+//
+func (r UpdateApiKeyBulkRequest) Do(ctx context.Context, transport esapi.Transport) (*esapi.Response, error) {
+	var path strings.Builder
+
+	path.Grow(len(updateAPIKeyPath))
+	path.WriteString(updateAPIKeyPath)
+
+	req, err := newRequest(http.MethodPost, path.String(), r.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if r.Body != nil {
+		req.Header[headerContentType] = headerContentTypeJSON
+	}
+
+	if len(r.Header) > 0 {
+		if len(req.Header) == 0 {
+			req.Header = r.Header
+		} else {
+			for k, vv := range r.Header {
+				for _, v := range vv {
+					req.Header.Add(k, v)
+				}
+			}
+		}
+	}
+
+	if ctx != nil {
+		req = req.WithContext(ctx)
+	}
+
+	res, err := transport.Perform(req)
+	if err != nil {
+		return nil, err
+	}
+
+	response := esapi.Response{
+		StatusCode: res.StatusCode,
+		Body: res.Body,
+		Header: res.Header,
+	}
+
+	return &response, nil
+}
+
+// WithContext sets the request context.
+//
+func (f UpdateApiKeyBulkRequest) WithContext(v context.Context) func(*UpdateApiKeyBulkRequest) {
+	return func(r *UpdateApiKeyBulkRequest) {
+		r.ctx = v
+	}
+}
+
+// WithBody - the bulk API key update definition, encoded as JSON.
+//
+func (f UpdateApiKeyBulkRequest) WithBody(v io.Reader) func(*UpdateApiKeyBulkRequest) {
+	return func(r *UpdateApiKeyBulkRequest) {
+		r.Body = v
+	}
+}
+
+// WithHeader adds the headers to the HTTP request.
+//
+func (f UpdateApiKeyBulkRequest) WithHeader(h map[string]string) func(*UpdateApiKeyBulkRequest) {
+	return func(r *UpdateApiKeyBulkRequest) {
+		if r.Header == nil {
+			r.Header = make(http.Header)
+		}
+		for k, v := range h {
+			r.Header.Add(k, v)
+		}
+	}
+}
diff --git a/internal/pkg/es/error.go b/internal/pkg/es/error.go
index 79b07499c..a5e575df5 100644
--- a/internal/pkg/es/error.go
+++ b/internal/pkg/es/error.go
@@ -37,17 +37,25 @@ func (e ErrElastic) Error() string {
 	// Otherwise were getting: "elastic fail 404::"
 	msg := "elastic fail "
 	var b strings.Builder
-	b.Grow(len(msg) + 5 + len(e.Type) + len(e.Reason))
+	b.Grow(len(msg) + 11 + len(e.Type) + len(e.Reason) + len(e.Cause.Type) + len(e.Cause.Reason))
 	b.WriteString(msg)
 	b.WriteString(strconv.Itoa(e.Status))
 	if e.Type != "" {
-		b.WriteString(":")
+		b.WriteString(": ")
 		b.WriteString(e.Type)
 	}
 	if e.Reason != "" {
-		b.WriteString(":")
+		b.WriteString(": ")
 		b.WriteString(e.Reason)
 	}
+	if e.Cause.Type != "" {
+		b.WriteString(": ")
+		b.WriteString(e.Cause.Type)
+	}
+	if e.Cause.Reason != "" {
+		b.WriteString(": ")
+		b.WriteString(e.Cause.Reason)
+	}
 	return b.String()
 }
@@ -83,8 +91,8 @@ func TranslateError(status int, e *ErrorT) error {
 			Type string
 			Reason string
 		}{
-			e.Cause.Type,
-			e.Cause.Reason,
+			Type: e.Cause.Type,
+			Reason: e.Cause.Reason,
 		},
 	}
 }
diff --git a/internal/pkg/limit/error.go b/internal/pkg/limit/error.go
new file mode 100644
index 000000000..65bea753b
--- /dev/null
+++ b/internal/pkg/limit/error.go
@@ -0,0 +1,51 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package limit
+
+import (
+	"encoding/json"
+	"errors"
+	"net/http"
+
+	"github.com/rs/zerolog/log"
+)
+
+var (
+	ErrRateLimit = errors.New("rate limit")
+	ErrMaxLimit = errors.New("max limit")
+)
+
+// writeError recreates the behaviour of api/error.go.
+// It is defined separately here to stop a circular import +func writeError(w http.ResponseWriter, err error) error { + resp := struct { + Status int `json:"statusCode"` + Error string `json:"error"` + Message string `json:"message"` + }{ + Status: http.StatusTooManyRequests, + Error: "UnknownLimiterError", + Message: "unknown limiter error encountered", + } + switch { + case errors.Is(err, ErrRateLimit): + resp.Error = "RateLimit" + resp.Message = "exceeded the rate limit" + case errors.Is(err, ErrMaxLimit): + resp.Error = "MaxLimit" + resp.Message = "exceeded the max limit" + default: + log.Error().Err(err).Msg("Encountered unknown limiter error") + } + p, wErr := json.Marshal(&resp) + if wErr != nil { + return wErr + } + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.Header().Set("X-Content-Type-Options", "nosniff") + w.WriteHeader(http.StatusTooManyRequests) + _, wErr = w.Write(p) + return wErr +} diff --git a/internal/pkg/limit/error_test.go b/internal/pkg/limit/error_test.go new file mode 100644 index 000000000..829e99e79 --- /dev/null +++ b/internal/pkg/limit/error_test.go @@ -0,0 +1,56 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package limit + +import ( + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestWriteError(t *testing.T) { + tests := []struct { + name string + err error + want string + }{{ + name: "unknown", + err: errors.New("unknown"), + want: "UnknownLimiterError", + }, { + name: "rate limit", + err: ErrRateLimit, + want: "RateLimit", + }, { + name: "max limit", + err: ErrMaxLimit, + want: "MaxLimit", + }} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + w := httptest.NewRecorder() + + err := writeError(w, tt.err) + require.NoError(t, err) + resp := w.Result() + defer resp.Body.Close() + require.Equal(t, http.StatusTooManyRequests, resp.StatusCode) + + var body struct { + Status int `json:"statusCode"` + Error string `json:"error"` + } + dec := json.NewDecoder(resp.Body) + err = dec.Decode(&body) + require.NoError(t, err) + require.Equal(t, http.StatusTooManyRequests, body.Status) + require.Equal(t, tt.want, body.Error) + }) + } +} diff --git a/internal/pkg/limit/httpwrapper.go b/internal/pkg/limit/httpwrapper.go new file mode 100644 index 000000000..5f9860c0b --- /dev/null +++ b/internal/pkg/limit/httpwrapper.go @@ -0,0 +1,65 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package limit + +import ( + "github.com/elastic/fleet-server/v7/internal/pkg/config" + "github.com/julienschmidt/httprouter" + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" +) + +// HTTPWrapper enforces rate limits for each API endpoint. +type HTTPWrapper struct { + checkin *limiter + artifact *limiter + enroll *limiter + ack *limiter + status *limiter + log zerolog.Logger +} + +// Create a new HTTPWrapper using the specified limits. 
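For orientation, the wrapper is wired once per endpoint; a sketch, assuming an httprouter.Router named router, a checkin httprouter.Handle named checkinHandler, and a StatIncer implementation named checkinStats (all placeholders):

wrapper := limit.NewHTTPWrapper(addr, &cfg.Limits) // cfg.Limits: the server's config.ServerLimits
router.POST("/api/fleet/agents/:id/checkin", wrapper.WrapCheckin(checkinHandler, checkinStats))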
+func NewHTTPWrapper(addr string, cfg *config.ServerLimits) *HTTPWrapper {
+	return &HTTPWrapper{
+		checkin: newLimiter(&cfg.CheckinLimit),
+		artifact: newLimiter(&cfg.ArtifactLimit),
+		enroll: newLimiter(&cfg.EnrollLimit),
+		ack: newLimiter(&cfg.AckLimit),
+		status: newLimiter(&cfg.StatusLimit),
+		log: log.With().Str("addr", addr).Logger(),
+	}
+}
+
+// WrapCheckin wraps the checkin handler with the rate limiter and tracks statistics for the endpoint.
+func (l *HTTPWrapper) WrapCheckin(h httprouter.Handle, i StatIncer) httprouter.Handle {
+	return l.checkin.wrap(l.log.With().Str("route", "checkin").Logger(), zerolog.WarnLevel, h, i)
+}
+
+// WrapArtifact wraps the artifact handler with the rate limiter and tracks statistics for the endpoint.
+func (l *HTTPWrapper) WrapArtifact(h httprouter.Handle, i StatIncer) httprouter.Handle {
+	return l.artifact.wrap(l.log.With().Str("route", "artifact").Logger(), zerolog.DebugLevel, h, i)
+}
+
+// WrapEnroll wraps the enroll handler with the rate limiter and tracks statistics for the endpoint.
+func (l *HTTPWrapper) WrapEnroll(h httprouter.Handle, i StatIncer) httprouter.Handle {
+	return l.enroll.wrap(l.log.With().Str("route", "enroll").Logger(), zerolog.DebugLevel, h, i)
+}
+
+// WrapAck wraps the ack handler with the rate limiter and tracks statistics for the endpoint.
+func (l *HTTPWrapper) WrapAck(h httprouter.Handle, i StatIncer) httprouter.Handle {
+	return l.ack.wrap(l.log.With().Str("route", "ack").Logger(), zerolog.DebugLevel, h, i)
+}
+
+// WrapStatus wraps the status handler with the rate limiter and tracks statistics for the endpoint.
+func (l *HTTPWrapper) WrapStatus(h httprouter.Handle, i StatIncer) httprouter.Handle {
+	return l.status.wrap(l.log.With().Str("route", "status").Logger(), zerolog.DebugLevel, h, i)
+}
+
+// StatIncer is the interface used to count statistics associated with an endpoint.
+type StatIncer interface {
+	IncError(error)
+	IncStart() func()
+}
diff --git a/internal/pkg/limit/limiter.go b/internal/pkg/limit/limiter.go
index 05a4a8262..98aabd540 100644
--- a/internal/pkg/limit/limiter.go
+++ b/internal/pkg/limit/limiter.go
@@ -2,38 +2,34 @@
 // or more contributor license agreements. Licensed under the Elastic License;
 // you may not use this file except in compliance with the Elastic License.
 
-// Package limit provides the ability to set the maximum connections that a server should handle
+// Package limit provides the ability to rate limit the api server.
package limit import ( - "errors" + "net/http" "time" "github.com/elastic/fleet-server/v7/internal/pkg/config" + "github.com/julienschmidt/httprouter" + "github.com/rs/zerolog" "golang.org/x/sync/semaphore" "golang.org/x/time/rate" ) -type Limiter struct { +type releaseFunc func() + +type limiter struct { rateLimit *rate.Limiter maxLimit *semaphore.Weighted } -type ReleaseFunc func() - -var ( - ErrRateLimit = errors.New("rate limit") - ErrMaxLimit = errors.New("max limit") -) - -func NewLimiter(cfg *config.Limit) *Limiter { - +func newLimiter(cfg *config.Limit) *limiter { if cfg == nil { - return &Limiter{} + return &limiter{} } - l := &Limiter{} + l := &limiter{} if cfg.Interval != time.Duration(0) { l.rateLimit = rate.NewLimiter(rate.Every(cfg.Interval), cfg.Burst) @@ -46,7 +42,7 @@ func NewLimiter(cfg *config.Limit) *Limiter { return l } -func (l *Limiter) Acquire() (ReleaseFunc, error) { +func (l *limiter) acquire() (releaseFunc, error) { releaseFunc := noop if l.rateLimit != nil && !l.rateLimit.Allow() { @@ -63,11 +59,30 @@ func (l *Limiter) Acquire() (ReleaseFunc, error) { return releaseFunc, nil } -func (l *Limiter) release() { +func (l *limiter) release() { if l.maxLimit != nil { l.maxLimit.Release(1) } } +func (l *limiter) wrap(logger zerolog.Logger, level zerolog.Level, h httprouter.Handle, i StatIncer) httprouter.Handle { + return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) { + dfunc := i.IncStart() + defer dfunc() + + lf, err := l.acquire() + if err != nil { + logger.WithLevel(level).Err(err).Msg("limit reached") + if wErr := writeError(w, err); wErr != nil { + logger.Error().Err(wErr).Msg("fail writing error response") + } + i.IncError(err) + return + } + defer lf() + h(w, r, p) + } +} + func noop() { } diff --git a/internal/pkg/limit/limiter_test.go b/internal/pkg/limit/limiter_test.go new file mode 100644 index 000000000..6c4df66ef --- /dev/null +++ b/internal/pkg/limit/limiter_test.go @@ -0,0 +1,97 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package limit + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/julienschmidt/httprouter" + "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "golang.org/x/sync/semaphore" + "golang.org/x/time/rate" +) + +type mockIncer struct { + mock.Mock +} + +func (m *mockIncer) IncError(err error) { + m.Called(err) +} + +func (m *mockIncer) IncStart() func() { + args := m.Called() + return args.Get(0).(func()) +} + +func stubHandle() httprouter.Handle { + return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) { + w.WriteHeader(http.StatusOK) + } +} + +func TestWrap(t *testing.T) { + t.Run("no limits reached", func(t *testing.T) { + var b bool + var fdec = func() { b = true } + i := &mockIncer{} + i.On("IncStart").Return(fdec).Once() + l := &limiter{} + + h := l.wrap(zerolog.Nop(), zerolog.DebugLevel, stubHandle(), i) + w := httptest.NewRecorder() + h(w, &http.Request{}, httprouter.Params{}) + + resp := w.Result() + resp.Body.Close() + i.AssertExpectations(t) + assert.True(t, b, "expected dec func to have been called") + assert.Equal(t, http.StatusOK, resp.StatusCode) + }) + t.Run("max limit reached", func(t *testing.T) { + var b bool + var fdec = func() { b = true } + i := &mockIncer{} + i.On("IncStart").Return(fdec).Once() + i.On("IncError", ErrMaxLimit).Once() + l := &limiter{ + maxLimit: semaphore.NewWeighted(0), + } + + h := l.wrap(zerolog.Nop(), zerolog.DebugLevel, stubHandle(), i) + w := httptest.NewRecorder() + h(w, &http.Request{}, httprouter.Params{}) + + resp := w.Result() + resp.Body.Close() + i.AssertExpectations(t) + assert.True(t, b, "expected dec func to have been called") + assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) + }) + t.Run("rate limit reached", func(t *testing.T) { + var b bool + var fdec = func() { b = true } + i := &mockIncer{} + i.On("IncStart").Return(fdec).Once() + i.On("IncError", ErrRateLimit).Once() + l := &limiter{ + rateLimit: rate.NewLimiter(rate.Limit(0), 0), + } + + h := l.wrap(zerolog.Nop(), zerolog.DebugLevel, stubHandle(), i) + w := httptest.NewRecorder() + h(w, &http.Request{}, httprouter.Params{}) + + resp := w.Result() + resp.Body.Close() + i.AssertExpectations(t) + assert.True(t, b, "expected dec func to have been called") + assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) + }) +} diff --git a/internal/pkg/model/ext.go b/internal/pkg/model/ext.go index d89787855..4a11bbe08 100644 --- a/internal/pkg/model/ext.go +++ b/internal/pkg/model/ext.go @@ -27,14 +27,36 @@ func (m *Server) SetTime(t time.Time) { } // CheckDifferentVersion returns Agent version if it is different from ver, otherwise return empty string -func (m *Agent) CheckDifferentVersion(ver string) string { - if m == nil { +func (a *Agent) CheckDifferentVersion(ver string) string { + if a == nil { return "" } - if m.Agent == nil || ver != m.Agent.Version { + if a.Agent == nil || ver != a.Agent.Version { return ver } return "" } + +// APIKeyIDs returns all the API keys, the valid, in-use as well as the one +// marked to be retired. 
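Concretely, an agent with one access key, one output key, and one key queued for retirement yields all three IDs (hypothetical values; ordering across outputs follows map iteration and is not guaranteed):

a := &Agent{
	AccessAPIKeyID: "access-id",
	Outputs: map[string]*PolicyOutput{
		"default": {
			APIKeyID:          "output-id",
			ToRetireAPIKeyIds: []ToRetireAPIKeyIdsItems{{ID: "stale-id"}},
		},
	},
}
ids := a.APIKeyIDs() // contains "access-id", "output-id" and "stale-id"
_ = ids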
+func (a *Agent) APIKeyIDs() []string { + if a == nil { + return nil + } + keys := make([]string, 0, len(a.Outputs)+1) + if a.AccessAPIKeyID != "" { + keys = append(keys, a.AccessAPIKeyID) + } + + for _, output := range a.Outputs { + keys = append(keys, output.APIKeyID) + for _, key := range output.ToRetireAPIKeyIds { + keys = append(keys, key.ID) + } + } + + return keys + +} diff --git a/internal/pkg/model/ext_test.go b/internal/pkg/model/ext_test.go index e48194b30..527570270 100644 --- a/internal/pkg/model/ext_test.go +++ b/internal/pkg/model/ext_test.go @@ -2,15 +2,13 @@ // or more contributor license agreements. Licensed under the Elastic License; // you may not use this file except in compliance with the Elastic License. -//go:build !integration -// +build !integration - package model import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" ) func TestAgentGetNewVersion(t *testing.T) { @@ -85,3 +83,54 @@ func TestAgentGetNewVersion(t *testing.T) { }) } } + +func TestAgentAPIKeyIDs(t *testing.T) { + tcs := []struct { + name string + agent Agent + want []string + }{ + { + name: "no API key marked to be retired", + agent: Agent{ + AccessAPIKeyID: "access_api_key_id", + Outputs: map[string]*PolicyOutput{ + "p1": {APIKeyID: "p1_api_key_id"}, + "p2": {APIKeyID: "p2_api_key_id"}, + }, + }, + want: []string{"access_api_key_id", "p1_api_key_id", "p2_api_key_id"}, + }, + { + name: "with API key marked to be retired", + agent: Agent{ + AccessAPIKeyID: "access_api_key_id", + Outputs: map[string]*PolicyOutput{ + "p1": { + APIKeyID: "p1_api_key_id", + ToRetireAPIKeyIds: []ToRetireAPIKeyIdsItems{{ + ID: "p1_to_retire_key", + }}}, + "p2": { + APIKeyID: "p2_api_key_id", + ToRetireAPIKeyIds: []ToRetireAPIKeyIdsItems{{ + ID: "p2_to_retire_key", + }}}, + }, + }, + want: []string{ + "access_api_key_id", "p1_api_key_id", "p2_api_key_id", + "p1_to_retire_key", "p2_to_retire_key"}, + }, + } + + for _, tc := range tcs { + t.Run(tc.name, func(t *testing.T) { + got := tc.agent.APIKeyIDs() + + // if A contains B and B contains A => A = B + assert.Subset(t, tc.want, got) + assert.Subset(t, got, tc.want) + }) + } +} diff --git a/internal/pkg/model/schema.go b/internal/pkg/model/schema.go index 51856533e..ae45c1c0a 100644 --- a/internal/pkg/model/schema.go +++ b/internal/pkg/model/schema.go @@ -127,13 +127,13 @@ type Agent struct { // Elastic Agent components detailed status information Components json.RawMessage `json:"components,omitempty"` - // API key the Elastic Agent uses to authenticate with elasticsearch + // Deprecated. Use Outputs instead. API key the Elastic Agent uses to authenticate with elasticsearch DefaultAPIKey string `json:"default_api_key,omitempty"` - // Default API Key History - DefaultAPIKeyHistory []DefaultAPIKeyHistoryItems `json:"default_api_key_history,omitempty"` + // Deprecated. Use Outputs instead. Default API Key History + DefaultAPIKeyHistory []ToRetireAPIKeyIdsItems `json:"default_api_key_history,omitempty"` - // ID of the API key the Elastic Agent uses to authenticate with elasticsearch + // Deprecated. Use Outputs instead. 
ID of the API key the Elastic Agent uses to authenticate with elasticsearch DefaultAPIKeyID string `json:"default_api_key_id,omitempty"` // Date/time the Elastic Agent enrolled @@ -145,7 +145,7 @@ type Agent struct { // Last checkin message LastCheckinMessage string `json:"last_checkin_message,omitempty"` - // Lst checkin status + // Last checkin status LastCheckinStatus string `json:"last_checkin_status,omitempty"` // Date/time the Elastic Agent was last updated @@ -154,6 +154,9 @@ type Agent struct { // Local metadata information for the Elastic Agent LocalMetadata json.RawMessage `json:"local_metadata,omitempty"` + // Outputs is the policy output data, mapping the output name to its data + Outputs map[string]*PolicyOutput `json:"outputs,omitempty"` + // Packages array Packages []string `json:"packages,omitempty"` @@ -163,7 +166,7 @@ type Agent struct { // The policy ID for the Elastic Agent PolicyID string `json:"policy_id,omitempty"` - // The policy output permissions hash + // Deprecated. Use Outputs instead. The policy output permissions hash PolicyOutputPermissionsHash string `json:"policy_output_permissions_hash,omitempty"` // The current policy revision_idx for the Elastic Agent @@ -193,6 +196,9 @@ type Agent struct { // Date/time the Elastic Agent started the current upgrade UpgradeStartedAt string `json:"upgrade_started_at,omitempty"` + // Upgrade status + UpgradeStatus string `json:"upgrade_status,omitempty"` + // Date/time the Elastic Agent was last upgraded UpgradedAt string `json:"upgraded_at,omitempty"` @@ -257,16 +263,6 @@ type Components struct { type Data struct { } -// DefaultAPIKeyHistoryItems -type DefaultAPIKeyHistoryItems struct { - - // API Key identifier - ID string `json:"id,omitempty"` - - // Date/time the API key was retired - RetiredAt string `json:"retired_at,omitempty"` -} - // EnrollmentAPIKey An Elastic Agent enrollment API key type EnrollmentAPIKey struct { ESDocument @@ -343,6 +339,26 @@ type PolicyLeader struct { Timestamp string `json:"@timestamp,omitempty"` } +// PolicyOutput holds the needed data to manage the output API keys +type PolicyOutput struct { + ESDocument + + // API key the Elastic Agent uses to authenticate with elasticsearch + APIKey string `json:"api_key"` + + // ID of the API key the Elastic Agent uses to authenticate with elasticsearch + APIKeyID string `json:"api_key_id"` + + // The policy output permissions hash + PermissionsHash string `json:"permissions_hash"` + + // API keys to be invalidated on next agent ack + ToRetireAPIKeyIds []ToRetireAPIKeyIdsItems `json:"to_retire_api_key_ids,omitempty"` + + // Type is the output type. Currently only Elasticsearch is supported. 
+ Type string `json:"type"` +} + // Server A Fleet Server type Server struct { ESDocument @@ -364,6 +380,16 @@ type ServerMetadata struct { Version string `json:"version"` } +// ToRetireAPIKeyIdsItems the Output API Keys that were replaced and should be retired +type ToRetireAPIKeyIdsItems struct { + + // API Key identifier + ID string `json:"id,omitempty"` + + // Date/time the API key was retired + RetiredAt string `json:"retired_at,omitempty"` +} + // UserProvidedMetadata User provided metadata information for the Elastic Agent type UserProvidedMetadata struct { } diff --git a/internal/pkg/policy/parsed_policy.go b/internal/pkg/policy/parsed_policy.go index dbf5d3801..029298ef5 100644 --- a/internal/pkg/policy/parsed_policy.go +++ b/internal/pkg/policy/parsed_policy.go @@ -42,7 +42,7 @@ type ParsedPolicy struct { Policy model.Policy Fields map[string]json.RawMessage Roles RoleMapT - Outputs map[string]PolicyOutput + Outputs map[string]Output Default ParsedPolicyDefaults } @@ -91,8 +91,8 @@ func NewParsedPolicy(p model.Policy) (*ParsedPolicy, error) { return pp, nil } -func constructPolicyOutputs(outputsRaw json.RawMessage, roles map[string]RoleT) (map[string]PolicyOutput, error) { - result := make(map[string]PolicyOutput) +func constructPolicyOutputs(outputsRaw json.RawMessage, roles map[string]RoleT) (map[string]Output, error) { + result := make(map[string]Output) outputsMap, err := smap.Parse(outputsRaw) if err != nil { @@ -102,7 +102,7 @@ func constructPolicyOutputs(outputsRaw json.RawMessage, roles map[string]RoleT) for k := range outputsMap { v := outputsMap.GetMap(k) - p := PolicyOutput{ + p := Output{ Name: k, Type: v.GetString(FieldOutputType), } @@ -126,13 +126,13 @@ func parsePerms(permsRaw json.RawMessage) (RoleMapT, error) { // iterate across the keys m := make(RoleMapT, len(permMap)) for k := range permMap { - v := permMap.GetMap(k) if v != nil { var r RoleT // Stable hash on permissions payload + // permission hash created here if r.Sha2, err = v.Hash(); err != nil { return nil, err } diff --git a/internal/pkg/policy/parsed_policy_test.go b/internal/pkg/policy/parsed_policy_test.go index 547cfcf7a..957a24911 100644 --- a/internal/pkg/policy/parsed_policy_test.go +++ b/internal/pkg/policy/parsed_policy_test.go @@ -6,14 +6,12 @@ package policy import ( "encoding/json" - "fmt" "testing" "github.com/elastic/fleet-server/v7/internal/pkg/model" ) func TestNewParsedPolicy(t *testing.T) { - // Run two formatting of the same payload to validate that the sha2 remains the same payloads := []string{ testPolicy, @@ -51,7 +49,7 @@ func TestNewParsedPolicy(t *testing.T) { for _, f := range fields { if _, ok := pp.Fields[f]; !ok { - t.Error(fmt.Sprintf("Missing field %s", f)) + t.Errorf("Missing field %s", f) } } @@ -71,7 +69,7 @@ func TestNewParsedPolicy(t *testing.T) { expectedSha2 := "d4d0840fe28ca4900129a749b56cee729562c0a88c935192c659252b5b0d762a" if defaultOutput.Role.Sha2 != expectedSha2 { - t.Fatal(fmt.Sprintf("Expected sha2: '%s', got '%s'.", expectedSha2, defaultOutput.Role.Sha2)) + t.Fatalf("Expected sha2: '%s', got '%s'.", expectedSha2, defaultOutput.Role.Sha2) } } } @@ -105,7 +103,7 @@ func TestNewParsedPolicyNoES(t *testing.T) { for _, f := range fields { if _, ok := pp.Fields[f]; !ok { - t.Error(fmt.Sprintf("Missing field %s", f)) + t.Errorf("Missing field %s", f) } } diff --git a/internal/pkg/policy/policy_output.go b/internal/pkg/policy/policy_output.go index 8115d22ec..4f5b99ae5 100644 --- a/internal/pkg/policy/policy_output.go +++ 
b/internal/pkg/policy/policy_output.go
@@ -32,118 +32,345 @@ var (
 	ErrFailInjectAPIKey = errors.New("fail inject api key")
 )
 
-type PolicyOutput struct {
+type Output struct {
 	Name string
 	Type string
 	Role *RoleT
 }
 
-func (p *PolicyOutput) Prepare(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, agent *model.Agent, outputMap smap.Map) error {
+// Prepare prepares the output p to be sent to the elastic-agent.
+// The agent might be mutated for an elasticsearch output.
+func (p *Output) Prepare(ctx context.Context, zlog zerolog.Logger, bulker bulk.Bulk, agent *model.Agent, outputMap smap.Map) error {
+	zlog = zlog.With().
+		Str("fleet.agent.id", agent.Id).
+		Str("fleet.policy.output.name", p.Name).Logger()
+
 	switch p.Type {
 	case OutputTypeElasticsearch:
 		zlog.Debug().Msg("preparing elasticsearch output")
+		if err := p.prepareElasticsearch(ctx, zlog, bulker, agent, outputMap); err != nil {
+			return fmt.Errorf("failed to prepare elasticsearch output %q: %w", p.Name, err)
+		}
+	case OutputTypeLogstash:
+		zlog.Debug().Msg("preparing logstash output")
+		zlog.Info().Msg("no actions required for logstash output preparation")
+	default:
+		zlog.Error().Msgf("unknown output type: %s; skipping preparation", p.Type)
+		return fmt.Errorf("encountered unexpected output type while preparing outputs: %s", p.Type)
+	}
+	return nil
+}
 
-	// The role is required to do api key management
-	if p.Role == nil {
-		zlog.Error().Str("name", p.Name).Msg("policy does not contain required output permission section")
-		return ErrNoOutputPerms
+func (p *Output) prepareElasticsearch(
+	ctx context.Context,
+	zlog zerolog.Logger,
+	bulker bulk.Bulk,
+	agent *model.Agent,
+	outputMap smap.Map) error {
+	// The role is required to do api key management
+	if p.Role == nil {
+		zlog.Error().
+			Msg("policy does not contain required output permission section")
+		return ErrNoOutputPerms
+	}
+
+	output, foundOutput := agent.Outputs[p.Name]
+	if !foundOutput {
+		if agent.Outputs == nil {
+			agent.Outputs = map[string]*model.PolicyOutput{}
 		}
-	// Determine whether we need to generate an output ApiKey.
-	// This is accomplished by comparing the sha2 hash stored in the agent
-	// record with the precalculated sha2 hash of the role.
+		zlog.Debug().Msgf("creating agent.Outputs[%s]", p.Name)
+		output = &model.PolicyOutput{}
+		agent.Outputs[p.Name] = output
+	}
+
+	// Determine whether we need to generate an output ApiKey.
+	// This is accomplished by comparing the sha2 hash stored in the corresponding
+	// output in the agent record with the precalculated sha2 hash of the role.
+
+	// Note: This will need to be updated when doing multi-cluster elasticsearch support
+	// Currently, we assume all ES outputs are the same ES fleet-server is connected to.
+	needNewKey := false
+	needUpdateKey := false
+	switch {
+	case output.APIKey == "":
+		zlog.Debug().Msg("must generate api key as default API key is not present")
+		needNewKey = true
+	case p.Role.Sha2 != output.PermissionsHash:
+		// this is actually the OutputPermissionsHash for the default output. The Agent
+		// document on ES does not have OutputPermissionsHash for any other output
+		// besides the default one. It seems to me error-prone to rely on the default
+		// output permissions hash to generate new API keys for other outputs.
+ zlog.Debug().Msg("must generate api key as policy output permissions changed") + needUpdateKey = true + default: + zlog.Debug().Msg("policy output permissions are the same") + } - // Note: This will need to be updated when doing multi-cluster elasticsearch support - // Currently, we only have access to the token for the elasticsearch instance fleet-server - // is monitors. When updating for multiple ES instances we need to tie the token to the output. - needNewKey := true - switch { - case agent.DefaultAPIKey == "": - zlog.Debug().Msg("must generate api key as default API key is not present") - case p.Role.Sha2 != agent.PolicyOutputPermissionsHash: - zlog.Debug().Msg("must generate api key as policy output permissions changed") - default: - needNewKey = false - zlog.Debug().Msg("policy output permissions are the same") + if needUpdateKey { + zlog.Debug(). + RawJSON("roles", p.Role.Raw). + Str("oldHash", output.PermissionsHash). + Str("newHash", p.Role.Sha2). + Msg("Generating a new API key") + + // query current api key for roles so we don't lose permissions in the meantime + currentRoles, err := fetchAPIKeyRoles(ctx, bulker, output.APIKeyID) + if err != nil { + zlog.Error(). + Str("apiKeyID", output.APIKeyID). + Err(err).Msg("fail fetching roles for key") + return err } - if needNewKey { - zlog.Debug(). - RawJSON("roles", p.Role.Raw). - Str("oldHash", agent.PolicyOutputPermissionsHash). - Str("newHash", p.Role.Sha2). - Msg("Generating a new API key") - - outputAPIKey, err := generateOutputAPIKey(ctx, bulker, agent.Id, p.Name, p.Role.Raw) - if err != nil { - zlog.Error().Err(err).Msg("fail generate output key") - return err - } + // merge roles with p.Role + newRoles, err := mergeRoles(zlog, currentRoles, p.Role) + if err != nil { + zlog.Error(). + Str("apiKeyID", output.APIKeyID). + Err(err).Msg("fail merging roles for key") + return err + } - agent.DefaultAPIKey = outputAPIKey.Agent() + // hash provided is only for merging request together and not persisted + err = bulker.APIKeyUpdate(ctx, output.APIKeyID, newRoles.Sha2, newRoles.Raw) + if err != nil { + zlog.Error().Err(err).Msg("fail generate output key") + zlog.Debug().RawJSON("roles", newRoles.Raw).Str("sha", newRoles.Sha2).Err(err).Msg("roles not updated") + return err + } - // When a new keys is generated we need to update the Agent record, - // this will need to be updated when multiples Elasticsearch output - // are used. - zlog.Info(). - Str("hash.sha256", p.Role.Sha2). - Str(logger.DefaultOutputAPIKeyID, outputAPIKey.ID). - Msg("Updating agent record to pick up default output key.") + output.PermissionsHash = p.Role.Sha2 // for the sake of consistency + zlog.Debug(). + Str("hash.sha256", p.Role.Sha2). + Str("roles", string(p.Role.Raw)). 
+ Msg("Updating agent record to pick up most recent roles.") - fields := map[string]interface{}{ - dl.FieldDefaultAPIKey: outputAPIKey.Agent(), - dl.FieldDefaultAPIKeyID: outputAPIKey.ID, - dl.FieldPolicyOutputPermissionsHash: p.Role.Sha2, - } - if agent.DefaultAPIKeyID != "" { - fields[dl.FieldDefaultAPIKeyHistory] = model.DefaultAPIKeyHistoryItems{ - ID: agent.DefaultAPIKeyID, - RetiredAt: time.Now().UTC().Format(time.RFC3339), - } + fields := map[string]interface{}{ + dl.FieldPolicyOutputPermissionsHash: p.Role.Sha2, + } + + // Using painless script to update permission hash for updated key + body, err := renderUpdatePainlessScript(p.Name, fields) + if err != nil { + return err + } + + if err = bulker.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)); err != nil { + zlog.Error().Err(err).Msg("fail update agent record") + return err + } + + } else if needNewKey { + zlog.Debug(). + RawJSON("fleet.policy.roles", p.Role.Raw). + Str("fleet.policy.default.oldHash", output.PermissionsHash). + Str("fleet.policy.default.newHash", p.Role.Sha2). + Msg("Generating a new API key") + + ctx := zlog.WithContext(ctx) + outputAPIKey, err := + generateOutputAPIKey(ctx, bulker, agent.Id, p.Name, p.Role.Raw) + if err != nil { + return fmt.Errorf("failed generate output API key: %w", err) + } + + // When a new keys is generated we need to update the Agent record, + // this will need to be updated when multiples remote Elasticsearch output + // are supported. + zlog.Info(). + Str("fleet.policy.role.hash.sha256", p.Role.Sha2). + Str(logger.DefaultOutputAPIKeyID, outputAPIKey.ID). + Msg("Updating agent record to pick up default output key.") + + fields := map[string]interface{}{ + dl.FieldPolicyOutputAPIKey: outputAPIKey.Agent(), + dl.FieldPolicyOutputAPIKeyID: outputAPIKey.ID, + dl.FieldPolicyOutputPermissionsHash: p.Role.Sha2, + } + + if !foundOutput { + fields[dl.FiledType] = OutputTypeElasticsearch + } + if output.APIKeyID != "" { + fields[dl.FieldPolicyOutputToRetireAPIKeyIDs] = model.ToRetireAPIKeyIdsItems{ + ID: output.APIKeyID, + RetiredAt: time.Now().UTC().Format(time.RFC3339), } + } + + // Using painless script to append the old keys to the history + body, err := renderUpdatePainlessScript(p.Name, fields) + if err != nil { + return fmt.Errorf("could no tupdate painless script: %w", err) + } - // Using painless script to append the old keys to the history - body, err := renderUpdatePainlessScript(fields) + if err = bulker.Update(ctx, dl.FleetAgents, agent.Id, body, bulk.WithRefresh(), bulk.WithRetryOnConflict(3)); err != nil { + zlog.Error().Err(err).Msg("fail update agent record") + return fmt.Errorf("fail update agent record: %w", err) + } + + // Now that all is done, we can update the output on the agent variable + // Right not it's more for consistency and to ensure the in-memory agent + // data is correct and in sync with ES, so it can be safely used after + // this method returns. + output.Type = OutputTypeElasticsearch + output.APIKey = outputAPIKey.Agent() + output.APIKeyID = outputAPIKey.ID + output.PermissionsHash = p.Role.Sha2 // for the sake of consistency + } + + // Always insert the `api_key` as part of the output block, this is required + // because only fleet server knows the api key for the specific agent, if we don't + // add it the agent will not receive the `api_key` and will not be able to connect + // to Elasticsearch. 
+ // + // We need to investigate allocation with the new LS output; we had an optimization + // in place to reduce the number of agent policy allocations when sending the updated + // agent policy to multiple agents. + // See: https://github.com/elastic/fleet-server/issues/1301 + if err := setMapObj(outputMap, output.APIKey, p.Name, "api_key"); err != nil { + return err + } - if err != nil { - return err + return nil +} + +func fetchAPIKeyRoles(ctx context.Context, b bulk.Bulk, apiKeyID string) (*RoleT, error) { + res, err := b.APIKeyRead(ctx, apiKeyID, true) + if err != nil { + return nil, err + } + + roleMap, err := smap.Parse(res.RoleDescriptors) + if err != nil { + return nil, err + } + r := &RoleT{ + Raw: res.RoleDescriptors, + } + + // Stable hash on permissions payload + if r.Sha2, err = roleMap.Hash(); err != nil { + return nil, err + } + + return r, nil +} + +// mergeRoles takes old and new role sets and merges them following these rules: +// - take all new roles +// - append all old roles +// To avoid name collisions every old entry gets a `rdstale` suffix; +// if a rdstale suffix already exists it uses `{index}-rdstale` to avoid further collisions. +// Everything ending with `rdstale` is removed on ack. +// In case we have key `123` in both old and new, the result will be: {"123", "123-0-rdstale"}. +// In case old contains {"123", "123-0-rdstale"} and new contains {"123"}, the result is: {"123", "123-rdstale", "123-0-rdstale"}. +func mergeRoles(zlog zerolog.Logger, old, new *RoleT) (*RoleT, error) { + if old == nil { + return new, nil + } + if new == nil { + return old, nil + } + + oldMap, err := smap.Parse(old.Raw) + if err != nil { + return nil, err + } + if oldMap == nil { + return new, nil + } + + newMap, err := smap.Parse(new.Raw) + if err != nil { + return nil, err + } + if newMap == nil { + return old, nil + } + + destMap := smap.Map{} + // copy all from new + for k, v := range newMap { + destMap[k] = v + } + + findNewKey := func(m smap.Map, candidate string) string { + if strings.HasSuffix(candidate, "-rdstale") { + candidate = strings.TrimSuffix(candidate, "-rdstale") + dashIdx := strings.LastIndex(candidate, "-") + if dashIdx >= 0 { + candidate = candidate[:dashIdx] } - if err = bulker.Update(ctx, dl.FleetAgents, agent.Id, body); err != nil { - zlog.Error().Err(err).Msg("fail update agent record") - return err + } + + // 1 should be enough, 100 is just to have some space + for i := 0; i < 100; i++ { + c := fmt.Sprintf("%s-%d-rdstale", candidate, i) + + if _, exists := m[c]; !exists { + return c } } - // Always insert the `api_key` as part of the output block, this is required - // because only fleet server knows the api key for the specific agent, if we don't - // add it the agent will not receive the `api_key` and will not be able to connect - // to Elasticsearch. - // - // We need to investigate allocation with the new LS output, we had optimization - // in place to reduce number of agent policy allocation when sending the updated - // agent policy to multiple agents. - // See: https://github.com/elastic/fleet-server/issues/1301 - if ok := setMapObj(outputMap, agent.DefaultAPIKey, p.Name, "api_key"); !ok { - return ErrFailInjectAPIKey + return "" + } + // copy old + for k, v := range oldMap { + newKey := findNewKey(destMap, k) + if newKey == "" { + zlog.Warn().Msg("Failed to find a key for role assignment.") + + zlog.Debug(). + RawJSON("roles", new.Raw). + Str("candidate", k).
+ Msg("roles not included.") + + continue } - case OutputTypeLogstash: - zlog.Debug().Msg("preparing logstash output") - zlog.Info().Msg("no actions required for logstash output preparation") - default: - zlog.Error().Msgf("unknown output type: %s; skipping preparation", p.Type) - return fmt.Errorf("encountered unexpected output type while preparing outputs: %s", p.Type) + destMap[newKey] = v } - return nil + + r := &RoleT{} + if r.Sha2, err = destMap.Hash(); err != nil { + return nil, err + } + if r.Raw, err = json.Marshal(destMap); err != nil { + return nil, err + } + + return r, nil } -func renderUpdatePainlessScript(fields map[string]interface{}) ([]byte, error) { +func renderUpdatePainlessScript(outputName string, fields map[string]interface{}) ([]byte, error) { var source strings.Builder + + // prepare agent.elasticsearch_outputs[OUTPUT_NAME] + source.WriteString(fmt.Sprintf(` +if (ctx._source['outputs']==null) + {ctx._source['outputs']=new HashMap();} +if (ctx._source['outputs']['%s']==null) + {ctx._source['outputs']['%s']=new HashMap();} +`, outputName, outputName)) + for field := range fields { - if field == dl.FieldDefaultAPIKeyHistory { - source.WriteString(fmt.Sprint("if (ctx._source.", field, "==null) {ctx._source.", field, "=new ArrayList();} ctx._source.", field, ".add(params.", field, ");")) + if field == dl.FieldPolicyOutputToRetireAPIKeyIDs { + // dl.FieldPolicyOutputToRetireAPIKeyIDs is a special case. + // It's an array that gets deleted when the keys are invalidated. + // Thus, append the old API key ID, create the field if necessary. + source.WriteString(fmt.Sprintf(` +if (ctx._source['outputs']['%s'].%s==null) + {ctx._source['outputs']['%s'].%s=new ArrayList();} +ctx._source['outputs']['%s'].%s.add(params.%s); +`, outputName, field, outputName, field, outputName, field, field)) } else { - source.WriteString(fmt.Sprint("ctx._source.", field, "=", "params.", field, ";")) + // Update the other fields + source.WriteString(fmt.Sprintf(` +ctx._source['outputs']['%s'].%s=params.%s;`, + outputName, field, field)) } } @@ -158,36 +385,45 @@ func renderUpdatePainlessScript(fields map[string]interface{}) ([]byte, error) { return body, err } -func generateOutputAPIKey(ctx context.Context, bulk bulk.Bulk, agentID, outputName string, roles []byte) (*apikey.APIKey, error) { +func generateOutputAPIKey( + ctx context.Context, + bulk bulk.Bulk, + agentID, + outputName string, + roles []byte) (*apikey.APIKey, error) { name := fmt.Sprintf("%s:%s", agentID, outputName) + zerolog.Ctx(ctx).Info().Msgf("generating output API key %s for agent ID %s", + name, agentID) return bulk.APIKeyCreate( ctx, name, "", roles, - apikey.NewMetadata(agentID, apikey.TypeOutput), + apikey.NewMetadata(agentID, outputName, apikey.TypeOutput), ) } -func setMapObj(obj map[string]interface{}, val interface{}, keys ...string) bool { +func setMapObj(obj map[string]interface{}, val interface{}, keys ...string) error { if len(keys) == 0 { - return false + return fmt.Errorf("no key to be updated: %w", ErrFailInjectAPIKey) } for _, k := range keys[:len(keys)-1] { v, ok := obj[k] if !ok { - return false + return fmt.Errorf("no key %q not present on MapObj: %w", + k, ErrFailInjectAPIKey) } obj, ok = v.(map[string]interface{}) if !ok { - return false + return fmt.Errorf("cannot cast %T to map[string]interface{}: %w", + obj, ErrFailInjectAPIKey) } } k := keys[len(keys)-1] obj[k] = val - return true + return nil } diff --git a/internal/pkg/policy/policy_output_integration_test.go 
b/internal/pkg/policy/policy_output_integration_test.go new file mode 100644 index 000000000..5c8a254b8 --- /dev/null +++ b/internal/pkg/policy/policy_output_integration_test.go @@ -0,0 +1,200 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build integration + +package policy + +import ( + "context" + "encoding/json" + "testing" + "time" + + "github.com/gofrs/uuid" + "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" + "github.com/elastic/fleet-server/v7/internal/pkg/dl" + "github.com/elastic/fleet-server/v7/internal/pkg/model" + "github.com/elastic/fleet-server/v7/internal/pkg/smap" + ftesting "github.com/elastic/fleet-server/v7/internal/pkg/testing" +) + +func TestRenderUpdatePainlessScript(t *testing.T) { + tts := []struct { + name string + + existingToRetireAPIKeyIds []model.ToRetireAPIKeyIdsItems + }{ + { + name: "to_retire_api_key_ids is empty", + }, + { + name: "to_retire_api_key_ids is not empty", + existingToRetireAPIKeyIds: []model.ToRetireAPIKeyIdsItems{{ + ID: "pre_existing_ID", RetiredAt: "pre_existing__RetiredAt"}}, + }, + } + + for _, tt := range tts { + t.Run(tt.name, func(t *testing.T) { + outputPermissionSha := "new_permissionSHA_" + tt.name + outputName := "output_" + tt.name + outputAPIKey := bulk.APIKey{ID: "new_ID", Key: "new-key"} + + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, dl.FleetAgents) + + now := time.Now().UTC() + nowStr := now.Format(time.RFC3339) + + agentID := uuid.Must(uuid.NewV4()).String() + policyID := uuid.Must(uuid.NewV4()).String() + + previousAPIKey := bulk.APIKey{ + ID: "old_" + outputAPIKey.ID, + Key: "old_" + outputAPIKey.Key, + } + + wantOutputs := map[string]*model.PolicyOutput{ + outputName: { + APIKey: outputAPIKey.Agent(), + APIKeyID: outputAPIKey.ID, + PermissionsHash: outputPermissionSha, + Type: OutputTypeElasticsearch, + ToRetireAPIKeyIds: append(tt.existingToRetireAPIKeyIds, + model.ToRetireAPIKeyIdsItems{ + ID: previousAPIKey.ID, RetiredAt: nowStr}), + }, + } + + agentModel := model.Agent{ + PolicyID: policyID, + Active: true, + LastCheckin: nowStr, + LastCheckinStatus: "", + UpdatedAt: nowStr, + EnrolledAt: nowStr, + Outputs: map[string]*model.PolicyOutput{ + outputName: { + Type: OutputTypeElasticsearch, + APIKey: previousAPIKey.Agent(), + APIKeyID: previousAPIKey.ID, + PermissionsHash: "old_" + outputPermissionSha, + }, + }, + } + if tt.existingToRetireAPIKeyIds != nil { + agentModel.Outputs[outputName].ToRetireAPIKeyIds = + tt.existingToRetireAPIKeyIds + } + + body, err := json.Marshal(agentModel) + require.NoError(t, err) + + _, err = bulker.Create( + context.Background(), index, agentID, body, bulk.WithRefresh()) + require.NoError(t, err) + + fields := map[string]interface{}{ + dl.FieldPolicyOutputAPIKey: outputAPIKey.Agent(), + dl.FieldPolicyOutputAPIKeyID: outputAPIKey.ID, + dl.FieldPolicyOutputPermissionsHash: outputPermissionSha, + dl.FieldPolicyOutputToRetireAPIKeyIDs: model.ToRetireAPIKeyIdsItems{ + ID: previousAPIKey.ID, RetiredAt: nowStr}, + } + + got, err := renderUpdatePainlessScript(outputName, fields) + require.NoError(t, err, "renderUpdatePainlessScript returned an unexpected error") + + err = 
bulker.Update(context.Background(), dl.FleetAgents, agentID, got) + require.NoError(t, err, "bulker.Update failed") + + // there is some refresh thing that needs time; I didn't manage to find + // how to fix it at the request-to-ES level, thus this timeout here. + time.Sleep(time.Second) + + gotAgent, err := dl.FindAgent( + context.Background(), bulker, dl.QueryAgentByID, dl.FieldID, agentID, dl.WithIndexName(index)) + require.NoError(t, err) + + assert.Equal(t, agentID, gotAgent.Id) + assert.Len(t, gotAgent.Outputs, len(wantOutputs)) + assert.Equal(t, wantOutputs, gotAgent.Outputs) + }) + } +} + +func TestPolicyOutputESPrepareRealES(t *testing.T) { + index, bulker := ftesting.SetupCleanIndex(context.Background(), t, dl.FleetAgents) + + agentID := createAgent(t, index, bulker) + agent, err := dl.FindAgent( + context.Background(), bulker, dl.QueryAgentByID, dl.FieldID, agentID, dl.WithIndexName(index)) + if err != nil { + require.NoError(t, err, "failed to find agent ID %q", agentID) + } + + output := Output{ + Type: OutputTypeElasticsearch, + Name: "test output", + Role: &RoleT{ + Sha2: "new-hash", + Raw: TestPayload, + }, + } + policyMap := smap.Map{ + "test output": map[string]interface{}{}, + } + + err = output.prepareElasticsearch( + context.Background(), zerolog.Nop(), bulker, &agent, policyMap) + require.NoError(t, err) + + // need to wait a bit before querying the agent again + // TODO: find a better way to query the updated agent + time.Sleep(time.Second) + + got, err := dl.FindAgent( + context.Background(), bulker, dl.QueryAgentByID, dl.FieldID, agentID, dl.WithIndexName(index)) + if err != nil { + require.NoError(t, err, "failed to find agent ID %q", agentID) + } + + gotOutput, ok := got.Outputs[output.Name] + require.True(t, ok, "no '%s' output found on agent document", output.Name) + + assert.Empty(t, gotOutput.ToRetireAPIKeyIds) + assert.Equal(t, gotOutput.Type, OutputTypeElasticsearch) + assert.Equal(t, gotOutput.PermissionsHash, output.Role.Sha2) + assert.NotEmpty(t, gotOutput.APIKey) + assert.NotEmpty(t, gotOutput.APIKeyID) +} + +func createAgent(t *testing.T, index string, bulker bulk.Bulk) string { + const nowStr = "2022-08-12T16:50:05Z" + + agentID := uuid.Must(uuid.NewV4()).String() + policyID := uuid.Must(uuid.NewV4()).String() + + agentModel := model.Agent{ + PolicyID: policyID, + Active: true, + LastCheckin: nowStr, + LastCheckinStatus: "", + UpdatedAt: nowStr, + EnrolledAt: nowStr, + } + + body, err := json.Marshal(agentModel) + require.NoError(t, err) + + _, err = bulker.Create( + context.Background(), index, agentID, body, bulk.WithRefresh()) + require.NoError(t, err) + + return agentID +} diff --git a/internal/pkg/policy/policy_output_test.go b/internal/pkg/policy/policy_output_test.go index be8f8105d..f74d57b3e 100644 --- a/internal/pkg/policy/policy_output_test.go +++ b/internal/pkg/policy/policy_output_test.go @@ -8,6 +8,7 @@ import ( "context" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -23,7 +24,7 @@ var TestPayload []byte func TestPolicyLogstashOutputPrepare(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - po := PolicyOutput{ + po := Output{ Type: OutputTypeLogstash, Name: "test output", Role: &RoleT{ @@ -39,7 +40,7 @@ func TestPolicyLogstashOutputPrepare(t *testing.T) { func TestPolicyLogstashOutputPrepareNoRole(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - po := PolicyOutput{ + po
:= Output{ Type: OutputTypeLogstash, Name: "test output", Role: nil, @@ -54,7 +55,7 @@ func TestPolicyLogstashOutputPrepareNoRole(t *testing.T) { func TestPolicyDefaultLogstashOutputPrepare(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - po := PolicyOutput{ + po := Output{ Type: OutputTypeLogstash, Name: "test output", Role: &RoleT{ @@ -71,7 +72,7 @@ func TestPolicyDefaultLogstashOutputPrepare(t *testing.T) { func TestPolicyESOutputPrepareNoRole(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - po := PolicyOutput{ + po := Output{ Type: OutputTypeElasticsearch, Name: "test output", Role: nil, @@ -86,8 +87,11 @@ func TestPolicyOutputESPrepare(t *testing.T) { t.Run("Permission hash == Agent Permission Hash no need to regenerate the key", func(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() + + apiKey := bulk.APIKey{ID: "test_id_existing", Key: "existing-key"} + hashPerm := "abc123" - po := PolicyOutput{ + output := Output{ Type: OutputTypeElasticsearch, Name: "test output", Role: &RoleT{ @@ -101,29 +105,64 @@ func TestPolicyOutputESPrepare(t *testing.T) { } testAgent := &model.Agent{ - DefaultAPIKey: "test_id:EXISTING-KEY", - PolicyOutputPermissionsHash: hashPerm, + Outputs: map[string]*model.PolicyOutput{ + output.Name: { + ESDocument: model.ESDocument{}, + APIKey: apiKey.Agent(), + ToRetireAPIKeyIds: nil, + APIKeyID: apiKey.ID, + PermissionsHash: hashPerm, + Type: OutputTypeElasticsearch, + }, + }, } - err := po.Prepare(context.Background(), logger, bulker, testAgent, policyMap) + err := output.Prepare(context.Background(), logger, bulker, testAgent, policyMap) require.NoError(t, err, "expected prepare to pass") - key, ok := policyMap.GetMap("test output")["api_key"].(string) + key, ok := policyMap.GetMap(output.Name)["api_key"].(string) + gotOutput := testAgent.Outputs[output.Name] - require.True(t, ok, "unable to case api key") - require.Equal(t, testAgent.DefaultAPIKey, key) - bulker.AssertNotCalled(t, "Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) - bulker.AssertNotCalled(t, "APIKeyCreate", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) + require.True(t, ok, "api key not present on policy map") + assert.Equal(t, apiKey.Agent(), key) + + assert.Equal(t, apiKey.Agent(), gotOutput.APIKey) + assert.Equal(t, apiKey.ID, gotOutput.APIKeyID) + assert.Equal(t, output.Role.Sha2, gotOutput.PermissionsHash) + assert.Equal(t, output.Type, gotOutput.Type) + assert.Empty(t, gotOutput.ToRetireAPIKeyIds) + + // Old model must always remain empty + assert.Empty(t, testAgent.DefaultAPIKey) + assert.Empty(t, testAgent.DefaultAPIKeyID) + assert.Empty(t, testAgent.DefaultAPIKeyHistory) + assert.Empty(t, testAgent.PolicyOutputPermissionsHash) + + bulker.AssertNotCalled(t, "Update", + mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) + bulker.AssertNotCalled(t, "APIKeyCreate", + mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) bulker.AssertExpectations(t) }) - t.Run("Permission hash != Agent Permission Hash need to regenerate the key", func(t *testing.T) { + t.Run("Permission hash != Agent Permission Hash need to regenerate permissions", func(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - bulker.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() - bulker.On("APIKeyCreate", mock.Anything, mock.Anything, 
mock.Anything, mock.Anything, mock.Anything).Return(&bulk.APIKey{ID: "abc", Key: "new-key"}, nil).Once() //nolint:govet // test case - po := PolicyOutput{ + oldAPIKey := bulk.APIKey{ID: "test_id", Key: "EXISTING-KEY"} + wantAPIKey := bulk.APIKey{ID: "test_id", Key: "EXISTING-KEY"} + hashPerm := "old-HASH" + + bulker. + On("APIKeyRead", mock.Anything, mock.Anything, mock.Anything). + Return(&bulk.APIKeyMetadata{ID: "test_id", RoleDescriptors: TestPayload}, nil). + Once() + bulker.On("Update", + mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything). + Return(nil).Once() + bulker.On("APIKeyUpdate", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + + output := Output{ Type: OutputTypeElasticsearch, Name: "test output", Role: &RoleT{ @@ -137,27 +176,55 @@ func TestPolicyOutputESPrepare(t *testing.T) { } testAgent := &model.Agent{ - DefaultAPIKey: "test_id:EXISTING-KEY", - PolicyOutputPermissionsHash: "old-HASH", + Outputs: map[string]*model.PolicyOutput{ + output.Name: { + ESDocument: model.ESDocument{}, + APIKey: oldAPIKey.Agent(), + ToRetireAPIKeyIds: nil, + APIKeyID: oldAPIKey.ID, + PermissionsHash: hashPerm, + Type: OutputTypeElasticsearch, + }, + }, } - err := po.Prepare(context.Background(), logger, bulker, testAgent, policyMap) + err := output.Prepare(context.Background(), logger, bulker, testAgent, policyMap) require.NoError(t, err, "expected prepare to pass") - key, ok := policyMap.GetMap("test output")["api_key"].(string) + key, ok := policyMap.GetMap(output.Name)["api_key"].(string) + gotOutput := testAgent.Outputs[output.Name] require.True(t, ok, "unable to case api key") - require.Equal(t, "abc:new-key", key) + require.Equal(t, wantAPIKey.Agent(), key) + + assert.Equal(t, wantAPIKey.Agent(), gotOutput.APIKey) + assert.Equal(t, wantAPIKey.ID, gotOutput.APIKeyID) + assert.Equal(t, output.Role.Sha2, gotOutput.PermissionsHash) + assert.Equal(t, output.Type, gotOutput.Type) + + // assert.Contains(t, gotOutput.ToRetireAPIKeyIds, oldAPIKey.ID) // TODO: assert on bulker.Update + + // Old model must always remain empty + assert.Empty(t, testAgent.DefaultAPIKey) + assert.Empty(t, testAgent.DefaultAPIKeyID) + assert.Empty(t, testAgent.DefaultAPIKeyHistory) + assert.Empty(t, testAgent.PolicyOutputPermissionsHash) + bulker.AssertExpectations(t) }) t.Run("Generate API Key on new Agent", func(t *testing.T) { logger := testlog.SetLogger(t) bulker := ftesting.NewMockBulk() - bulker.On("Update", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() - bulker.On("APIKeyCreate", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(&bulk.APIKey{ID: "abc", Key: "new-key"}, nil).Once() //nolint:govet // test case - - po := PolicyOutput{ + bulker.On("Update", + mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything). + Return(nil).Once() + apiKey := bulk.APIKey{ID: "abc", Key: "new-key"} + bulker.On("APIKeyCreate", + mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything). 
+ Return(&apiKey, nil).Once() + + output := Output{ Type: OutputTypeElasticsearch, Name: "test output", Role: &RoleT{ @@ -170,15 +237,29 @@ func TestPolicyOutputESPrepare(t *testing.T) { "test output": map[string]interface{}{}, } - testAgent := &model.Agent{} + testAgent := &model.Agent{Outputs: map[string]*model.PolicyOutput{}} - err := po.Prepare(context.Background(), logger, bulker, testAgent, policyMap) + err := output.Prepare(context.Background(), logger, bulker, testAgent, policyMap) require.NoError(t, err, "expected prepare to pass") - key, ok := policyMap.GetMap("test output")["api_key"].(string) + key, ok := policyMap.GetMap(output.Name)["api_key"].(string) + gotOutput := testAgent.Outputs[output.Name] require.True(t, ok, "unable to case api key") - require.Equal(t, "abc:new-key", key) + assert.Equal(t, apiKey.Agent(), key) + + assert.Equal(t, apiKey.Agent(), gotOutput.APIKey) + assert.Equal(t, apiKey.ID, gotOutput.APIKeyID) + assert.Equal(t, output.Role.Sha2, gotOutput.PermissionsHash) + assert.Equal(t, output.Type, gotOutput.Type) + assert.Empty(t, gotOutput.ToRetireAPIKeyIds) + + // Old model must always remain empty + assert.Empty(t, testAgent.DefaultAPIKey) + assert.Empty(t, testAgent.DefaultAPIKeyID) + assert.Empty(t, testAgent.DefaultAPIKeyHistory) + assert.Empty(t, testAgent.PolicyOutputPermissionsHash) + bulker.AssertExpectations(t) }) } diff --git a/internal/pkg/policy/self.go b/internal/pkg/policy/self.go index 295879e5e..da1c40887 100644 --- a/internal/pkg/policy/self.go +++ b/internal/pkg/policy/self.go @@ -228,7 +228,6 @@ func (m *selfMonitorT) updateState(ctx context.Context) (client.UnitState, error if err != nil { return client.UnitStateFailed, err } - tokens = filterActiveTokens(tokens) if len(tokens) == 0 { // no tokens created for the policy, still starting if m.policyID == "" { @@ -271,13 +270,3 @@ func (d *policyData) HasType(val string) bool { func findEnrollmentAPIKeys(ctx context.Context, bulker bulk.Bulk, policyID string) ([]model.EnrollmentAPIKey, error) { return dl.FindEnrollmentAPIKeys(ctx, bulker, dl.QueryEnrollmentAPIKeyByPolicyID, dl.FieldPolicyID, policyID) } - -func filterActiveTokens(tokens []model.EnrollmentAPIKey) []model.EnrollmentAPIKey { - active := make([]model.EnrollmentAPIKey, 0, len(tokens)) - for _, t := range tokens { - if t.Active { - active = append(active, t) - } - } - return active -} diff --git a/internal/pkg/policy/self_test.go b/internal/pkg/policy/self_test.go index e87ba56b5..88a44b39a 100644 --- a/internal/pkg/policy/self_test.go +++ b/internal/pkg/policy/self_test.go @@ -262,21 +262,6 @@ func TestSelfMonitor_DefaultPolicy_Degraded(t *testing.T) { t.Fatal(err) } - // add inactive token that should be filtered out - inactiveToken := model.EnrollmentAPIKey{ - ESDocument: model.ESDocument{ - Id: xid.New().String(), - }, - Active: false, - APIKey: "d2JndlFIWUJJUVVxWDVia2NJTV86X0d6ZmljZGNTc1d4R1otbklrZFFRZw==", - APIKeyID: xid.New().String(), - Name: "Inactive", - PolicyID: policyID, - } - tokenLock.Lock() - tokenResult = append(tokenResult, inactiveToken) - tokenLock.Unlock() - go func() { chHitT <- []es.HitT{{ ID: rId, @@ -578,21 +563,6 @@ func TestSelfMonitor_SpecificPolicy_Degraded(t *testing.T) { t.Fatal(err) } - // add inactive token that should be filtered out - inactiveToken := model.EnrollmentAPIKey{ - ESDocument: model.ESDocument{ - Id: xid.New().String(), - }, - Active: false, - APIKey: "d2JndlFIWUJJUVVxWDVia2NJTV86X0d6ZmljZGNTc1d4R1otbklrZFFRZw==", - APIKeyID: xid.New().String(), - Name: "Inactive", - PolicyID: 
policyID, - } - tokenLock.Lock() - tokenResult = append(tokenResult, inactiveToken) - tokenLock.Unlock() - go func() { chHitT <- []es.HitT{{ ID: rId, diff --git a/internal/pkg/server/agent.go b/internal/pkg/server/agent.go new file mode 100644 index 000000000..dcfddf807 --- /dev/null +++ b/internal/pkg/server/agent.go @@ -0,0 +1,389 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package server + +import ( + "context" + "errors" + "fmt" + "github.com/elastic/fleet-server/v7/internal/pkg/sleep" + "github.com/elastic/fleet-server/v7/internal/pkg/state" + "io" + "sync" + "time" + + "github.com/elastic/elastic-agent-client/v7/pkg/client" + "github.com/elastic/fleet-server/v7/internal/pkg/build" + "github.com/elastic/fleet-server/v7/internal/pkg/config" + "github.com/elastic/fleet-server/v7/internal/pkg/reload" + "github.com/elastic/go-ucfg" + "github.com/rs/zerolog/log" +) + +const ( + kAgentModeRestartLoopDelay = 2 * time.Second + + kFleetServer = "fleet-server" + kElasticsearch = "elasticsearch" + + kStopped = "Stopped" +) + +type firstCfg struct { + cfg *config.Config + err error +} + +// Agent is a fleet-server that runs under the elastic-agent. +// An Agent instance will retrieve connection information from the passed reader (normally stdin). +// Agent uses client.StateInterface to gather config data and manage its lifecycle. +type Agent struct { + cliCfg *ucfg.Config + bi build.Info + reloadables []reload.Reloadable + + agent client.V2 + + outputUnit *client.Unit + inputUnit *client.Unit + + srv *Fleet + srvCtx context.Context + srvCanceller context.CancelFunc + srvDone chan bool +} + +// NewAgent returns an Agent that will gather connection information from the passed reader. +func NewAgent(cliCfg *ucfg.Config, reader io.Reader, bi build.Info, reloadables ...reload.Reloadable) (*Agent, error) { + var err error + + a := &Agent{ + cliCfg: cliCfg, + bi: bi, + reloadables: reloadables, + } + a.agent, _, err = client.NewV2FromReader(reader, client.VersionInfo{ + Name: kFleetServer, + Version: bi.Version, + Meta: map[string]string{ + "commit": bi.Commit, + "build_time": bi.BuildTime.String(), + }, + }) + if err != nil { + return nil, err + } + return a, nil +} + +// Run starts a Server instance using config from the configured client.
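+// Run blocks until ctx is cancelled: it dispatches unit added/modified/removed +// events coming from the Elastic Agent, logs client errors, and polls once per +// second until the Elastic Agent reports an agent ID, at which point it calls +// reconfigure so the running server picks the ID up.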
+func (a *Agent) Run(ctx context.Context) error { + subCtx, subCanceller := context.WithCancel(ctx) + defer subCanceller() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + + t := time.NewTicker(1 * time.Second) + defer t.Stop() + for { + select { + case <-subCtx.Done(): + return + case err := <-a.agent.Errors(): + if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.EOF) { + log.Error().Err(err).Msg("elastic-agent client error") + } + case change := <-a.agent.UnitChanges(): + switch change.Type { + case client.UnitChangedAdded: + err := a.unitAdded(subCtx, change.Unit) + if err != nil { + log.Error().Str("unit", change.Unit.ID()).Err(err).Msg("failed to add unit") + _ = change.Unit.UpdateState(client.UnitStateFailed, err.Error(), nil) + } + case client.UnitChangedModified: + err := a.unitModified(subCtx, change.Unit) + if err != nil { + log.Error().Str("unit", change.Unit.ID()).Err(err).Msg("failed to modify unit") + _ = change.Unit.UpdateState(client.UnitStateFailed, err.Error(), nil) + } + case client.UnitChangedRemoved: + a.unitRemoved(change.Unit) + } + case <-t.C: + // Fleet Server is the only component that gets started by Elastic Agent without an Agent ID. We loop + // here on an interval waiting for the Elastic Agent to enroll, so that the Agent ID gets set. + agentInfo := a.agent.AgentInfo() + if agentInfo != nil && agentInfo.ID != "" { + // Agent ID is now set for the component. + t.Stop() + err := a.reconfigure(subCtx) + if err != nil { + log.Error().Err(err).Msg("failed to reconfigure fleet-server") + } + } + } + } + }() + + log.Info().Msg("starting communication connection back to Elastic Agent") + err := a.agent.Start(subCtx) + if err != nil { + return err + } + + <-subCtx.Done() + wg.Wait() + + return nil +} + +// UpdateState updates the state of the message and payload. +func (a *Agent) UpdateState(state client.UnitState, message string, payload map[string]interface{}) error { + if a.inputUnit != nil { + _ = a.inputUnit.UpdateState(state, message, payload) + } + if a.outputUnit != nil { + _ = a.outputUnit.UpdateState(state, message, payload) + } + return nil +} + +func (a *Agent) unitAdded(ctx context.Context, unit *client.Unit) error { + if unit.Type() == client.UnitTypeInput { + _, _, cfg := unit.Expected() + if cfg.Type != kFleetServer { + // unsupported input type + _ = unit.UpdateState(client.UnitStateFailed, fmt.Sprintf("%s is an unsupported input type", cfg.Type), nil) + return nil + } + if a.inputUnit != nil { + // original input unit is being stopped; swapping in this unit as the new input unit + _ = a.inputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) + } + a.inputUnit = unit + if a.outputUnit == nil { + // waiting for output unit to really start Fleet Server + _ = unit.UpdateState(client.UnitStateStarting, "waiting for output unit", nil) + return nil + } + return a.start(ctx) + } + if unit.Type() == client.UnitTypeOutput { + _, _, cfg := unit.Expected() + if cfg.Type != kElasticsearch { + // unsupported output type + _ = unit.UpdateState(client.UnitStateFailed, fmt.Sprintf("%s is an unsupported output type", cfg.Type), nil) + return nil + } + if a.outputUnit != nil { + // original output unit is being stopped; swapping in this unit as the new output unit + _ = a.outputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) + } + a.outputUnit = unit + if a.inputUnit == nil { + // waiting for input unit to really start Fleet Server + _ = unit.UpdateState(client.UnitStateStarting, "waiting for input unit", nil) + return nil + } + return a.start(ctx) + } + return fmt.Errorf("unknown unit type %v", unit.Type()) +} + +func (a *Agent)
unitModified(ctx context.Context, unit *client.Unit) error { + state, _, _ := unit.Expected() + if unit.Type() == client.UnitTypeInput { + if a.inputUnit != unit { + // not our input unit; would have been marked failed in unitAdded; do nothing + return nil + } + if state == client.UnitStateHealthy { + if a.outputUnit == nil { + // still no output unit; would have been marked starting already; do nothing + return nil + } + + // configuration modified (should still be running) + return a.reconfigure(ctx) + } else if state == client.UnitStateStopped { + // unit should be stopped + a.stop() + return nil + } + return fmt.Errorf("unknown unit state %v", state) + } + if unit.Type() == client.UnitTypeOutput { + if a.outputUnit != unit { + // not our output unit; would have been marked failed in unitAdded; do nothing + return nil + } + if state == client.UnitStateHealthy { + if a.inputUnit == nil { + // still no input unit; would have been marked starting already; do nothing + return nil + } + + // configuration modified (should still be running) + return a.reconfigure(ctx) + } else if state == client.UnitStateStopped { + // unit should be stopped + a.stop() + return nil + } + return fmt.Errorf("unknown unit state %v", state) + } + return fmt.Errorf("unknown unit type %v", unit.Type()) +} + +func (a *Agent) unitRemoved(unit *client.Unit) { + stop := false + if a.inputUnit == unit || a.outputUnit == unit { + stop = true + } + if stop { + a.stop() + } + if a.inputUnit == unit { + a.inputUnit = nil + } + if a.outputUnit == unit { + a.outputUnit = nil + } +} + +func (a *Agent) start(ctx context.Context) error { + if a.srv != nil { + return a.reconfigure(ctx) + } + + cfg, err := a.configFromUnits() + if err != nil { + return err + } + + // reload the generic reloadables + for _, r := range a.reloadables { + err = r.Reload(ctx, cfg) + if err != nil { + return err + } + } + + srvDone := make(chan bool) + srvCtx, srvCanceller := context.WithCancel(ctx) + srv, err := NewFleet(a.bi, state.NewChained(state.NewLog(), a)) + if err != nil { + close(srvDone) + srvCanceller() + return err + } + + go func() { + defer close(srvDone) + for { + err := srv.Run(srvCtx, cfg) + if err == nil || errors.Is(err, context.Canceled) { + return + } + // sleep some before calling Run again + _ = sleep.WithContext(srvCtx, kAgentModeRestartLoopDelay) + } + }() + + a.srv = srv + a.srvCtx = srvCtx + a.srvCanceller = srvCanceller + a.srvDone = srvDone + return nil +} + +func (a *Agent) reconfigure(ctx context.Context) error { + if a.srv == nil { + return a.start(ctx) + } + + cfg, err := a.configFromUnits() + if err != nil { + return err + } + + // reload the generic reloadables + for _, r := range a.reloadables { + err = r.Reload(ctx, cfg) + if err != nil { + return err + } + } + + return a.srv.Reload(ctx, cfg) +} + +func (a *Agent) stop() { + if a.srvCanceller == nil { + return + } + + canceller := a.srvCanceller + a.srvCanceller = nil + a.srvCtx = nil + a.srv = nil + canceller() + <-a.srvDone + a.srvDone = nil + + if a.inputUnit != nil { + _ = a.inputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) + } + if a.outputUnit != nil { + _ = a.outputUnit.UpdateState(client.UnitStateStopped, kStopped, nil) + } +} + +// configFromUnits takes both inputUnit and outputUnit and creates a single configuration just as if fleet-server were
+func (a *Agent) configFromUnits() (*config.Config, error) { + agentID := "" + agentVersion := "" + agentInfo := a.agent.AgentInfo() + if agentInfo != nil { + agentID = agentInfo.ID + agentVersion = agentInfo.Version + } + _, inputLevel, inputCfg := a.inputUnit.Expected() + _, outputLevel, outputCfg := a.outputUnit.Expected() + logLevel := inputLevel + if outputLevel > logLevel { + logLevel = outputLevel + } + + cfgData, err := ucfg.NewFrom(map[string]interface{}{ + "fleet": map[string]interface{}{ + "agent": map[string]interface{}{ + "id": agentID, + "version": agentVersion, + "logging": map[string]interface{}{ + "level": logLevel.String(), + }, + }, + }, + "output": map[string]interface{}{ + "elasticsearch": outputCfg.Source.AsMap(), + }, + "inputs": []interface{}{ + inputCfg.Source.AsMap(), + }, + "logging": map[string]interface{}{ + "level": logLevel.String(), + }, + }) + if err != nil { + return nil, err + } + return config.FromConfig(cfgData) +} diff --git a/cmd/fleet/main_integration_test.go b/internal/pkg/server/agent_integration_test.go similarity index 99% rename from cmd/fleet/main_integration_test.go rename to internal/pkg/server/agent_integration_test.go index 27d4c183a..00862f4a8 100644 --- a/cmd/fleet/main_integration_test.go +++ b/internal/pkg/server/agent_integration_test.go @@ -5,7 +5,7 @@ //go:build integration // +build integration -package fleet +package server import ( "context" @@ -55,7 +55,7 @@ var policyData = []byte(` } `) -func TestAgentMode(t *testing.T) { +func TestAgent(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -106,7 +106,7 @@ func TestAgentMode(t *testing.T) { go func() { defer wg.Done() - a := &AgentMode{ + a := &Agent{ cliCfg: ucfg.New(), bi: biInfo, } diff --git a/internal/pkg/server/fleet.go b/internal/pkg/server/fleet.go new file mode 100644 index 000000000..27ada184d --- /dev/null +++ b/internal/pkg/server/fleet.go @@ -0,0 +1,585 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package server + +import ( + "context" + "errors" + "fmt" + "github.com/elastic/elastic-agent-client/v7/pkg/client" + "github.com/elastic/fleet-server/v7/internal/pkg/state" + "net/url" + "os" + "reflect" + "runtime/debug" + "time" + + "go.elastic.co/apm" + apmtransport "go.elastic.co/apm/transport" + + "github.com/elastic/fleet-server/v7/internal/pkg/action" + "github.com/elastic/fleet-server/v7/internal/pkg/api" + "github.com/elastic/fleet-server/v7/internal/pkg/build" + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" + "github.com/elastic/fleet-server/v7/internal/pkg/cache" + "github.com/elastic/fleet-server/v7/internal/pkg/checkin" + "github.com/elastic/fleet-server/v7/internal/pkg/config" + "github.com/elastic/fleet-server/v7/internal/pkg/coordinator" + "github.com/elastic/fleet-server/v7/internal/pkg/dl" + "github.com/elastic/fleet-server/v7/internal/pkg/es" + "github.com/elastic/fleet-server/v7/internal/pkg/gc" + "github.com/elastic/fleet-server/v7/internal/pkg/monitor" + "github.com/elastic/fleet-server/v7/internal/pkg/policy" + "github.com/elastic/fleet-server/v7/internal/pkg/profile" + "github.com/elastic/fleet-server/v7/internal/pkg/scheduler" + "github.com/elastic/fleet-server/v7/internal/pkg/ver" + + "github.com/hashicorp/go-version" + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" + "golang.org/x/sync/errgroup" +) + +const kUAFleetServer = "Fleet-Server" + +// Fleet is an instance of the fleet-server. +type Fleet struct { + bi build.Info + verCon version.Constraints + + cfgCh chan *config.Config + cache cache.Cache + reporter state.Reporter +} + +// NewFleet creates the actual fleet server service. 
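+// It builds the Elasticsearch version constraint from bi.Version and allocates +// cfgCh (buffered, size 1), the channel Reload later uses to hand updated +// configurations to a running Run loop.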
+func NewFleet(bi build.Info, reporter state.Reporter) (*Fleet, error) { + verCon, err := api.BuildVersionConstraint(bi.Version) + if err != nil { + return nil, err + } + + return &Fleet{ + bi: bi, + verCon: verCon, + cfgCh: make(chan *config.Config, 1), + reporter: reporter, + }, nil +} + +type runFunc func(context.Context) error + +type runFuncCfg func(context.Context, *config.Config) error + +// Run runs the fleet server +func (f *Fleet) Run(ctx context.Context, initCfg *config.Config) error { + err := initCfg.LoadServerLimits() + if err != nil { + return fmt.Errorf("encountered error while loading server limits: %w", err) + } + cacheCfg := config.CopyCache(initCfg) + log.Info().Interface("cfg", cacheCfg).Msg("Setting cache config options") + cache, err := cache.New(cacheCfg) + if err != nil { + return err + } + f.cache = cache + + var curCfg *config.Config + newCfg := initCfg + + // Replace context with cancellable ctx + // in order to automatically cancel all the goroutines + // that were started in the scope of this function on function exit + ctx, cn := context.WithCancel(ctx) + defer cn() + + stop := func(cn context.CancelFunc, g *errgroup.Group) { + if cn != nil { + cn() + } + if g != nil { + err := g.Wait() + if err != nil { + log.Error().Err(err).Msg("error encountered while stopping server") + } + } + } + + start := func(ctx context.Context, runfn runFuncCfg, cfg *config.Config, ech chan<- error) (*errgroup.Group, context.CancelFunc) { + ctx, cn = context.WithCancel(ctx) + g, ctx := errgroup.WithContext(ctx) + + g.Go(func() error { + err := runfn(ctx, cfg) + if err != nil { + ech <- err + } + return err + }) + return g, cn + } + + var ( + proCancel, srvCancel context.CancelFunc + proEg, srvEg *errgroup.Group + ) + + started := false + +LOOP: + for { + ech := make(chan error, 2) + if started { + f.reporter.UpdateState(client.UnitStateConfiguring, "Re-configuring", nil) //nolint:errcheck // unclear on what should we do if updating the status fails? + } else { + started = true + f.reporter.UpdateState(client.UnitStateStarting, "Starting", nil) //nolint:errcheck // unclear on what should we do if updating the status fails?
+ } + + err := newCfg.LoadServerLimits() + if err != nil { + return fmt.Errorf("encountered error while loading server limits: %w", err) + } + + // Create or recreate cache + if configCacheChanged(curCfg, newCfg) { + log.Info().Msg("reconfigure cache on configuration change") + cacheCfg := config.CopyCache(newCfg) + err := f.cache.Reconfigure(cacheCfg) + log.Info().Err(err).Interface("cfg", cacheCfg).Msg("reconfigure cache complete") + if err != nil { + return err + } + } + + // Start or restart profiler + if configChangedProfiler(curCfg, newCfg) { + if proCancel != nil { + log.Info().Msg("stopping profiler on configuration change") + stop(proCancel, proEg) + } + proEg, proCancel = nil, nil + if newCfg.Inputs[0].Server.Profiler.Enabled { + log.Info().Msg("starting profiler on configuration change") + proEg, proCancel = start(ctx, func(ctx context.Context, cfg *config.Config) error { + return profile.RunProfiler(ctx, cfg.Inputs[0].Server.Profiler.Bind) + }, newCfg, ech) + } + } + + // Start or restart server + if configChangedServer(curCfg, newCfg) { + if srvCancel != nil { + log.Info().Msg("stopping server on configuration change") + stop(srvCancel, srvEg) + } + log.Info().Msg("starting server on configuration change") + srvEg, srvCancel = start(ctx, func(ctx context.Context, cfg *config.Config) error { + return f.runServer(ctx, cfg) + }, newCfg, ech) + } + + curCfg = newCfg + + select { + case newCfg = <-f.cfgCh: + log.Info().Msg("Server configuration update") + case err := <-ech: + f.reporter.UpdateState(client.UnitStateFailed, fmt.Sprintf("Error - %s", err), nil) //nolint:errcheck // unclear on what should we do if updating the status fails? + log.Error().Err(err).Msg("Fleet Server failed") + return err + case <-ctx.Done(): + f.reporter.UpdateState(client.UnitStateStopping, "Stopping", nil) //nolint:errcheck // unclear on what should we do if updating the status fails? + break LOOP + } + } + + // Server is coming down; wait for the server group to exit cleanly. + // Timeout if something is locked up. + err = safeWait(srvEg, time.Second) + + // Eat cancel error to minimize confusion in logs + if errors.Is(err, context.Canceled) { + err = nil + } + + log.Info().Err(err).Msg("Fleet Server exited") + return err +} + +func configChangedProfiler(curCfg, newCfg *config.Config) bool { + changed := true + + switch { + case curCfg == nil: + case curCfg.Inputs[0].Server.Profiler.Enabled != newCfg.Inputs[0].Server.Profiler.Enabled: + case curCfg.Inputs[0].Server.Profiler.Bind != newCfg.Inputs[0].Server.Profiler.Bind: + default: + changed = false + } + + return changed +} + +func configCacheChanged(curCfg, newCfg *config.Config) bool { + if curCfg == nil { + return false + } + return curCfg.Inputs[0].Cache != newCfg.Inputs[0].Cache +} + +func configChangedServer(curCfg, newCfg *config.Config) bool { + zlog := log.With().Interface("new", newCfg.Redact()).Logger() + + changed := true + switch { + case curCfg == nil: + zlog.Info().Msg("initial server configuration") + case !reflect.DeepEqual(curCfg.Fleet, newCfg.Fleet): + zlog.Info(). + Interface("old", curCfg.Redact()). + Msg("fleet configuration has changed") + case !reflect.DeepEqual(curCfg.Output, newCfg.Output): + zlog.Info(). + Interface("old", curCfg.Redact()). + Msg("output configuration has changed") + case !reflect.DeepEqual(curCfg.Inputs[0].Server, newCfg.Inputs[0].Server): + zlog.Info(). + Interface("old", curCfg.Redact()). 
+ Msg("server configuration has changed") + default: + changed = false + } + + return changed +} + +func safeWait(g *errgroup.Group, to time.Duration) error { + var err error + waitCh := make(chan error) + go func() { + waitCh <- g.Wait() + }() + + select { + case err = <-waitCh: + case <-time.After(to): + log.Warn().Msg("deadlock: goroutine locked up on errgroup.Wait()") + err = errors.New("group wait timeout") + } + + return err +} + +func loggedRunFunc(ctx context.Context, tag string, runfn runFunc) func() error { + return func() error { + + log.Debug().Msg(tag + " started") + + err := runfn(ctx) + + lvl := zerolog.DebugLevel + switch { + case err == nil: + case errors.Is(err, context.Canceled): + err = nil + default: + lvl = zerolog.ErrorLevel + } + + log.WithLevel(lvl).Err(err).Msg(tag + " exited") + return err + } +} + +func initRuntime(cfg *config.Config) { + gcPercent := cfg.Inputs[0].Server.Runtime.GCPercent + if gcPercent != 0 { + old := debug.SetGCPercent(gcPercent) + + log.Info(). + Int("old", old). + Int("new", gcPercent). + Msg("SetGCPercent") + } +} + +func (f *Fleet) initBulker(ctx context.Context, tracer *apm.Tracer, cfg *config.Config) (*bulk.Bulker, error) { + es, err := es.NewClient(ctx, cfg, false, elasticsearchOptions( + cfg.Inputs[0].Server.Instrumentation.Enabled, f.bi, + )...) + if err != nil { + return nil, err + } + + blk := bulk.NewBulker(es, tracer, bulk.BulkOptsFromCfg(cfg)...) + return blk, nil +} + +func (f *Fleet) runServer(ctx context.Context, cfg *config.Config) (err error) { + initRuntime(cfg) + + // The metricsServer is only enabled if http.enabled is set in the config + metricsServer, err := api.InitMetrics(ctx, cfg, f.bi) + switch { + case err != nil: + return err + case metricsServer != nil: + defer func() { + _ = metricsServer.Stop() + }() + } + + // Bulker is started in its own context and managed in the scope of this function. This is done so + // when the `ctx` is cancelled, the bulker will remain executing until this function exits. + // This allows the child subsystems to continue to write to the data store while tearing down. + bulkCtx, bulkCancel := context.WithCancel(context.Background()) + defer bulkCancel() + + // Create the APM tracer. + tracer, err := f.initTracer(cfg.Inputs[0].Server.Instrumentation) + if err != nil { + return err + } + + // Create the bulker subsystem + bulker, err := f.initBulker(bulkCtx, tracer, cfg) + if err != nil { + return err + } + + // Execute the bulker engine in a goroutine with its orphaned context. + // Create an error channel for the case where the bulker exits + // unexpectedly (ie. not cancelled by the bulkCancel context). + errCh := make(chan error) + + go func() { + runFunc := loggedRunFunc(bulkCtx, "Bulker", bulker.Run) + + // Emit the error from bulker.Run to the local error channel. + // The error group will be listening for it. (see comments below) + errCh <- runFunc() + }() + + // Wrap context with an error group context to manage the lifecycle + // of the subsystems. An error from any subsystem, or if the + // parent context is cancelled, will cancel the group. + // see https://pkg.go.dev/golang.org/x/sync/errgroup#Group.Go + g, ctx := errgroup.WithContext(ctx) + + // Stub a function for inclusion in the errgroup that exits when + // the bulker exits. If the bulker exits before the error group, + // this will tear down the error group and g.Wait() will return. + // Otherwise it will be a noop. 
+ g.Go(func() (err error) { + select { + case err = <-errCh: + case <-ctx.Done(): + err = ctx.Err() + } + return + }) + + if tracer != nil { + go func() { + <-ctx.Done() + log.Info().Msg("flushing instrumentation tracer...") + tracer.Flush(nil) + tracer.Close() + }() + } + + if err = f.runSubsystems(ctx, cfg, g, bulker, tracer); err != nil { + return err + } + + return g.Wait() +} + +func (f *Fleet) runSubsystems(ctx context.Context, cfg *config.Config, g *errgroup.Group, bulker bulk.Bulk, tracer *apm.Tracer) (err error) { + esCli := bulker.Client() + + // Check version compatibility with Elasticsearch + remoteVersion, err := ver.CheckCompatibility(ctx, esCli, f.bi.Version) + if err != nil { + if len(remoteVersion) != 0 { + return fmt.Errorf("failed version compatibility check with elasticsearch (Agent: %s, Elasticsearch: %s): %w", + f.bi.Version, remoteVersion, err) + } + return fmt.Errorf("failed version compatibility check with elasticsearch: %w", err) + } + + // Run migrations + loggedMigration := loggedRunFunc(ctx, "Migrations", func(ctx context.Context) error { + return dl.Migrate(ctx, bulker) + }) + if err = loggedMigration(); err != nil { + return fmt.Errorf("failed to run subsystems: %w", err) + } + + // Run scheduler for periodic GC/cleanup + gcCfg := cfg.Inputs[0].Server.GC + sched, err := scheduler.New(gc.Schedules(bulker, gcCfg.ScheduleInterval, gcCfg.CleanupAfterExpiredInterval)) + if err != nil { + return fmt.Errorf("failed to create elasticsearch GC: %w", err) + } + g.Go(loggedRunFunc(ctx, "Elasticsearch GC", sched.Run)) + + // Monitoring es client, longer timeout, no retries + monCli, err := es.NewClient(ctx, cfg, true, elasticsearchOptions( + cfg.Inputs[0].Server.Instrumentation.Enabled, f.bi, + )...) + if err != nil { + return err + } + + // Coordinator policy monitor + pim, err := monitor.New(dl.FleetPolicies, esCli, monCli, + monitor.WithFetchSize(cfg.Inputs[0].Monitor.FetchSize), + monitor.WithPollTimeout(cfg.Inputs[0].Monitor.PollTimeout), + ) + if err != nil { + return err + } + + g.Go(loggedRunFunc(ctx, "Policy index monitor", pim.Run)) + cord := coordinator.NewMonitor(cfg.Fleet, f.bi.Version, bulker, pim, coordinator.NewCoordinatorZero) + g.Go(loggedRunFunc(ctx, "Coordinator policy monitor", cord.Run)) + + // Policy monitor + pm := policy.NewMonitor(bulker, pim, cfg.Inputs[0].Server.Limits.PolicyThrottle) + g.Go(loggedRunFunc(ctx, "Policy monitor", pm.Run)) + + // Policy self monitor + sm := policy.NewSelfMonitor(cfg.Fleet, bulker, pim, cfg.Inputs[0].Policy.ID, f.reporter) + g.Go(loggedRunFunc(ctx, "Policy self monitor", sm.Run)) + + // Actions monitoring + var am monitor.SimpleMonitor + var ad *action.Dispatcher + var tr *action.TokenResolver + + am, err = monitor.NewSimple(dl.FleetActions, esCli, monCli, + monitor.WithExpiration(true), + monitor.WithFetchSize(cfg.Inputs[0].Monitor.FetchSize), + monitor.WithPollTimeout(cfg.Inputs[0].Monitor.PollTimeout), + ) + if err != nil { + return err + } + g.Go(loggedRunFunc(ctx, "Revision monitor", am.Run)) + + ad = action.NewDispatcher(am) + g.Go(loggedRunFunc(ctx, "Revision dispatcher", ad.Run)) + tr, err = action.NewTokenResolver(bulker) + if err != nil { + return err + } + + bc := checkin.NewBulk(bulker) + g.Go(loggedRunFunc(ctx, "Bulk checkin", bc.Run)) + + ct := api.NewCheckinT(f.verCon, &cfg.Inputs[0].Server, f.cache, bc, pm, am, ad, tr, bulker) + et, err := api.NewEnrollerT(f.verCon, &cfg.Inputs[0].Server, bulker, f.cache) + if err != nil { + return err + } + + at := api.NewArtifactT(&cfg.Inputs[0].Server, bulker, 
f.cache) + ack := api.NewAckT(&cfg.Inputs[0].Server, bulker, f.cache) + st := api.NewStatusT(&cfg.Inputs[0].Server, bulker, f.cache) + + router := api.NewRouter(&cfg.Inputs[0].Server, bulker, ct, et, at, ack, st, sm, tracer, f.bi) + + g.Go(loggedRunFunc(ctx, "Http server", func(ctx context.Context) error { + return router.Run(ctx) + })) + + return err +} + +// Reload reloads the fleet server with the latest configuration. +func (f *Fleet) Reload(ctx context.Context, cfg *config.Config) error { + select { + case f.cfgCh <- cfg: + case <-ctx.Done(): + } + return nil +} + +func (f *Fleet) initTracer(cfg config.Instrumentation) (*apm.Tracer, error) { + if !cfg.Enabled { + return nil, nil + } + + log.Info().Msg("fleet-server instrumentation is enabled") + + // TODO(marclop): Ideally, we'd use apmtransport.NewHTTPTransportOptions() + // but it doesn't exist today. Update this code once we have something + // available via the APM Go agent. + const ( + envVerifyServerCert = "ELASTIC_APM_VERIFY_SERVER_CERT" + envServerCert = "ELASTIC_APM_SERVER_CERT" + envCACert = "ELASTIC_APM_SERVER_CA_CERT_FILE" + envGlobalLabels = "ELASTIC_APM_GLOBAL_LABELS" + envTransactionSampleRate = "ELASTIC_APM_TRANSACTION_SAMPLE_RATE" + ) + if cfg.TLS.SkipVerify { + os.Setenv(envVerifyServerCert, "false") + defer os.Unsetenv(envVerifyServerCert) + } + if cfg.TLS.ServerCertificate != "" { + os.Setenv(envServerCert, cfg.TLS.ServerCertificate) + defer os.Unsetenv(envServerCert) + } + if cfg.TLS.ServerCA != "" { + os.Setenv(envCACert, cfg.TLS.ServerCA) + defer os.Unsetenv(envCACert) + } + if cfg.GlobalLabels != "" { + os.Setenv(envGlobalLabels, cfg.GlobalLabels) + defer os.Unsetenv(envGlobalLabels) + } + if cfg.TransactionSampleRate != "" { + os.Setenv(envTransactionSampleRate, cfg.TransactionSampleRate) + defer os.Unsetenv(envTransactionSampleRate) + } + transport, err := apmtransport.NewHTTPTransport() + if err != nil { + return nil, err + } + + if len(cfg.Hosts) > 0 { + hosts := make([]*url.URL, 0, len(cfg.Hosts)) + for _, host := range cfg.Hosts { + u, err := url.Parse(host) + if err != nil { + return nil, fmt.Errorf("failed parsing %s: %w", host, err) + } + hosts = append(hosts, u) + } + transport.SetServerURL(hosts...) 
+ } + if cfg.APIKey != "" { + transport.SetAPIKey(cfg.APIKey) + } else { + transport.SetSecretToken(cfg.SecretToken) + } + return apm.NewTracerOptions(apm.TracerOptions{ + ServiceName: "fleet-server", + ServiceVersion: f.bi.Version, + ServiceEnvironment: cfg.Environment, + Transport: transport, + }) +} + +func elasticsearchOptions(instrumented bool, bi build.Info) []es.ConfigOption { + options := []es.ConfigOption{es.WithUserAgent(kUAFleetServer, bi)} + if instrumented { + options = append(options, es.InstrumentRoundTripper()) + } + return options +} diff --git a/cmd/fleet/server_integration_test.go b/internal/pkg/server/fleet_integration_test.go similarity index 98% rename from cmd/fleet/server_integration_test.go rename to internal/pkg/server/fleet_integration_test.go index c34ae2c12..4191db0c7 100644 --- a/cmd/fleet/server_integration_test.go +++ b/internal/pkg/server/fleet_integration_test.go @@ -5,7 +5,7 @@ //go:build integration // +build integration -package fleet +package server import ( "bytes" @@ -47,7 +47,7 @@ const ( type tserver struct { cfg *config.Config g *errgroup.Group - srv *FleetServer + srv *Fleet } func (s *tserver) baseURL() string { @@ -67,7 +67,7 @@ func (s *tserver) waitExit() error { func startTestServer(t *testing.T, ctx context.Context) (*tserver, error) { t.Helper() - cfg, err := config.LoadFile("../../fleet-server.yml") + cfg, err := config.LoadFile("../../../fleet-server.yml") if err != nil { return nil, fmt.Errorf("config load error: %w", err) } @@ -110,7 +110,7 @@ func startTestServer(t *testing.T, ctx context.Context) (*tserver, error) { cfg.Inputs[0].Server = *srvcfg log.Info().Uint16("port", port).Msg("Test fleet server") - srv, err := NewFleetServer(build.Info{Version: serverVersion}, state.NewLog()) + srv, err := NewFleet(build.Info{Version: serverVersion}, state.NewLog()) if err != nil { return nil, fmt.Errorf("unable to create server: %w", err) } diff --git a/internal/pkg/testing/bulk.go b/internal/pkg/testing/bulk.go index 1123232b7..724d54086 100644 --- a/internal/pkg/testing/bulk.go +++ b/internal/pkg/testing/bulk.go @@ -83,7 +83,7 @@ func (m *MockBulk) APIKeyCreate(ctx context.Context, name, ttl string, roles []b return args.Get(0).(*bulk.APIKey), args.Error(1) } -func (m *MockBulk) APIKeyRead(ctx context.Context, id string) (*bulk.APIKeyMetadata, error) { +func (m *MockBulk) APIKeyRead(ctx context.Context, id string, _ bool) (*bulk.APIKeyMetadata, error) { args := m.Called(ctx, id) return args.Get(0).(*bulk.APIKeyMetadata), args.Error(1) } @@ -98,4 +98,9 @@ func (m *MockBulk) APIKeyInvalidate(ctx context.Context, ids ...string) error { return args.Error(0) } +func (m *MockBulk) APIKeyUpdate(ctx context.Context, id, outputPolicyHash string, roles []byte) error { + args := m.Called(ctx, id) + return args.Error(0) +} + var _ bulk.Bulk = (*MockBulk)(nil) diff --git a/internal/pkg/testing/esutil/bootstrap.go b/internal/pkg/testing/esutil/bootstrap.go index e2aafce76..978f95a75 100644 --- a/internal/pkg/testing/esutil/bootstrap.go +++ b/internal/pkg/testing/esutil/bootstrap.go @@ -10,7 +10,7 @@ import ( "github.com/elastic/go-elasticsearch/v7" ) -// EnsureIndex sets up the index if it doesn't exists, utilized for integration tests at the moment +// EnsureIndex sets up the index if it doesn't exist. It's utilized for integration tests at the moment.
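+// It ensures the backing index template first, via the EnsureTemplate call below.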
diff --git a/internal/pkg/testing/esutil/bootstrap.go b/internal/pkg/testing/esutil/bootstrap.go
index e2aafce76..978f95a75 100644
--- a/internal/pkg/testing/esutil/bootstrap.go
+++ b/internal/pkg/testing/esutil/bootstrap.go
@@ -10,7 +10,7 @@ import (
 	"github.com/elastic/go-elasticsearch/v7"
 )
 
-// EnsureIndex sets up the index if it doesn't exists, utilized for integration tests at the moment
+// EnsureIndex sets up the index if it doesn't exist. It's utilized for integration tests at the moment.
 func EnsureIndex(ctx context.Context, cli *elasticsearch.Client, name, mapping string) error {
 	err := EnsureTemplate(ctx, cli, name, mapping, false)
 	if err != nil {
diff --git a/internal/pkg/testing/setup.go b/internal/pkg/testing/setup.go
index 8dac38cdc..8f38ba7e6 100644
--- a/internal/pkg/testing/setup.go
+++ b/internal/pkg/testing/setup.go
@@ -98,7 +98,7 @@ func SetupCleanIndex(ctx context.Context, t *testing.T, index string, opts ...bu
 func CleanIndex(ctx context.Context, t *testing.T, bulker bulk.Bulk, index string) string {
 	t.Helper()
-	t.Helper()
+
 	tmpl := dsl.NewTmpl()
 	root := dsl.NewRoot()
 	root.Query().MatchAll()
 
@@ -106,25 +106,25 @@ func CleanIndex(ctx context.Context, t *testing.T, bulker bulk.Bulk, index strin
 	query, err := q.Render(make(map[string]interface{}))
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("could not clean index: failed to render query template: %v", err)
 	}
 
 	cli := bulker.Client()
+
 	res, err := cli.API.DeleteByQuery([]string{index}, bytes.NewReader(query),
 		cli.API.DeleteByQuery.WithContext(ctx),
 		cli.API.DeleteByQuery.WithRefresh(true),
 	)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("could not clean index %s, DeleteByQuery failed: %v",
+			index, err)
 	}
 	defer res.Body.Close()
 
 	var esres es.DeleteByQueryResponse
-
 	err = json.NewDecoder(res.Body).Decode(&esres)
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("could not decode ES response: %v", err)
 	}
 
 	if res.IsError() {
@@ -135,9 +135,9 @@ func CleanIndex(ctx context.Context, t *testing.T, bulker bulk.Bulk, index strin
 		}
 	}
 	if err != nil {
-		t.Fatal(err)
+		t.Fatalf("ES returned an error: %v. body: %q", err, res)
 	}
+
 	return index
 }
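For reference, `CleanIndex` above boils down to a delete-by-query with `refresh=true` against the test index; the rendered `dsl` template is equivalent to a `match_all` query. A hedged, standalone sketch using the go-elasticsearch v7 client directly, where the literal JSON body stands in for the template output:

```go
package cleanup

import (
	"bytes"
	"context"
	"fmt"

	"github.com/elastic/go-elasticsearch/v7"
)

// deleteAll issues a match_all delete-by-query against the given index,
// refreshing the index afterwards so subsequent searches see the deletes.
func deleteAll(ctx context.Context, cli *elasticsearch.Client, index string) error {
	body := []byte(`{"query":{"match_all":{}}}`) // stands in for the rendered dsl template
	res, err := cli.DeleteByQuery([]string{index}, bytes.NewReader(body),
		cli.DeleteByQuery.WithContext(ctx),
		cli.DeleteByQuery.WithRefresh(true),
	)
	if err != nil {
		return fmt.Errorf("delete by query on %s failed: %w", index, err)
	}
	defer res.Body.Close()
	if res.IsError() {
		return fmt.Errorf("elasticsearch error: %s", res.String())
	}
	return nil
}
```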
diff --git a/model/schema.json b/model/schema.json
index fe390db9e..d1a3db241 100644
--- a/model/schema.json
+++ b/model/schema.json
@@ -244,6 +244,7 @@
       "name"
     ]
   },
+
   "server-metadata": {
     "title": "Server Metadata",
     "description": "A Fleet Server metadata",
@@ -264,6 +265,7 @@
       "version"
     ]
   },
+
   "server": {
     "title": "Server",
     "description": "A Fleet Server",
@@ -284,6 +286,7 @@
       "server"
     ]
   },
+
   "policy": {
     "title": "Policy",
     "description": "A policy that an Elastic Agent is attached to",
@@ -329,6 +332,7 @@
       "default_fleet_server"
     ]
   },
+
   "policy-leader": {
     "title": "Policy Leader",
     "description": "The current leader Fleet Server for a policy",
@@ -345,6 +349,60 @@
       "server"
     ]
   },
+
+  "to_retire_api_key_ids": {
+    "type": "array",
+    "items": {
+      "description": "the Output API Keys that were replaced and should be retired",
+      "type": "object",
+      "properties": {
+        "id": {
+          "description": "API Key identifier",
+          "type": "string"
+        },
+        "retired_at": {
+          "description": "Date/time the API key was retired",
+          "type": "string",
+          "format": "date-time"
+        }
+      }
+    }
+  },
+
+  "policy_output": {
+    "type": "object",
+    "description": "holds the needed data to manage the output API keys",
+    "properties": {
+      "api_key": {
+        "description": "API key the Elastic Agent uses to authenticate with elasticsearch",
+        "type": "string"
+      },
+      "to_retire_api_key_ids": {
+        "description": "API keys to be invalidated on next agent ack",
+        "$ref": "#/definitions/to_retire_api_key_ids"
+      },
+      "api_key_id": {
+        "description": "ID of the API key the Elastic Agent uses to authenticate with elasticsearch",
+        "type": "string"
+      },
+      "permissions_hash": {
+        "description": "The policy output permissions hash",
+        "type": "string"
+      },
+      "type": {
+        "description": "Type is the output type. Currently only Elasticsearch is supported.",
+        "type": "string"
+      }
+    },
+    "required": [
+      "api_key",
+      "to_retire_api_key_ids",
+      "api_key_id",
+      "permissions_hash",
+      "type"
+    ]
+  },
+
   "agent": {
     "title": "Agent",
     "description": "An Elastic Agent that has enrolled into Fleet",
@@ -401,6 +459,10 @@
       "type": "string",
       "format": "date-time"
     },
+    "upgrade_status": {
+      "description": "Upgrade status",
+      "type": "string"
+    },
     "access_api_key_id": {
       "description": "ID of the API key the Elastic Agent must used to contact Fleet Server",
       "type": "string"
@@ -437,7 +499,7 @@
       "type": "integer"
     },
     "policy_output_permissions_hash": {
-      "description": "The policy output permissions hash",
+      "description": "Deprecated. Use Outputs instead. The policy output permissions hash",
      "type": "string"
     },
     "last_updated": {
@@ -451,7 +513,11 @@
       "format": "date-time"
     },
     "last_checkin_status": {
-      "description": "Lst checkin status",
+      "description": "Last checkin status",
+      "type": "string"
+    },
+    "last_checkin_message": {
+      "description": "Last checkin message",
       "type": "string"
     },
     "last_checkin_message": {
@@ -464,30 +530,21 @@
       "format": "raw"
     },
     "default_api_key_id": {
-      "description": "ID of the API key the Elastic Agent uses to authenticate with elasticsearch",
+      "description": "Deprecated. Use Outputs instead. ID of the API key the Elastic Agent uses to authenticate with elasticsearch",
       "type": "string"
     },
     "default_api_key": {
-      "description": "API key the Elastic Agent uses to authenticate with elasticsearch",
+      "description": "Deprecated. Use Outputs instead. API key the Elastic Agent uses to authenticate with elasticsearch",
       "type": "string"
     },
     "default_api_key_history": {
-      "description": "Default API Key History",
-      "type": "array",
-      "items": {
-        "type": "object",
-        "properties": {
-          "id": {
-            "description": "API Key identifier",
-            "type": "string"
-          },
-          "retired_at": {
-            "description": "Date/time the API key was retired",
-            "type": "string",
-            "format": "date-time"
-          }
-        }
-      }
+      "description": "Deprecated. Use Outputs instead. Default API Key History",
+      "$ref": "#/definitions/to_retire_api_key_ids"
+    },
+    "outputs": {
+      "description": "Outputs is the policy output data, mapping the output name to its data",
+      "type": "object",
+      "additionalProperties": { "$ref": "#/definitions/policy_output" }
     },
     "updated_at": {
       "description": "Date/time the Elastic Agent was last updated",
@@ -517,6 +574,7 @@
       "status"
     ]
   },
+
   "enrollment_api_key": {
     "title": "Enrollment API key",
     "description": "An Elastic Agent enrollment API key",
@@ -560,6 +618,7 @@
       ]
     }
   },
+
   "checkin": {
     "title": "Checkin",
     "description": "An Elastic Agent checkin to Fleet",