diff --git a/docker/grafana/Dockerfile b/docker/grafana/Dockerfile index 8ec094c156..18960eb800 100644 --- a/docker/grafana/Dockerfile +++ b/docker/grafana/Dockerfile @@ -10,6 +10,8 @@ COPY ./integrations/grafana/m3db_dashboard.json /tmp/grafana_dashboards/m3db_das COPY ./integrations/grafana/temporal_function_comparison.json /tmp/grafana_dashboards/temporal_function_comparison.json COPY ./integrations/grafana/m3aggregator_dashboard.json /tmp/grafana_dashboards/m3aggregator_dashboard.json COPY ./integrations/grafana/m3aggregator_end_to_end_details.json /tmp/grafana_dashboards/m3aggregator_end_to_end_details.json +COPY ./scripts/development/m3_prom_remote_stack/prom_remote_demo_dashboard.json /tmp/grafana_dashboards/prom_remote_demo_dashboard.json + # Need to replace datasource template variable with name of actual data source so auto-import # JustWorksTM. Use a temporary directory to host the dashboards since the default diff --git a/docker/grafana/datasource.yaml b/docker/grafana/datasource.yaml index 5b0e8e69de..5166551f54 100644 --- a/docker/grafana/datasource.yaml +++ b/docker/grafana/datasource.yaml @@ -1,4 +1,12 @@ datasources: + - name: Prometheus Aggregated + type: prometheus + access: proxy + url: http://prometheusagg:9090 + - name: Prometheus Unaggregated + type: prometheus + access: proxy + url: http://prometheusraw:9090 - name: Prometheus type: prometheus access: proxy diff --git a/scripts/development/m3_prom_remote_stack/README.md b/scripts/development/m3_prom_remote_stack/README.md new file mode 100644 index 0000000000..49a6fdf8ce --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/README.md @@ -0,0 +1,34 @@ +# Local Development + +This docker-compose file will setup the following environment: + +1. 1 ETCD +2. 1 M3Coordinator node +3. 1 M3 Aggregator node +4. 2 Prometheus instances with `remote-write-receiver` feature enabled in order to accept remote writes. +5. 1 Grafana node (with a pre-configured Prometheus source) +6. 
1 Prometheus node that scrapes the M3 Components and Prometheus instances. + In addition, it scrapes CAdvisor /metrics endpoints exposed by using `kubectl proxy`. + +The environment variables that let you configure this setup are: +- `BUILD_M3AGGREGATOR=true`: forces build of local M3 Aggregator. +- `BUILD_M3COORDINATOR=true`: forces build of local M3 Coordinator. +- `WIPE_DATA=true`: cleans up docker state when using `stop_m3.sh`. + +## Usage + +Use the `start_m3.sh` and `stop_m3.sh` scripts. + +If you want to scrape CAdvisor metrics from a Kubernetes cluster: +- Run `port_forward_kube.sh` +- Run `start_m3.sh` + +## Grafana + +Use Grafana by navigating to `http://localhost:3000` and using `admin` for both the username and password. The M3DB dashboard should already be populated and working. + +To pick up the latest Grafana Docker build, remove the existing image: `docker image rm m3grafana --force`. + +## Containers Hanging / Unresponsive + +Running the entire stack can be resource intensive. If the containers are unresponsive, try increasing the number of cores and the amount of memory that the docker daemon is allowed to use. 
diff --git a/scripts/development/m3_prom_remote_stack/docker-compose.yml b/scripts/development/m3_prom_remote_stack/docker-compose.yml new file mode 100644 index 0000000000..40c428f373 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/docker-compose.yml @@ -0,0 +1,110 @@ +version: "3.5" +services: + m3aggregator01: + expose: + - "6001" + ports: + - "127.0.0.1:6001:6001" + networks: + - backend + environment: + - M3AGGREGATOR_HOST_ID=m3aggregator01 + build: + context: ../../../bin + dockerfile: ./docker/m3aggregator/development.Dockerfile + image: m3aggregator:dev + volumes: + - "./m3aggregator.yml:/etc/m3aggregator/m3aggregator.yml" + m3coordinator01: + expose: + - "7201" + ports: + - "0.0.0.0:7201:7201" + networks: + - backend + build: + context: ../../../bin + dockerfile: ./docker/m3coordinator/development.Dockerfile + image: m3coordinator:dev + volumes: + - "./m3coordinator.yml.tmp:/etc/m3coordinator/m3coordinator.yml" + prometheusscraper: + networks: + - backend + image: prom/prometheus:latest + volumes: + - "./prometheus-scraper.yml.tmp:/etc/prometheus/prometheus.yml" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" + - "--log.level=debug" + prometheusraw: + networks: + - backend + image: prom/prometheus:latest + volumes: + - "./prometheus.yml:/etc/prometheus/prometheus.yml" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" + - "--enable-feature=remote-write-receiver" + prometheusagg: + networks: + - backend + image: prom/prometheus:latest + volumes: + - "./prometheus.yml:/etc/prometheus/prometheus.yml" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - 
"--storage.tsdb.path=/prometheus" + - "--web.console.libraries=/usr/share/prometheus/console_libraries" + - "--web.console.templates=/usr/share/prometheus/consoles" + - "--enable-feature=remote-write-receiver" + etcd01: + expose: + - "2379-2380" + ports: + - "0.0.0.0:2379-2380:2379-2380" + networks: + - backend + image: quay.io/coreos/etcd:v3.4.3 + command: + - "etcd" + - "--name" + - "etcd01" + - "--listen-peer-urls" + - "http://0.0.0.0:2380" + - "--listen-client-urls" + - "http://0.0.0.0:2379" + - "--advertise-client-urls" + - "http://etcd01:2379" + - "--initial-cluster-token" + - "etcd-cluster-1" + - "--initial-advertise-peer-urls" + - "http://etcd01:2380" + - "--initial-cluster" + - "etcd01=http://etcd01:2380" + - "--initial-cluster-state" + - "new" + - "--data-dir" + - "/var/lib/etcd" + grafana: + build: + context: ../../../ + dockerfile: ./docker/grafana/Dockerfile + expose: + - "3000" + ports: + - "0.0.0.0:3000:3000" + networks: + - backend + image: m3grafana:latest +networks: + backend: +volumes: + prom-raw-data: + prom-agg-data: diff --git a/scripts/development/m3_prom_remote_stack/emit_scrape_configs.sh b/scripts/development/m3_prom_remote_stack/emit_scrape_configs.sh new file mode 100755 index 0000000000..17fd5febf1 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/emit_scrape_configs.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -xe + +nodes=() +while IFS='' read -r line; do nodes+=("$line"); done < <(curl http://localhost:8001/api/v1/nodes | jq '.items[].metadata.name' | tr -d \") + + +cp prometheus-scraper.yml prometheus-scraper.yml.tmp + +limit=10 + +i=0 +for node in "${nodes[@]}" ; do + i=$((i+1)) + if [ "$i" -gt "$limit" ]; then + break; + fi + echo " + - job_name: cadvisor_${node} + metrics_path: /api/v1/nodes/${node}/proxy/metrics/cadvisor + static_configs: + - targets: + - host.docker.internal:8001 + labels: + instance: ${node} + " >> prometheus-scraper.yml.tmp +done diff --git 
a/scripts/development/m3_prom_remote_stack/m3aggregator.yml b/scripts/development/m3_prom_remote_stack/m3aggregator.yml new file mode 100644 index 0000000000..c8f3de7530 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/m3aggregator.yml @@ -0,0 +1,220 @@ +logging: + level: info + +metrics: + scope: + prefix: m3aggregator + prometheus: + onError: none + handlerPath: /metrics + listenAddress: 0.0.0.0:6002 + timerType: histogram + sanitization: prometheus + samplingRate: 1.0 + extended: none + +m3msg: + server: + listenAddress: 0.0.0.0:6000 + retry: + maxBackoff: 10s + jitter: true + consumer: + messagePool: + size: 16384 + watermark: + low: 0.2 + high: 0.5 + +http: + listenAddress: 0.0.0.0:6001 + readTimeout: 60s + writeTimeout: 60s + +kvClient: + etcd: + env: default_env + zone: embedded + service: m3aggregator + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - etcd01:2379 + +runtimeOptions: + kvConfig: + environment: default_env + zone: embedded + writeValuesPerMetricLimitPerSecondKey: write-values-per-metric-limit-per-second + writeValuesPerMetricLimitPerSecond: 0 + writeNewMetricLimitClusterPerSecondKey: write-new-metric-limit-cluster-per-second + writeNewMetricLimitClusterPerSecond: 0 + writeNewMetricNoLimitWarmupDuration: 0 + +aggregator: + hostID: + resolver: environment + envVarName: M3AGGREGATOR_HOST_ID + instanceID: + type: host_id + verboseErrors: true + metricPrefix: "" + counterPrefix: "" + timerPrefix: "" + gaugePrefix: "" + aggregationTypes: + counterTransformFnType: empty + timerTransformFnType: suffix + gaugeTransformFnType: empty + aggregationTypesPool: + size: 1024 + quantilesPool: + buckets: + - count: 256 + capacity: 4 + - count: 128 + capacity: 8 + stream: + eps: 0.001 + capacity: 32 + streamPool: + size: 4096 + samplePool: + size: 4096 + floatsPool: + buckets: + - count: 4096 + capacity: 16 + - count: 2048 + capacity: 32 + - count: 1024 + capacity: 64 + client: + type: m3msg + m3msg: + producer: + writer: + 
messageRetry: + initialBackoff: 10s + topicName: aggregator_ingest + topicServiceOverride: + zone: embedded + environment: default_env + placement: + isStaged: true + placementServiceOverride: + namespaces: + placement: /placement + messagePool: + size: 16384 + watermark: + low: 0.2 + high: 0.5 + placementManager: + kvConfig: + namespace: /placement + environment: default_env + zone: embedded + placementWatcher: + key: m3aggregator + initWatchTimeout: 10s + hashType: murmur32 + bufferDurationBeforeShardCutover: 10m + bufferDurationAfterShardCutoff: 10m + bufferDurationForFutureTimedMetric: 10m # Allow test to write into future. + bufferDurationForPastTimedMetric: 10s # Don't wait too long for timed metrics to flush. + resignTimeout: 1m + flushTimesManager: + kvConfig: + environment: default_env + zone: embedded + flushTimesKeyFmt: shardset/%d/flush + flushTimesPersistRetrier: + initialBackoff: 100ms + backoffFactor: 2.0 + maxBackoff: 2s + maxRetries: 3 + electionManager: + election: + leaderTimeout: 10s + resignTimeout: 10s + ttlSeconds: 10 + serviceID: + name: m3aggregator + environment: default_env + zone: embedded + electionKeyFmt: shardset/%d/lock + campaignRetrier: + initialBackoff: 100ms + backoffFactor: 2.0 + maxBackoff: 2s + forever: true + jitter: true + changeRetrier: + initialBackoff: 100ms + backoffFactor: 2.0 + maxBackoff: 5s + forever: true + jitter: true + resignRetrier: + initialBackoff: 100ms + backoffFactor: 2.0 + maxBackoff: 5s + forever: true + jitter: true + campaignStateCheckInterval: 1s + shardCutoffCheckOffset: 30s + flushManager: + checkEvery: 1s + jitterEnabled: true + maxJitters: + - flushInterval: 5s + maxJitterPercent: 1.0 + - flushInterval: 10s + maxJitterPercent: 0.5 + - flushInterval: 1m + maxJitterPercent: 0.5 + - flushInterval: 10m + maxJitterPercent: 0.5 + - flushInterval: 1h + maxJitterPercent: 0.25 + numWorkersPerCPU: 0.5 + flushTimesPersistEvery: 10s + maxBufferSize: 10m + forcedFlushWindowSize: 10s + flush: + handlers: + - 
dynamicBackend: + name: m3msg + hashType: murmur32 + producer: + writer: + messageRetry: + initialBackoff: 10s + topicName: aggregated_metrics + topicServiceOverride: + zone: embedded + environment: default_env + messagePool: + size: 16384 + watermark: + low: 0.2 + high: 0.5 + passthrough: + enabled: true + forwarding: + maxConstDelay: 1m # Need to add some buffer window, since timed metrics by default are delayed by 1min. + entryTTL: 1h + entryCheckInterval: 10m + maxTimerBatchSizePerWrite: 140 + maxNumCachedSourceSets: 2 + discardNaNAggregatedValues: true + entryPool: + size: 4096 + counterElemPool: + size: 4096 + timerElemPool: + size: 4096 + gaugeElemPool: + size: 4096 diff --git a/scripts/development/m3_prom_remote_stack/m3coordinator-admin.yml b/scripts/development/m3_prom_remote_stack/m3coordinator-admin.yml new file mode 100644 index 0000000000..c3c08c0104 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/m3coordinator-admin.yml @@ -0,0 +1,29 @@ +listenAddress: 0.0.0.0:7201 + +logging: + level: info + +metrics: + scope: + prefix: "coordinator" + prometheus: + handlerPath: /metrics + listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is resolved + sanitization: prometheus + samplingRate: 1.0 + extended: none + +backend: noop-etcd +clusterManagement: + etcd: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - etcd01:2379 + +tagOptions: + idScheme: quoted diff --git a/scripts/development/m3_prom_remote_stack/m3coordinator.yml b/scripts/development/m3_prom_remote_stack/m3coordinator.yml new file mode 100644 index 0000000000..b48fb15eae --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/m3coordinator.yml @@ -0,0 +1,113 @@ +listenAddress: 0.0.0.0:7201 + +logging: + level: info + +metrics: + scope: + prefix: "coordinator" + prometheus: + handlerPath: /metrics + listenAddress: 0.0.0.0:7203 # until https://github.com/m3db/m3/issues/682 is 
resolved + sanitization: prometheus + samplingRate: 1.0 + extended: none + +backend: prom-remote + +prometheusRemoteBackend: + endpoints: + - name: raw + address: "http://prometheusraw:9090/api/v1/write" + - name: aggregated + address: "http://prometheusagg:9090/api/v1/write" + storagePolicy: + retention: 1h + resolution: 1m + downsample: + all: false + +clusterManagement: + etcd: + env: default_env + zone: embedded + service: m3db + cacheDir: /var/lib/m3kv + etcdClusters: + - zone: embedded + endpoints: + - etcd01:2379 + +tagOptions: + idScheme: quoted + +downsample: + rules: + mappingRules: + - name: "drop all cadvisor metrics" + filter: "job:cadvisor_*" + drop: True + - name: "cpu metric" + filter: "__name__:container_cpu_usage_seconds_total" + storagePolicies: + - resolution: 1m + retention: 1h + rollupRules: + - name: "container_cpu_usage_seconds_total rolled up" + filter: "__name__:container_cpu_usage_seconds_total" + transforms: + - transform: + type: "Increase" + - rollup: + metricName: "container_cpu_usage_seconds_total_rolled_up" + groupBy: ["container", "namespace", "cpu"] + aggregations: ["Sum"] + - transform: + type: "Add" + storagePolicies: + - resolution: 1m + retention: 1h + matcher: + requireNamespaceWatchOnInit: false + remoteAggregator: + client: + type: m3msg + m3msg: + producer: + writer: + messageRetry: + initialBackoff: 10s + topicName: aggregator_ingest + topicServiceOverride: + zone: embedded + environment: default_env + placement: + isStaged: true + placementServiceOverride: + namespaces: + placement: /placement + connection: + numConnections: 4 + messagePool: + size: 16384 + watermark: + low: 0.2 + high: 0.5 + +ingest: + ingester: + workerPoolSize: 10000 + opPool: + size: 10000 + retry: + maxRetries: 3 + jitter: true + logSampleRate: 0.01 + m3msg: + server: + listenAddress: "0.0.0.0:7507" + retry: + maxBackoff: 10s + jitter: true + +storeMetricsType: true \ No newline at end of file diff --git 
a/scripts/development/m3_prom_remote_stack/port_forward_kube.sh b/scripts/development/m3_prom_remote_stack/port_forward_kube.sh new file mode 100755 index 0000000000..e6051712a7 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/port_forward_kube.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -xe + +kubectl proxy --accept-hosts='^localhost$,^127\.0\.0\.1$,^\[::1\],^host\.docker\.internal$' \ No newline at end of file diff --git a/scripts/development/m3_prom_remote_stack/prom_remote_demo_dashboard.json b/scripts/development/m3_prom_remote_stack/prom_remote_demo_dashboard.json new file mode 100644 index 0000000000..868deda7fd --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/prom_remote_demo_dashboard.json @@ -0,0 +1,229 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": "Prometheus Aggregated", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + 
"w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(container_cpu_usage_seconds_total_rolled_up[1m])", + "interval": "", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Container CPU Usage (Rolled Up)", + "type": "timeseries" + }, + { + "datasource": "Prometheus Aggregated", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "m3aggregator" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(container_cpu_usage_seconds_total[1m])", + "interval": "", + "legendFormat": "{{container}}", + "refId": "A" + } + ], + "title": "Container CPU 
Usage ", + "type": "timeseries" + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Prom Remote Storage Demo", + "uid": "um6GxRvnz", + "version": 1 +} \ No newline at end of file diff --git a/scripts/development/m3_prom_remote_stack/prometheus-scraper.yml b/scripts/development/m3_prom_remote_stack/prometheus-scraper.yml new file mode 100644 index 0000000000..b0baac6ffc --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/prometheus-scraper.yml @@ -0,0 +1,38 @@ +global: + external_labels: + role: "remote" + scrape_interval: 10s + evaluation_interval: 10s + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: +# - "first_rules.yml" +# - "second_rules.yml" + +remote_write: + - url: http://m3coordinator01:7201/api/v1/prom/remote/write + remote_timeout: 30s + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. 
+ - job_name: 'prometheus' + static_configs: + - targets: ['prometheusagg:9090', 'prometheusraw:9090'] + + - job_name: 'coordinator' + static_configs: + - targets: + - m3coordinator01:7203 + + - job_name: 'aggregator' + static_configs: + - targets: ['m3aggregator01:6002'] diff --git a/scripts/development/m3_prom_remote_stack/prometheus.yml b/scripts/development/m3_prom_remote_stack/prometheus.yml new file mode 100644 index 0000000000..2111a7afe5 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/prometheus.yml @@ -0,0 +1,19 @@ +global: + external_labels: + role: "remote" + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: +# - "first_rules.yml" +# - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: diff --git a/scripts/development/m3_prom_remote_stack/start_m3.sh b/scripts/development/m3_prom_remote_stack/start_m3.sh new file mode 100755 index 0000000000..ae551bb7c5 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/start_m3.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash + +set -xe + +source "$(pwd)/../../docker-integration-tests/common.sh" + +# Locally don't care if we hot loop faster +export MAX_TIMEOUT=4 + +RELATIVE="./../../.." 
+prepare_build_cmd() { + build_cmd="cd $RELATIVE && make clean-build docker-dev-prep && cp -r ./docker ./bin/ && $1" +} +DOCKER_ARGS="--detach --renew-anon-volumes" +PROMETHEUS_DOCKER_ARGS="--detach" + +M3COORDINATOR_DEV_IMG=$(docker images m3coordinator:dev | fgrep -iv repository | wc -l | xargs) +M3AGGREGATOR_DEV_IMG=$(docker images m3aggregator:dev | fgrep -iv repository | wc -l | xargs) + +docker-compose -f docker-compose.yml up $DOCKER_ARGS etcd01 + +cp ./m3coordinator-admin.yml ./m3coordinator.yml.tmp + +if [[ "$M3COORDINATOR_DEV_IMG" == "0" ]] || [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_M3COORDINATOR" == true ]]; then + prepare_build_cmd "make m3coordinator-linux-amd64" + echo "Building m3coordinator binary first" + bash -c "$build_cmd" + + docker-compose -f docker-compose.yml up --build $DOCKER_ARGS m3coordinator01 +else + docker-compose -f docker-compose.yml up $DOCKER_ARGS m3coordinator01 +fi + +echo "Wait for coordinator API to be up" +ATTEMPTS=10 MAX_TIMEOUT=4 TIMEOUT=1 retry_with_backoff \ + 'curl -vvvsSf localhost:7201/health' + +echo "Running aggregator pipeline" +curl -vvvsSf -X POST localhost:7201/api/v1/services/m3aggregator/placement/init -d '{ + "num_shards": 64, + "replication_factor": 1, + "instances": [ + { + "id": "m3aggregator01", + "isolation_group": "rack-a", + "zone": "embedded", + "weight": 1024, + "endpoint": "m3aggregator01:6000", + "hostname": "m3aggregator01", + "port": 6000 + } + ] +}' + +echo "Initializing m3msg inbound topic for m3aggregator ingestion from m3coordinators" +curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: default_env" localhost:7201/api/v1/topic/init -d '{ + "numberOfShards": 64 +}' + +echo "Adding m3aggregator as a consumer to the aggregator ingest topic" +curl -vvvsSf -X POST -H "Topic-Name: aggregator_ingest" -H "Cluster-Environment-Name: default_env" localhost:7201/api/v1/topic -d '{ + "consumerService": { + "serviceId": { + "name": "m3aggregator", + "environment": 
"default_env", + "zone": "embedded" + }, + "consumptionType": "REPLICATED", + "messageTtlNanos": "600000000000" + } +}' # msgs will be discarded after 600000000000ns = 10mins + +# Create outbound m3msg topic for m3 aggregators to coordinators +echo "Initializing m3msg outbound topic for m3 aggregators to coordinators" +curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: default_env" localhost:7201/api/v1/topic/init -d '{ + "numberOfShards": 64 +}' + +if [[ "$M3AGGREGATOR_DEV_IMG" == "0" ]] || [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_M3AGGREGATOR" == true ]]; then + prepare_build_cmd "make m3aggregator-linux-amd64" + echo "Building m3aggregator binary first" + bash -c "$build_cmd" + + docker-compose -f docker-compose.yml up --build $DOCKER_ARGS m3aggregator01 +else + docker-compose -f docker-compose.yml up $DOCKER_ARGS m3aggregator01 +fi + +echo "Initializing M3Coordinator topology" +curl -vvvsSf -X POST localhost:7201/api/v1/services/m3coordinator/placement/init -d '{ + "instances": [ + { + "id": "m3coordinator01", + "zone": "embedded", + "endpoint": "m3coordinator01:7507", + "hostname": "m3coordinator01", + "port": 7507 + } + ] +}' +echo "Done initializing M3Coordinator topology" + +echo "Validating M3Coordinator topology" +[ "$(curl -sSf localhost:7201/api/v1/services/m3coordinator/placement | jq .placement.instances.m3coordinator01.id)" == '"m3coordinator01"' ] +echo "Done validating topology" + +# Do this after placement for m3coordinator is created. 
+echo "Adding coordinator as a consumer to the aggregator outbound topic" +curl -vvvsSf -X POST -H "Topic-Name: aggregated_metrics" -H "Cluster-Environment-Name: default_env" localhost:7201/api/v1/topic -d '{ + "consumerService": { + "serviceId": { + "name": "m3coordinator", + "environment": "default_env", + "zone": "embedded" + }, + "consumptionType": "SHARED", + "messageTtlNanos": "600000000000" + } +}' # msgs will be discarded after 600000000000ns = 10mins + +# Restart with aggregator coordinator config +docker-compose -f docker-compose.yml stop m3coordinator01 + +# Note: Use ".tmp" suffix to be git ignored. +cp ./m3coordinator.yml ./m3coordinator.yml.tmp + +docker-compose -f docker-compose.yml up $DOCKER_ARGS m3coordinator01 + +./emit_scrape_configs.sh + +echo "Starting Prometheus" +if [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_PROMETHEUS" == true ]]; then + docker-compose -f docker-compose.yml up --build $PROMETHEUS_DOCKER_ARGS prometheusraw +else + docker-compose -f docker-compose.yml up $PROMETHEUS_DOCKER_ARGS prometheusraw +fi +docker-compose -f docker-compose.yml up $PROMETHEUS_DOCKER_ARGS prometheusagg +docker-compose -f docker-compose.yml up $DOCKER_ARGS prometheusscraper + +echo "Starting Grafana" +if [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_GRAFANA" == true ]]; then + docker-compose -f docker-compose.yml up --build $DOCKER_ARGS grafana +else + docker-compose -f docker-compose.yml up $DOCKER_ARGS grafana +fi + +echo "Grafana available at localhost:3000" +echo "Run ./stop_m3.sh to shutdown nodes when done" diff --git a/scripts/development/m3_prom_remote_stack/stop_m3.sh b/scripts/development/m3_prom_remote_stack/stop_m3.sh new file mode 100755 index 0000000000..44d062b6f8 --- /dev/null +++ b/scripts/development/m3_prom_remote_stack/stop_m3.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -xe + +if [[ "$WIPE_DATA" == true ]]; then + docker-compose -f docker-compose.yml down +else + docker-compose -f docker-compose.yml stop +fi diff --git 
a/scripts/development/m3_stack/start_m3.sh b/scripts/development/m3_stack/start_m3.sh index 88b6c24c3e..740aca4c2d 100755 --- a/scripts/development/m3_stack/start_m3.sh +++ b/scripts/development/m3_stack/start_m3.sh @@ -364,7 +364,11 @@ if [[ "$USE_AGGREGATOR" = true ]]; then fi echo "Starting Prometheus" -docker-compose -f docker-compose.yml up $DOCKER_ARGS prometheus01 +if [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_PROMETHEUS" == true ]]; then + docker-compose -f docker-compose.yml up --build $DOCKER_ARGS prometheus01 +else + docker-compose -f docker-compose.yml up $DOCKER_ARGS prometheus01 +fi if [[ "$USE_PROMETHEUS_HA" = true ]] ; then echo "Starting Prometheus HA replica" @@ -372,7 +376,11 @@ if [[ "$USE_PROMETHEUS_HA" = true ]] ; then fi echo "Starting Grafana" -docker-compose -f docker-compose.yml up $DOCKER_ARGS grafana +if [[ "$FORCE_BUILD" == true ]] || [[ "$BUILD_GRAFANA" == true ]]; then + docker-compose -f docker-compose.yml up --build $DOCKER_ARGS grafana +else + docker-compose -f docker-compose.yml up $DOCKER_ARGS grafana +fi if [[ "$USE_JAEGER" = true ]] ; then echo "Jaeger UI available at localhost:16686"