From 60f284f8f51d5ca49bfa9dae6aae1b7af6b5c4fe Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Sat, 26 Oct 2024 19:19:43 +0300 Subject: [PATCH] ci: migrate to new slo action --- .github/workflows/slo-report.yml | 22 +++++ .github/workflows/slo.yml | 128 +++++++++++++++++------------ tests/slo/Dockerfile | 11 --- tests/slo/src/jobs.py | 10 +-- tests/slo/src/metrics.py | 135 ++++++++++++++++++++----------- tests/slo/src/runner.py | 5 +- 6 files changed, 196 insertions(+), 115 deletions(-) create mode 100644 .github/workflows/slo-report.yml delete mode 100644 tests/slo/Dockerfile diff --git a/.github/workflows/slo-report.yml b/.github/workflows/slo-report.yml new file mode 100644 index 00000000..816d190b --- /dev/null +++ b/.github/workflows/slo-report.yml @@ -0,0 +1,22 @@ +name: SLO Report + +on: + workflow_run: + workflows: ['SLO'] + types: + - completed + +jobs: + ydb-slo-action-report: + runs-on: ubuntu-latest + name: Publish YDB SLO Report + permissions: + contents: read + pull-requests: write + if: github.event.workflow_run.conclusion == 'success' + steps: + - name: Publish YDB SLO Report + uses: ydb-platform/ydb-slo-action/report@main + with: + token: ${{ secrets.GITHUB_TOKEN }} + run_id: ${{ github.event.workflow_run.id }} diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 4ca0adac..a4b90864 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -1,70 +1,98 @@ name: SLO on: + push: + branches: + - main pull_request: - branches: [main] + branches: + - main workflow_dispatch: + inputs: + github_pull_request_number: + required: true + slo_workload_duration_seconds: + default: '600' + required: false + slo_workload_read_max_rps: + default: '1000' + required: false + slo_workload_write_max_rps: + default: '100' + required: false jobs: - test-slo: - concurrency: - group: slo-${{ github.ref }} + ydb-slo-action: if: (!contains(github.event.pull_request.labels.*.name, 'no slo')) + name: Run YDB SLO Tests runs-on: ubuntu-latest - name: SLO test - permissions: - checks: write - pull-requests: write - contents: read - issues: write + + strategy: + matrix: + sdk: + - py-sync-table + - py-sync-query + + concurrency: + group: slo-${{ github.ref }}-${{ matrix.sdk }} + cancel-in-progress: true steps: - name: Checkout repository - uses: actions/checkout@v3 - if: env.DOCKER_REPO != null - env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} + uses: actions/checkout@v4 - - name: Run SLO - uses: ydb-platform/slo-tests@main - if: env.DOCKER_REPO != null - env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} - continue-on-error: true + - name: Install Python3 + uses: actions/setup-python@v5 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }} - AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }} - AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }} - DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }} - DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }} - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} - DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }} - s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }} - s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }} - grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }} - grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }} - ydb_version: 'newest' - timeBetweenPhases: 30 - shutdownTime: 30 + python-version: '3.8' + cache: 'pip' - language_id0: sync-python-table - language0: Python SDK over Table Service - workload_path0: tests/slo - workload_build_context0: ../.. - workload_build_options0: -f Dockerfile --build-arg SDK_SERVICE=sync-python-table + - name: Install dependencies + run: | + python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir -e . + python -m pip install --no-cache-dir -r tests/slo/requirements.txt - language_id1: sync-python-query - language1: Python SDK over Query Service - workload_path1: tests/slo - workload_build_context1: ../.. - workload_build_options1: -f Dockerfile --build-arg SDK_SERVICE=sync-python-query + - name: Initialize YDB SLO + uses: ydb-platform/ydb-slo-action/init@main + with: + github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }} + github_token: ${{ secrets.GITHUB_TOKEN }} + sdk_name: ${{ matrix.sdk }} + ydb_database_node_count: 5 + + - name: Prepare SLO Database + run: | + python ./tests/slo/src create grpc://localhost:2135 /Root/testdb - - uses: actions/upload-artifact@v3 - if: env.DOCKER_REPO != null + - name: Run SLO Tests env: - DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }} + REF: '${{ github.head_ref || github.ref }}' + SDK_SERVICE: '${{ matrix.sdk }}' + run: | + python ./tests/slo/src run grpc://localhost:2135 /Root/testdb \ + --prom-pgw localhost:9091 \ + --report-period 250 \ + --time ${{inputs.slo_workload_duration_seconds || 600}} \ + --read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \ + --write-rps ${{inputs.slo_workload_write_max_rps || 100}} \ + --read-timeout 1000 \ + --write-timeout 1000 || true + + - if: always() + name: Store ydb chaos testing logs + run: | + docker logs ydb-chaos > chaos-ydb.log + + - if: always() + uses: actions/upload-artifact@v4 with: - name: slo-logs - path: logs/ + name: ${{ matrix.sdk }}-chaos-ydb.log + path: ./chaos-ydb.log + retention-days: 1 + + - if: always() + name: Cleanup SLO Database + run: | + python ./tests/slo/src cleanup grpc://localhost:2135 /Root/testdb || true diff --git a/tests/slo/Dockerfile b/tests/slo/Dockerfile deleted file mode 100644 index 7a8cc494..00000000 --- a/tests/slo/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM python:3.8-slim -COPY . /src -WORKDIR /src -RUN python -m pip install --no-cache-dir --upgrade pip && \ - python -m pip install --no-cache-dir -e . && \ - python -m pip install --no-cache-dir -r tests/slo/requirements.txt -WORKDIR tests/slo -ARG SDK_SERVICE -ENV SDK_SERVICE=$SDK_SERVICE - -ENTRYPOINT ["python", "src"] diff --git a/tests/slo/src/jobs.py b/tests/slo/src/jobs.py index 4fe0cd37..c9bd7316 100644 --- a/tests/slo/src/jobs.py +++ b/tests/slo/src/jobs.py @@ -8,7 +8,7 @@ import threading -from metrics import Metrics, JOB_WRITE_LABEL, JOB_READ_LABEL +from metrics import Metrics, OP_TYPE_WRITE, OP_TYPE_READ from generator import RowGenerator @@ -106,7 +106,7 @@ def check_result(result): query=query, params=params, metrics=metrics, - labels=(JOB_READ_LABEL,), + labels=(OP_TYPE_READ,), request_settings=request_settings, retry_settings=retry_setting, check_result_cb=check_result, @@ -163,7 +163,7 @@ def check_result(result): query=query, params=params, metrics=metrics, - labels=(JOB_READ_LABEL,), + labels=(OP_TYPE_READ,), request_settings=request_settings, retry_settings=retry_setting, check_result_cb=check_result, @@ -220,7 +220,7 @@ def run_writes(driver, query, row_generator, metrics, limiter, runtime, timeout) query=query, params=params, metrics=metrics, - labels=(JOB_WRITE_LABEL,), + labels=(OP_TYPE_WRITE,), request_settings=request_settings, retry_settings=retry_setting, ) @@ -285,7 +285,7 @@ def check_result(result): query=query, params=params, metrics=metrics, - labels=(JOB_WRITE_LABEL,), + labels=(OP_TYPE_WRITE,), request_settings=request_settings, retry_settings=retry_setting, check_result_cb=check_result, diff --git a/tests/slo/src/metrics.py b/tests/slo/src/metrics.py index b9d33a5c..e89dbbec 100644 --- a/tests/slo/src/metrics.py +++ b/tests/slo/src/metrics.py @@ -7,13 +7,13 @@ environ["PROMETHEUS_DISABLE_CREATED_SERIES"] = "True" -from prometheus_client import CollectorRegistry, Gauge, Histogram, push_to_gateway # noqa: E402 -from summary import Summary # noqa: E402 +from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, push_to_gateway # noqa: E402 -JOB_READ_LABEL, JOB_WRITE_LABEL = "read", "write" -JOB_STATUS_OK, JOB_STATUS_ERR = "ok", "err" +OP_TYPE_READ, OP_TYPE_WRITE = "read", "write" +OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err" -SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "sync-python-table") +REF = environ.get("REF", "main") +SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table") class Metrics: @@ -21,41 +21,83 @@ def __init__(self, push_gateway): self._push_gtw = push_gateway self._registry = CollectorRegistry() self._metrics = dict( - oks=Gauge( - "oks", - "amount of OK requests", - labelnames=("jobName",), + errors_total=Counter( + "sdk_errors_total", + "Total number of errors encountered, categorized by error type.", + labelnames=("operation_type", "error_type"), registry=self._registry, ), - not_oks=Gauge( - "not_oks", - "amount of not OK requests", - labelnames=("jobName",), + operations_total=Counter( + "sdk_operations_total", + "Total number of operations, categorized by type attempted by the SDK.", + labelnames=("operation_type",), registry=self._registry, ), - inflight=Gauge( - "inflight", - "amount of requests in flight", - labelnames=("jobName",), + operations_success_total=Counter( + "sdk_operations_success_total", + "Total number of successful operations, categorized by type.", + labelnames=("operation_type",), registry=self._registry, ), - latency=Summary( - "latency", - "summary of latencies in ms", - labelnames=("jobName", "status"), + operations_failure_total=Counter( + "sdk_operations_failure_total", + "Total number of failed operations, categorized by type.", + labelnames=("operation_type",), registry=self._registry, - objectives=( - (0.5, 0.01), - (0.99, 0.001), - (1.0, 0.0), + ), + operation_latency_seconds=Histogram( + "sdk_operation_latency_seconds", + "Latency of operations performed by the SDK in seconds, categorized by type and status.", + labelnames=( + "operation_type", + "operation_status", + ), + registry=self._registry, + buckets=( + 0.001, # 1 ms + 0.002, # 2 ms + 0.003, # 3 ms + 0.004, # 4 ms + 0.005, # 5 ms + 0.0075, # 7.5 ms + 0.010, # 10 ms + 0.020, # 20 ms + 0.050, # 50 ms + 0.100, # 100 ms + 0.200, # 200 ms + 0.500, # 500 ms + 1.000, # 1 s ), ), - attempts=Histogram( - "attempts", - "histogram of amount of requests", - labelnames=("jobName", "status"), + retry_attempts=Gauge( + "sdk_retry_attempts", + "Current retry attempts, categorized by operation type.", + labelnames=("operation_type",), + registry=self._registry, + ), + retry_attempts_total=Counter( + "sdk_retry_attempts_total", + "Total number of retry attempts, categorized by operation type.", + labelnames=("operation_type",), + registry=self._registry, + ), + retries_success_total=Counter( + "sdk_retries_success_total", + "Total number of successful retries, categorized by operation type.", + labelnames=("operation_type",), + registry=self._registry, + ), + retries_failure_total=Counter( + "sdk_retries_failure_total", + "Total number of failed retries, categorized by operation type.", + labelnames=("operation_type",), + registry=self._registry, + ), + pending_operations=Gauge( + "sdk_pending_operations", + "Current number of pending operations, categorized by type.", + labelnames=("operation_type",), registry=self._registry, - buckets=tuple(range(1, 11)), ), ) self.reset() @@ -81,25 +123,30 @@ def start(self, labels): if not isinstance(labels, Iterable): labels = (labels,) - self.inflight.labels(*labels).inc() + self.pending_operations.labels(*labels).inc() return time.time() def stop(self, labels, start_time, attempts=1, error=None): - runtime_ms = 1000 * (time.time() - start_time) + duration = time.time() - start_time if not isinstance(labels, Iterable): labels = (labels,) - self.inflight.labels(*labels).dec() + self.retry_attempts.labels(*labels).set(attempts) + self.operations_total.labels(*labels).inc() + self.pending_operations.labels(*labels).dec() + self.retry_attempts_total.labels(*labels).inc(attempts) if error: - self.not_oks.labels(*labels).inc() - self.latency.labels(*labels, JOB_STATUS_ERR).observe(runtime_ms) + self.errors_total.labels(*labels, type(error).__name__).inc() + self.retries_failure_total.labels(*labels).inc(attempts) + self.operations_failure_total.labels(*labels).inc() + self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration) return - self.oks.labels(*labels).inc() - self.latency.labels(*labels, JOB_STATUS_OK).observe(runtime_ms) - self.attempts.labels(*labels, JOB_STATUS_OK).observe(attempts) + self.retries_success_total.labels(*labels).inc(attempts) + self.operations_success_total.labels(*labels).inc() + self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration) def push(self): push_to_gateway( @@ -107,18 +154,14 @@ def push(self): job=f"workload-{SDK_SERVICE_NAME}", registry=self._registry, grouping_key={ + "ref": REF, "sdk": SDK_SERVICE_NAME, - "sdkVersion": version("ydb"), + "sdk_version": version("ydb"), }, ) def reset(self): - for label in (JOB_READ_LABEL, JOB_WRITE_LABEL): - self.oks.labels(label).set(0) - self.not_oks.labels(label).set(0) - self.inflight.labels(label).set(0) - - self.latency.clear() - self.attempts.clear() + for m in self._metrics.values(): + m.clear() self.push() diff --git a/tests/slo/src/runner.py b/tests/slo/src/runner.py index b9380436..1472d49a 100644 --- a/tests/slo/src/runner.py +++ b/tests/slo/src/runner.py @@ -91,13 +91,13 @@ def run_slo(args, driver, tb_name): logger.info("Max ID: %s", max_id) metrics = Metrics(args.prom_pgw) - if SDK_SERVICE_NAME == "sync-python-table": + if SDK_SERVICE_NAME == "py-sync-table": futures = ( *run_read_jobs(args, driver, tb_name, max_id, metrics), *run_write_jobs(args, driver, tb_name, max_id, metrics), run_metric_job(args, metrics), ) - elif SDK_SERVICE_NAME == "sync-python-query": + elif SDK_SERVICE_NAME == "py-sync-query": futures = ( *run_read_jobs_query(args, driver, tb_name, max_id, metrics), *run_write_jobs_query(args, driver, tb_name, max_id, metrics), @@ -121,7 +121,6 @@ def run_from_args(args): driver_config = ydb.DriverConfig( args.endpoint, database=args.db, - credentials=ydb.credentials_from_env_variables(), grpc_keep_alive_timeout=5000, )