Skip to content

Commit

Permalink
ci: migrate to new slo action
Browse files Browse the repository at this point in the history
  • Loading branch information
polRk committed Oct 28, 2024
1 parent cea4669 commit a6d71a8
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 117 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/slo-report.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Publishes the YDB SLO report once a run of the 'slo' workflow has finished.
name: slo-report

on:
  # Fires on completion of any run of the workflow named 'slo'.
  workflow_run:
    workflows: ['slo']
    types:
      - completed

jobs:
  test-ydb-slo-action:
    runs-on: ubuntu-latest
    name: Publish YDB SLO Report
    permissions:
      contents: read
      # NOTE(review): presumably needed so the report action can comment on
      # the pull request - confirm against the action's documentation.
      pull-requests: write
    # Skip reporting when the triggering 'slo' run did not succeed.
    if: github.event.workflow_run.conclusion == 'success'
    steps:
      - name: Publish YDB SLO Report
        # NOTE(review): tracks the moving 'main' ref; consider pinning to a
        # release tag or commit SHA for reproducible runs.
        uses: ydb-platform/ydb-slo-action/report@main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          # ID of the 'slo' workflow run that triggered this workflow.
          run_id: ${{ github.event.workflow_run.id }}
112 changes: 60 additions & 52 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
@@ -1,70 +1,78 @@
name: SLO
name: slo

on:
push:
branches:
- main
pull_request:
branches: [main]
branches:
- main
workflow_dispatch:
inputs:
github_pull_request_number:
required: true
slo_workload_duration_seconds:
default: '600'
required: false
slo_workload_read_max_rps:
default: '1000'
required: false
slo_workload_write_max_rps:
default: '100'
required: false

jobs:
test-slo:
concurrency:
group: slo-${{ github.ref }}
ydb-slo-action-init:
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))

concurrency:
group: slo-${{ github.ref }}-${{ matrix.sdk }}
cancel-in-progress: true

name: Run YDB SLO Tests
runs-on: ubuntu-latest
name: SLO test
permissions:
checks: write
pull-requests: write
contents: read
issues: write

strategy:
matrix:
sdk:
- py-sync-table
- py-sync-query

steps:
- name: Checkout repository
uses: actions/checkout@v3
if: env.DOCKER_REPO != null
env:
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
uses: actions/checkout@v4

- name: Run SLO
uses: ydb-platform/slo-tests@main
if: env.DOCKER_REPO != null
env:
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
continue-on-error: true
- name: Install Python3
uses: actions/setup-python@v5
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
KUBECONFIG_B64: ${{ secrets.SLO_KUBE_CONFIG }}
AWS_CREDENTIALS_B64: ${{ secrets.SLO_AWS_CREDENTIALS }}
AWS_CONFIG_B64: ${{ secrets.SLO_AWS_CONFIG }}
DOCKER_USERNAME: ${{ secrets.SLO_DOCKER_USERNAME }}
DOCKER_PASSWORD: ${{ secrets.SLO_DOCKER_PASSWORD }}
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
DOCKER_FOLDER: ${{ secrets.SLO_DOCKER_FOLDER }}
s3_endpoint: ${{ secrets.SLO_S3_ENDPOINT }}
s3_images_folder: ${{ vars.SLO_S3_IMAGES_FOLDER }}
grafana_domain: ${{ vars.SLO_GRAFANA_DOMAIN }}
grafana_dashboard: ${{ vars.SLO_GRAFANA_DASHBOARD }}
ydb_version: 'newest'
timeBetweenPhases: 30
shutdownTime: 30
python-version: '3.8'
cache: 'pip'

language_id0: sync-python-table
language0: Python SDK over Table Service
workload_path0: tests/slo
workload_build_context0: ../..
workload_build_options0: -f Dockerfile --build-arg SDK_SERVICE=sync-python-table
- name: Install dependencies
run: |
python -m pip install --no-cache-dir --upgrade pip
python -m pip install --no-cache-dir -e .
python -m pip install --no-cache-dir -r tests/slo/requirements.txt
language_id1: sync-python-query
language1: Python SDK over Query Service
workload_path1: tests/slo
workload_build_context1: ../..
workload_build_options1: -f Dockerfile --build-arg SDK_SERVICE=sync-python-query
- name: Initialize YDB SLO
uses: ydb-platform/ydb-slo-action/init@main
with:
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
github_token: ${{ secrets.GITHUB_TOKEN }}
sdk_name: ${{ matrix.sdk }}

- uses: actions/upload-artifact@v3
if: env.DOCKER_REPO != null
- name: Run SLO Tests
env:
DOCKER_REPO: ${{ secrets.SLO_DOCKER_REPO }}
with:
name: slo-logs
path: logs/
REF: '${{ github.head_ref || github.ref }}'
SDK_SERVICE: '${{ matrix.sdk }}'
run: |
python ./tests/slo/src create grpc://localhost:2135 /Root/testdb
python ./tests/slo/src run grpc://localhost:2135 /Root/testdb \
--prom-pgw localhost:9091 \
--report-period 250 \
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 10000 \
--write-timeout 10000 \
--time ${{inputs.slo_workload_duration_seconds || 600}}
python ./tests/slo/src cleanup grpc://localhost:2135 /Root/testdb
11 changes: 0 additions & 11 deletions tests/slo/Dockerfile

This file was deleted.

10 changes: 5 additions & 5 deletions tests/slo/src/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import threading

from metrics import Metrics, JOB_WRITE_LABEL, JOB_READ_LABEL
from metrics import Metrics, OP_TYPE_WRITE, OP_TYPE_READ

from generator import RowGenerator

Expand Down Expand Up @@ -106,7 +106,7 @@ def check_result(result):
query=query,
params=params,
metrics=metrics,
labels=(JOB_READ_LABEL,),
labels=(OP_TYPE_READ,),
request_settings=request_settings,
retry_settings=retry_setting,
check_result_cb=check_result,
Expand Down Expand Up @@ -163,7 +163,7 @@ def check_result(result):
query=query,
params=params,
metrics=metrics,
labels=(JOB_READ_LABEL,),
labels=(OP_TYPE_READ,),
request_settings=request_settings,
retry_settings=retry_setting,
check_result_cb=check_result,
Expand Down Expand Up @@ -220,7 +220,7 @@ def run_writes(driver, query, row_generator, metrics, limiter, runtime, timeout)
query=query,
params=params,
metrics=metrics,
labels=(JOB_WRITE_LABEL,),
labels=(OP_TYPE_WRITE,),
request_settings=request_settings,
retry_settings=retry_setting,
)
Expand Down Expand Up @@ -285,7 +285,7 @@ def check_result(result):
query=query,
params=params,
metrics=metrics,
labels=(JOB_WRITE_LABEL,),
labels=(OP_TYPE_WRITE,),
request_settings=request_settings,
retry_settings=retry_setting,
check_result_cb=check_result,
Expand Down
128 changes: 82 additions & 46 deletions tests/slo/src/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,55 +7,91 @@

environ["PROMETHEUS_DISABLE_CREATED_SERIES"] = "True"

from prometheus_client import CollectorRegistry, Gauge, Histogram, push_to_gateway # noqa: E402
from summary import Summary # noqa: E402
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, push_to_gateway # noqa: E402

JOB_READ_LABEL, JOB_WRITE_LABEL = "read", "write"
JOB_STATUS_OK, JOB_STATUS_ERR = "ok", "err"
OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"

SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "sync-python-table")
REF = environ.get("REF", "main")
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table")


class Metrics:
def __init__(self, push_gateway):
self._push_gtw = push_gateway
self._registry = CollectorRegistry()
self._metrics = dict(
oks=Gauge(
"oks",
"amount of OK requests",
labelnames=("jobName",),
errors_total=Counter(
"sdk_errors_total",
"Total number of errors encountered, categorized by error type.",
labelnames=("operation_type", "error_type"),
registry=self._registry,
),
not_oks=Gauge(
"not_oks",
"amount of not OK requests",
labelnames=("jobName",),
operations_total=Counter(
"sdk_operations_total",
"Total number of operations, categorized by type attempted by the SDK.",
labelnames=("operation_type",),
registry=self._registry,
),
inflight=Gauge(
"inflight",
"amount of requests in flight",
labelnames=("jobName",),
operations_success_total=Counter(
"sdk_operations_success_total",
"Total number of successful operations, categorized by type.",
labelnames=("operation_type",),
registry=self._registry,
),
latency=Summary(
"latency",
"summary of latencies in ms",
labelnames=("jobName", "status"),
operations_failure_total=Counter(
"sdk_operations_failure_total",
"Total number of failed operations, categorized by type.",
labelnames=("operation_type",),
registry=self._registry,
objectives=(
(0.5, 0.01),
(0.99, 0.001),
(1.0, 0.0),
),
operation_latency_seconds=Histogram(
"sdk_operation_latency_seconds",
"Latency of operations performed by the SDK in seconds, categorized by type and status.",
labelnames=(
"operation_type",
"operation_status",
),
registry=self._registry,
buckets=(
0.001, # 1 ms
0.002, # 2 ms
0.003, # 3 ms
0.004, # 4 ms
0.005, # 5 ms
0.0075, # 7.5 ms
0.010, # 10 ms
0.020, # 20 ms
0.050, # 50 ms
0.100, # 100 ms
0.200, # 200 ms
0.500, # 500 ms
1.000, # 1 s
),
),
attempts=Histogram(
"attempts",
"histogram of amount of requests",
labelnames=("jobName", "status"),
retry_attempts_total=Counter(
"sdk_retry_attempts_total",
"Total number of retry attempts, categorized by operation type.",
labelnames=("operation_type",),
registry=self._registry,
),
retries_success_total=Counter(
"sdk_retries_success_total",
"Total number of successful retries, categorized by operation type.",
labelnames=("operation_type",),
registry=self._registry,
),
retries_failure_total=Counter(
"sdk_retries_failure_total",
"Total number of failed retries, categorized by operation type.",
labelnames=("operation_type",),
registry=self._registry,
),
pending_operations=Gauge(
"sdk_pending_operations",
"Current number of pending operations, categorized by type.",
labelnames=("operation_type",),
registry=self._registry,
buckets=tuple(range(1, 11)),
),
)
self.reset()
Expand All @@ -81,44 +117,44 @@ def start(self, labels):
if not isinstance(labels, Iterable):
labels = (labels,)

self.inflight.labels(*labels).inc()
self.pending_operations.labels(*labels).inc()
return time.time()

def stop(self, labels, start_time, attempts=1, error=None):
runtime_ms = 1000 * (time.time() - start_time)
duration = time.time() - start_time

if not isinstance(labels, Iterable):
labels = (labels,)

self.inflight.labels(*labels).dec()
self.operations_total.labels(*labels).inc()
self.pending_operations.labels(*labels).dec()
self.retry_attempts_total.labels(*labels).inc(attempts)

if error:
self.not_oks.labels(*labels).inc()
self.latency.labels(*labels, JOB_STATUS_ERR).observe(runtime_ms)
self.errors_total.labels(*labels, type(error).__name__).inc()
self.retries_failure_total.labels(*labels).inc(attempts)
self.operations_failure_total.labels(*labels).inc()
self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration)
return

self.oks.labels(*labels).inc()
self.latency.labels(*labels, JOB_STATUS_OK).observe(runtime_ms)
self.attempts.labels(*labels, JOB_STATUS_OK).observe(attempts)
self.retries_success_total.labels(*labels).inc(attempts)
self.operations_success_total.labels(*labels).inc()
self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration)

def push(self):
push_to_gateway(
self._push_gtw,
job=f"workload-{SDK_SERVICE_NAME}",
registry=self._registry,
grouping_key={
"ref": REF,
"sdk": SDK_SERVICE_NAME,
"sdkVersion": version("ydb"),
"sdk_version": version("ydb"),
},
)

def reset(self):
for label in (JOB_READ_LABEL, JOB_WRITE_LABEL):
self.oks.labels(label).set(0)
self.not_oks.labels(label).set(0)
self.inflight.labels(label).set(0)

self.latency.clear()
self.attempts.clear()
for m in self._metrics.values():
m.clear()

self.push()
Loading

0 comments on commit a6d71a8

Please sign in to comment.