Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Send metrics to datadog #22

Closed
wants to merge 42 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9b8585f
Debug node type
gerardsegarra Apr 24, 2024
4210a49
fixup! Debug node type
gerardsegarra Apr 24, 2024
6ae575c
Install all gql
gerardsegarra Apr 24, 2024
a08b561
remove
gerardsegarra Apr 24, 2024
c635da5
Tag sha
gerardsegarra Apr 24, 2024
3756e6d
Add query node
gerardsegarra Apr 24, 2024
7cd92ea
Add node elements
gerardsegarra Apr 24, 2024
6cf98dd
Log tracked jobs
gerardsegarra Apr 24, 2024
d91d8a8
fixup! Log tracked jobs
gerardsegarra Apr 24, 2024
90b0f12
Discard
gerardsegarra Apr 24, 2024
0653e17
Run
gerardsegarra Apr 24, 2024
5d7c793
Fix context and return
gerardsegarra Apr 24, 2024
fb3da6f
Fix context and return
gerardsegarra Apr 24, 2024
78ab199
Fix context and return
gerardsegarra Apr 24, 2024
af772ce
Monitor queue
gerardsegarra Apr 25, 2024
e2f888b
Make latest only on not PR
gerardsegarra Apr 25, 2024
f08329f
fixup! Make latest only on not PR
gerardsegarra Apr 25, 2024
00eaee1
push image
gerardsegarra Apr 25, 2024
8ac8b44
Push wrong runs-on
gerardsegarra Apr 25, 2024
c97b1bd
Add debug
gerardsegarra Apr 25, 2024
585f8fb
fixup! Add debug
gerardsegarra Apr 25, 2024
061631d
Add metric of seconds_in_queue
gerardsegarra Apr 25, 2024
6d27aab
Install datadog and send metric
gerardsegarra Apr 25, 2024
90f6ab8
Rename metic
gerardsegarra Apr 26, 2024
9dfb6ad
Fix metric name
gerardsegarra Apr 26, 2024
9ceba96
Send if runner is public
gerardsegarra Apr 26, 2024
3b1439f
Fix metric name
gerardsegarra Apr 26, 2024
7020328
Fix test
gerardsegarra Apr 26, 2024
3ff0fa3
fixup! Fix test
gerardsegarra Apr 26, 2024
ef3005e
fixup! fixup! Fix test
gerardsegarra Apr 26, 2024
66b4e62
Get job
gerardsegarra Apr 26, 2024
8215cfd
fixup! Get job
gerardsegarra Apr 26, 2024
009f3cb
Get data in job_processor
gerardsegarra Apr 26, 2024
97b5515
Do not remove outside queued
gerardsegarra Apr 26, 2024
2aeaaf8
Debug
gerardsegarra Apr 26, 2024
e141af7
fixup! Debug
gerardsegarra Apr 26, 2024
e1053bd
Add repo
gerardsegarra Apr 26, 2024
9bfbf17
RunId
gerardsegarra Apr 26, 2024
9c0be0f
Add run id to tags
gerardsegarra Apr 26, 2024
ac39d24
Tag
gerardsegarra Apr 26, 2024
f15d2c9
debug
gerardsegarra Apr 26, 2024
aeb919a
Repository name
gerardsegarra Apr 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,17 @@ jobs:
with:
images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/github-workflows-monitoring
flavor: |
latest=true
latest=${{ github.event_name != 'pull_request' }}
tags: |
type=sha
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Docker build and push
uses: docker/build-push-action@v4
with:
push: ${{ github.event_name != 'pull_request' }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .
5 changes: 2 additions & 3 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ name: Tests

on:
push:
branches: [ main ]
pull_request:
# pull_request:

jobs:
build:
Expand All @@ -12,7 +11,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.11"]
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ packages = find:
install_requires =
Flask>=2.2,<3
Flask-APScheduler==1.13.1
gql[all]==3.5.0
datadog==0.49.1

[flake8]
max-line-length = 120
Expand Down
60 changes: 37 additions & 23 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from datetime import datetime
import logging
from logging.config import dictConfig
import os
Expand All @@ -10,6 +9,18 @@
from const import GithubHeaders, LOGGING_CONFIG
from github import GithubJob
from utils import dict_to_logfmt
from queryql import query_nodes
from job_processor import extract_jobs_metrics_from_data

from datadog import initialize, statsd

options = {
'statsd_host': 'datadog-agent.datadog.svc.cluster.local',
'statsd_port': 8125,
}

initialize(**options)


dictConfig(LOGGING_CONFIG)

Expand All @@ -26,6 +37,7 @@
logging.getLogger('apscheduler.executors.default').setLevel(logging.WARNING)

jobs = dict()
node_ids = dict()


# check all calls are valid
Expand Down Expand Up @@ -61,11 +73,13 @@ def process_workflow_job():
"job_name": job.name,
"workflow": job.workflow,
"requestor": job.requestor,
"node_id": job.node_id,
}

if job.action == "queued":
# add to memory
jobs[job.id] = job
node_ids[job.node_id] = job

elif job.action == "in_progress":
job_requested = jobs.get(job.id)
Expand Down Expand Up @@ -124,34 +138,34 @@ def process_workflow_job():
return True


@scheduler.task('interval', id='monitor_queued', seconds=30)
# Add GH_PAT_SECRET

@scheduler.task('interval', id='monitor_queued', seconds=15)
def monitor_queued_jobs():
"""Return the job that has been queued and not starting for long time."""
app.logger.debug("Starting monitor_queued_jobs")

if not jobs:
return

queued_jobs = [job for job in jobs.values() if job.action == "queued"]
if not queued_jobs:
return

job = min(queued_jobs, key=lambda x: x.time_start)
delay = (datetime.now() - job.time_start).seconds

if delay <= int(os.getenv("QUEUED_JOBS_DELAY_THRESHOLD", 150)):
if not node_ids:
return

context_details = {
"action": "monitor_queued",
"job_id": job.id,
"job_name": job.name,
"repository": job.repository,
"started_at": job.time_start,
"delay": delay,
}

app.logger.info(dict_to_logfmt(context_details))
jobs_data = query_nodes(list(node_ids.keys()))
details = extract_jobs_metrics_from_data(jobs_data, node_ids)

for run in details:
app.logger.info(f"DETAIL {run}")
statsd.histogram(
'midokura.github_runners.jobs.seconds_in_queue.histogram',
run["seconds_in_queue"],
tags=[
f"job:{run['job_name']}",
f"repository:{run['repository']}",
f"runner_name:{run['runner_name']}",
f"run_id:run-{run['run_id']}", # "run-" added to group by run-id in DD
f"public:{run['is_public']}"
]
)

app.logger.info(f"Jobs details {details}")


allowed_events = {"workflow_job": process_workflow_job}
Expand Down
4 changes: 4 additions & 0 deletions src/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ def id(self):
def job_id(self):
return self.id

@property
def node_id(self):
    """Return the ``node_id`` field of the ``workflow_job`` payload."""
    workflow_job = self.data["workflow_job"]
    return workflow_job["node_id"]

@property
def run_id(self):
    """Return the ``run_id`` field of the ``workflow_job`` payload."""
    workflow_job = self.data["workflow_job"]
    return workflow_job["run_id"]
Expand Down
28 changes: 28 additions & 0 deletions src/job_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from datetime import datetime


def extract_jobs_metrics_from_data(jobs_data: dict, queued_node_ids: dict):
    """Build one metrics record per tracked job node found in ``jobs_data``.

    Args:
        jobs_data: Query result whose ``"nodes"`` list holds one mapping per
            requested node, with the keys ``id``, ``name``, ``status``,
            ``startedAt`` and ``repository`` (itself a mapping with ``name``).
        queued_node_ids: Mapping of node id to the tracked job object, which
            must expose ``run_id``, ``runner_public`` and ``runner_name``.
            Mutated in place: an entry is removed once its node reports a
            status other than ``"QUEUED"``.

    Returns:
        A list of dicts with the keys ``action``, ``job_id``, ``job_name``,
        ``repository``, ``run_id``, ``is_public``, ``runner_name`` and
        ``seconds_in_queue`` (float seconds elapsed since ``startedAt``).
        A job that just left the queue is still reported one last time
        before it is dropped from ``queued_node_ids``.
    """
    # Local import: the module header only brings ``datetime`` into scope.
    from datetime import timezone

    # ``startedAt`` carries a trailing "Z" (UTC). Measuring against an aware
    # UTC "now" keeps the elapsed time independent of the host's timezone.
    now = datetime.now(timezone.utc)
    jobs_metrics = []

    for job in jobs_data["nodes"]:
        # Skip entries that are null or that carry no fields at all (the
        # fields are only selected for CheckRun nodes).
        if not job or "id" not in job:
            continue

        node_id = job["id"]
        tracked_job = queued_node_ids.get(node_id)
        if tracked_job is None:
            # No longer tracked, so there is nothing to attribute it to.
            continue

        started_raw = job.get("startedAt")
        if started_raw:
            started_at = datetime.strptime(
                started_raw, "%Y-%m-%dT%H:%M:%SZ"
            ).replace(tzinfo=timezone.utc)

            jobs_metrics.append({
                "action": "monitor_queued",
                "job_id": node_id,
                "job_name": job["name"],
                "repository": job["repository"]["name"],
                "run_id": tracked_job.run_id,
                "is_public": tracked_job.runner_public,
                "runner_name": tracked_job.runner_name,
                "seconds_in_queue": (now - started_at).total_seconds(),
            })

        # Stop tracking a job as soon as it has left the queue.
        if job["status"] != "QUEUED":
            queued_node_ids.pop(node_id, None)

    return jobs_metrics
49 changes: 49 additions & 0 deletions src/queryql.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os

from typing import List

from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport

# HTTP headers sent with every GraphQL request. The token is read from the
# GH_PAT environment variable once, when this module is imported.
# NOTE(review): os.getenv returns None when GH_PAT is unset, so the header
# would then be the literal string "bearer None" - confirm the deployment
# always provides the variable.
headers = {
    "Authorization": f"bearer {os.getenv('GH_PAT')}"
}
# aiohttp-based transport pointed at GitHub's GraphQL endpoint.
transport = AIOHTTPTransport(url="https://api.github.com/graphql", headers=headers)

# Module-level GraphQL client used by query_nodes below.
# NOTE(review): fetch_schema_from_transport=True presumably makes gql fetch
# the schema from the server before executing queries - confirm against the
# gql documentation whether that extra request is wanted here.
client = Client(transport=transport, fetch_schema_from_transport=True)


# Provide a GraphQL query
def query_nodes(node_id_list: List[str]):
    """Look up the given node ids through the module-level GraphQL ``client``.

    For every node that is a ``CheckRun`` the query selects its id, name,
    status, start/completion timestamps, repository (owner login and name)
    and the event and run number of the owning workflow run.

    Args:
        node_id_list: Node ids to resolve; passed as the ``$node_id_list``
            query variable.

    Returns:
        Whatever ``client.execute`` returns for the query.
    """
    document = gql(
        """
query getCheckRuns($node_id_list: [ID!]!) {
nodes(ids: $node_id_list) {
... on CheckRun {
id
name
status
startedAt
completedAt
repository {
owner {
login
}
name
}
checkSuite {
workflowRun {
event
runNumber
}
}
}
}
}
"""
    )
    return client.execute(document, variable_values={"node_id_list": node_id_list})
3 changes: 1 addition & 2 deletions tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
Flask
Flask-APScheduler==1.13.1
-e .
pytest
pytest-cov
flake8
1 change: 1 addition & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"workflow_job": {
"id": 0,
"run_id": 10,
"node_id": "CR_blah",
"workflow_name": "CI",
"head_branch": "new-feature-branch",
"started_at": "2023-01-27T14:00:00Z",
Expand Down
Loading