Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Phase Stats are now also saved if a run fails #972

Merged
merged 8 commits into from
Nov 19, 2024
4 changes: 3 additions & 1 deletion api/api_helpers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to STDERR
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

from functools import cache
from html import escape as html_escape
Expand Down Expand Up @@ -212,6 +213,7 @@ def get_timeline_query(uri, filename, machine_id, branch, metrics, phase, start_
AND r.filename = %s
AND r.branch = %s
AND r.end_measurement IS NOT NULL
AND r.failed != TRUE
AND r.machine_id = %s
AND p.phase LIKE %s
{metrics_condition}
Expand Down
6 changes: 3 additions & 3 deletions api/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import faulthandler

# It seems like FastAPI already enables faulthandler as it shows stacktrace on SEGFAULT
# Is the redundant call problematic?
faulthandler.enable() # will catch segfaults and write to STDERR
import sys
import faulthandler
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import zlib
import base64
Expand Down
3 changes: 2 additions & 1 deletion cron/carbondb_compress.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os

Expand Down
6 changes: 3 additions & 3 deletions cron/carbondb_copy_over_and_remove_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def copy_over_eco_ci():
EXTRACT(EPOCH FROM created_at) * 1e6,
(energy_uj::DOUBLE PRECISION)/1e6/3600/1000, -- to get to kWh
(carbon_ug::DOUBLE PRECISION)/1e9, -- to get to kg
0, -- there is no need for this column for further processing
0.0, -- there is no need for this column for further processing
0.0, -- there is no need for this column for further processing
0, -- (carbon_intensity_g) there is no need for this column for further processing
0.0, -- (latitude) there is no need for this column for further processing
0.0, -- (longitude) there is no need for this column for further processing
ip_address,
user_id,
created_at
Expand Down
4 changes: 2 additions & 2 deletions cron/client.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os
import sys
import time
import subprocess
import json
Expand Down
4 changes: 2 additions & 2 deletions cron/jobs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=cyclic-import
import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import sys
import os
from datetime import datetime
import argparse
Expand Down
3 changes: 2 additions & 1 deletion cron/timeline_projects.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os
import pprint
Expand Down
3 changes: 2 additions & 1 deletion lib/diff.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

from lib.db import DB
from deepdiff import DeepDiff
Expand Down
3 changes: 2 additions & 1 deletion lib/job/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=cyclic-import
import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os
import importlib
Expand Down
3 changes: 2 additions & 1 deletion lib/job/email.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=cyclic-import
import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os

Expand Down
10 changes: 5 additions & 5 deletions lib/job/run.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=cyclic-import

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os

Expand All @@ -14,7 +16,6 @@
from lib.user import User
from lib.terminal_colors import TerminalColors
from lib.system_checks import ConfigurationCheckError
from tools.phase_stats import build_and_store_phase_stats
from runner import Runner
import optimization_providers.base

Expand Down Expand Up @@ -54,9 +55,6 @@ def _process(self, skip_system_checks=False, docker_prune=False, full_docker_pru
try:
# Start main code. Only URL is allowed for cron jobs
self._run_id = runner.run()
user.deduct_measurement_quota(self._machine_id, int(runner._last_measurement_duration/1_000_000)) # duration in runner is in microseconds. We need seconds

build_and_store_phase_stats(self._run_id, runner._sci)

# We need to import this here as we need the correct config file
print(TerminalColors.HEADER, '\nImporting optimization reporters ...', TerminalColors.ENDC)
Expand Down Expand Up @@ -85,3 +83,5 @@ def _process(self, skip_system_checks=False, docker_prune=False, full_docker_pru
message=f"Run-ID: {self._run_id}\nName: {self._name}\n\nDetails can also be found in the log under: {GlobalConfig().config['cluster']['metrics_url']}/stats.html?id={self._run_id}\n\nError message: {exc}\n"
)
raise exc
finally:
user.deduct_measurement_quota(self._machine_id, int(runner._last_measurement_duration/1_000_000)) # duration in runner is in microseconds. We need seconds
3 changes: 2 additions & 1 deletion lib/machine.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import os

Expand Down
16 changes: 14 additions & 2 deletions lib/phase_stats.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

import decimal
from io import StringIO
Expand Down Expand Up @@ -30,20 +31,31 @@ def build_and_store_phase_stats(run_id, sci=None):
"""
metrics = DB().fetch_all(query, (run_id, ))

if not metrics:
error_helpers.log_error('Metrics was empty and no phase_stats could be created. This can happen for failed runs, but should be very rare ...', run_id=run_id)
return


query = """
SELECT phases, measurement_config
FROM runs
WHERE id = %s
"""
phases, measurement_config = DB().fetch_one(query, (run_id, ))
data = DB().fetch_one(query, (run_id, ))

if not data or not data[0] or not data[1]:
error_helpers.log_error('Phases object was empty and no phase_stats could be created. This can happen for failed runs, but should be very rare ...', run_id=run_id)
return

phases, measurement_config = data # unpack

csv_buffer = StringIO()

machine_power_idle = None
machine_power_runtime = None
machine_energy_runtime = None


for idx, phase in enumerate(phases):
network_bytes_total = [] # reset; # we use array here and sum later, because checking for 0 alone not enough

Expand Down
3 changes: 2 additions & 1 deletion lib/timeline_project.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

from lib.db import DB

Expand Down
8 changes: 4 additions & 4 deletions lib/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@

'''

import sys
import faulthandler
faulthandler.enable() # will catch segfaults and write to stderr
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr

from lib.global_config import GlobalConfig
from lib.db import DB
from lib.terminal_colors import TerminalColors
from lib import error_helpers

from runner import Runner
from tools.phase_stats import build_and_store_phase_stats

class ValidationWorkloadStddevError(RuntimeError):
pass
Expand All @@ -44,6 +44,7 @@ def get_workload_stddev(repo_uri, filename, branch, machine_id, comparison_windo
AND branch = %s
AND machine_id = %s
AND end_measurement IS NOT NULL
AND failed != TRUE
ORDER BY created_at DESC
LIMIT %s
) SELECT
Expand Down Expand Up @@ -88,8 +89,7 @@ def run_workload(name, uri, filename, branch):
job_id=None,
)
# Start main code. Only URL is allowed for cron jobs
run_id = runner.run()
build_and_store_phase_stats(run_id, runner._sci)
runner.run()

def validate_workload_stddev(data, metrics):
warning = False
Expand Down
Loading