Skip to content

Commit

Permalink
feat: enhance diag tool log parser and fix python setup (#3303)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangziheng01233 authored and dl239 committed Jun 29, 2023
1 parent 8ffda7c commit cfc86ce
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 40 deletions.
2 changes: 1 addition & 1 deletion docs/zh/maintain/diagnose.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ killed | 被终止的任务
更多state信息详见[Spark State](https://spark.apache.org/docs/3.2.1/api/java/org/apache/spark/launcher/SparkAppHandle.State.html)和[Yarn State](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html)。


使用`inspect job --id <job_id>`查询指定任务的log日志,其结果会筛选出主要错误信息。如果需要详细的日志信息,可以添加`--detail`获取详细信息。
使用`inspect job --id <job_id>`查询指定任务的log日志,其结果会使用配置文件筛选出主要错误信息。如需更新配置文件,可以添加`--conf-update`,并且可以使用`--conf-url`配置镜像源,例如使用`--conf-url https://openmldb.ai/download/diag/common_err.yml`配置国内镜像。如果需要完整的日志信息,可以添加`--detail`获取详细信息。

### test 测试

Expand Down
2 changes: 1 addition & 1 deletion python/openmldb_sdk/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
]},
include_package_data=True,
package_data={'': ['*.so']},
packages=find_packages(),
packages=find_packages(exclude=['tests']),
entry_points={
'sqlalchemy.dialects': [
'openmldb = openmldb.sqlalchemy_openmldb.openmldb_dialect:OpenmldbDialect',
Expand Down
File renamed without changes.
15 changes: 15 additions & 0 deletions python/openmldb_tool/diagnostic_tool/common_err.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
errors:
other:
patterns:
- "at com.*openmldb"
- "At .*OpenMLDB"
- "Caused by"
- "java.*Exception"
- "Exception in"
- "ERROR"
description: "just print errs"
zk_conn_err:
patterns:
- "fail to init zk handler with hosts"
description: "Error: fail to init zk handler with hosts"
solution: "zk_conn_err"
22 changes: 18 additions & 4 deletions python/openmldb_tool/diagnostic_tool/diagnose.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License.

import argparse
import os
import textwrap
import time

Expand All @@ -28,7 +29,7 @@
from diagnostic_tool.collector import Collector
import diagnostic_tool.server_checker as checker
from diagnostic_tool.table_checker import TableChecker
from diagnostic_tool.parser import log_parser
from diagnostic_tool.parser import LogParser

from absl import app
from absl import flags
Expand Down Expand Up @@ -130,7 +131,7 @@ def inspect_offline(args):
print(f"inspect {total} offline jobs")
if num:
failed_jobs_str = "\n".join(jobs)
AssertionError(f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}")
raise AssertionError(f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}")
print("all offline final jobs are finished")


Expand Down Expand Up @@ -168,8 +169,10 @@ def inspect_job(args):
if args.detail:
print(detailed_log)
else:
err_messages = log_parser(detailed_log)
print(*err_messages, sep="\n")
parser = LogParser()
if args.conf_update or not os.path.exists(parser.conf_file):
parser.update_conf_file(args.conf_url)
parser.parse_log(detailed_log)


def test_sql(args):
Expand Down Expand Up @@ -264,6 +267,7 @@ def parse_arg(argv):
"offline", help="only inspect offline jobs."
)
offline.set_defaults(command=inspect_offline)
# inspect job
ins_job = inspect_sub.add_parser("job", help="show jobs by state, show joblog or parse joblog by id.")
ins_job.set_defaults(command=inspect_job)
ins_job.add_argument(
Expand All @@ -280,6 +284,16 @@ def parse_arg(argv):
action="store_true",
help="show detailed joblog information, use with `--id`"
)
ins_job.add_argument(
"--conf-url",
default="https://raw.githubusercontent.com/4paradigm/OpenMLDB/main/python/openmldb_tool/diagnostic_tool/common_err.yml",
help="url used to update the log parser configuration. If downloading is slow, you can try mirror source 'https://openmldb.ai/download/diag/common_err.yml'"
)
ins_job.add_argument(
"--conf-update",
action="store_true",
help="update the log parser configuration"
)

# sub test
test_parser = subparsers.add_parser(
Expand Down
100 changes: 75 additions & 25 deletions python/openmldb_tool/diagnostic_tool/parser.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,80 @@
import os
import re
import requests
import warnings
import yaml

from diagnostic_tool.connector import Connector
from diagnostic_tool.server_checker import StatusChecker

def log_parser(log):
log_lines = log.split("\n")
error_patterns = [
re.compile(r"at com.*openmldb"),
re.compile(r"At .*OpenMLDB"),
re.compile(r"Caused by"),
re.compile(r"java.*Exception"),
re.compile(r"Exception in"),
re.compile(r"ERROR"),
]

error_messages = []
skip_flag = 0

for line in log_lines:
for pattern in error_patterns:
match = pattern.search(line)
if match:
error_messages.append(line)
skip_flag = 1
break
CONF_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "common_err.yml")


class LogParser:
    """Scan a job log for known error patterns and collect suggested solutions.

    The patterns come from a YAML configuration file whose top-level
    ``errors`` mapping associates an error name with a list of regex
    ``patterns`` and, optionally, a ``solution`` entry. Entries that carry a
    ``solution`` are handed to ``ErrSolution`` when one of their patterns
    matches a log row.
    """

    def __init__(self, log_conf_file=CONF_FILE) -> None:
        """Remember the configuration path and load it if it already exists.

        Args:
            log_conf_file: path of the YAML configuration file.
        """
        self.conf_file = log_conf_file
        self._load_conf()

    def _load_conf(self):
        """Load the ``errors`` mapping of ``self.conf_file`` into ``self.errs``.

        A missing file yields an empty mapping instead of raising, so that a
        caller can construct the parser first and then decide to download the
        configuration with ``update_conf_file``.
        """
        if not os.path.exists(self.conf_file):
            self.errs = {}
            return
        # Use a context manager so the file handle is closed deterministically.
        with open(self.conf_file, encoding="utf-8") as conf:
            self.errs = yaml.safe_load(conf)["errors"]

    def parse_log(self, log: str):
        """Print the rows of ``log`` that match a known pattern.

        Matching rows are printed by ``_parse_row``. A single ``...`` line is
        printed where unmatched rows follow a matched one. If any matched
        entry produced a solution text, a ``Solutions`` section is printed
        at the end.

        Args:
            log: full log text; rows are separated by ``"\\n"``.
        """
        solution_results = []
        # True while the most recently handled rows matched a pattern.
        after_match = False
        for row in log.split("\n"):
            result = self._parse_row(row)
            if result:
                # "null" marks a match that has no solution attached.
                if result != "null":
                    solution_results.append(result)
                after_match = True
            elif after_match:
                # print "..." if some lines are skipped
                print("...")
                after_match = False
        if solution_results:
            print("Solutions".center(50, "="))
            print(*solution_results, sep="\n")

    def _parse_row(self, row):
        """Match one log row against every configured pattern.

        The row is printed on its first match.

        Returns:
            The solution text when the matched entry defines ``solution``,
            the string ``"null"`` when it does not, and ``None`` when no
            pattern matches.
        """
        for err in self.errs.values():
            for pattern in err["patterns"]:
                if re.search(pattern, row):
                    print(row)
                    if "solution" in err:
                        return ErrSolution(err)()
                    return "null"
        return None

    def update_conf_file(self, log_conf_url, timeout=30):
        """Download the configuration from ``log_conf_url`` and reload it.

        The local file is overwritten only on an HTTP 200 response. Any
        failure (network error, timeout, other status code) is reported as a
        warning and the existing local file, if any, is kept.

        Args:
            log_conf_url: URL of the YAML configuration file.
            timeout: seconds to wait for the server; without it a stalled
                connection would block the tool indefinitely.
        """
        try:
            response = requests.get(log_conf_url, timeout=timeout)
        except requests.RequestException as err:
            warnings.warn(f"log parser configuration update failed: {err}")
        else:
            if response.status_code == 200:
                with open(self.conf_file, "w", encoding="utf-8") as conf:
                    conf.write(response.text)
            else:
                warnings.warn("log parser configuration update failed")
        self._load_conf()


class ErrSolution:
    """Run the check named by an error entry and build a result text.

    ``err`` is one entry of the parser configuration; its ``solution`` value
    names the method of this class to run and its ``description`` is included
    in the result text.
    """

    def __init__(self, err) -> None:
        """Store the description and solution name of ``err``.

        Raises:
            KeyError: if ``err`` lacks ``description`` or ``solution``.
        """
        self.desc = err["description"]
        self.solution = err["solution"]
        self.result = ""

    def __call__(self, *args, **kwargs):
        """Run the configured solution and return the accumulated text.

        The solution name comes from a configuration file that can be
        refreshed from a URL, so it may name a check this version does not
        implement. Such names, and private or dunder names, are not
        dispatched; the description is returned with a short note instead.
        """
        name = str(self.solution)
        handler = None if name.startswith("_") else getattr(self, name, None)
        if not callable(handler):
            self.result += "\n" + self.desc
            self.result += (
                f"\nNo automatic check is available for solution `{name}` "
                "in this version of the tool."
            )
            return self.result
        handler()
        return self.result

    def zk_conn_err(self):
        """Check the zk connection through the cluster components listing.

        Raises:
            AssertionError: if ``_get_components`` returns a falsy value.
        """
        self.result += "\n" + self.desc
        self.result += "\nChecking zk connection..."
        conn = Connector()
        checker = StatusChecker(conn)
        # Raise explicitly: an ``assert`` statement is removed under ``-O``.
        if not checker._get_components(show=False):
            raise AssertionError("Failed to connect to zk")
        self.result += "\nSuccessfully checked zk connection. It may be caused by `Too many connections` in zk server. Please check zk server log."
4 changes: 2 additions & 2 deletions python/openmldb_tool/diagnostic_tool/table_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def check_distribution(self, dbs: list):
t = {}
t['name'] = table['db'] + "." + table['name']
parts = table['table_partition']
part_dist = self._collect(parts,'')
part_dist = self._collect(parts, '')
count_dist = self._collect(parts, 'record_cnt')
mem_dist = self._collect(parts, 'record_byte_size')
dused_dist = self._collect(parts, 'diskused')
Expand Down Expand Up @@ -125,4 +125,4 @@ def _add_merge(self, dist, dist2):

def _get_nameserver(self):
component_list = self.conn.execfetch("SHOW COMPONENTS")
return list(filter(lambda l: l[1]=="nameserver", component_list))[0][0]
return list(filter(lambda l: l[1] == "nameserver", component_list))[0][0]
5 changes: 4 additions & 1 deletion python/openmldb_tool/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
extras_require={'test': [
"pytest",
]},
packages=find_packages(),
packages=find_packages(exclude=['tests']),
exclude_package_data={
'openmldb-tool': ['diagnostic_tool/common_err.yml']
},
entry_points={
'console_scripts': ['openmldb_tool = diagnostic_tool.diagnose:run'],
},
Expand Down
15 changes: 10 additions & 5 deletions python/openmldb_tool/tests/log_parser_test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import os
import re
from absl import flags

import pytest
from diagnostic_tool.parser import log_parser
from diagnostic_tool.parser import LogParser
from .case_conf import OpenMLDB_ZK_CLUSTER

err_log_list = [os.path.join("off_err_logs", err_log) for err_log in os.listdir("off_err_logs")]
logs_path = os.path.join(os.path.dirname(__file__), "off_err_logs")
err_log_list = [os.path.join(logs_path, err_log) for err_log in os.listdir(logs_path)]


@pytest.mark.parametrize("err_log", err_log_list)
def test_pattern_logs(err_log):
flags.FLAGS['cluster'].parse(OpenMLDB_ZK_CLUSTER)
flags.FLAGS['sdk_log'].parse(False)
print("in", err_log)
with open(err_log, "r") as f:
log = f.read()
err_lines = log_parser(log)
print(*err_lines, sep="\n")
parser = LogParser()
parser.update_conf_file("https://openmldb.ai/download/diag/common_err.yml")
parser.parse_log(log)
3 changes: 2 additions & 1 deletion python/openmldb_tool/tests/off_err_logs/no_class.logfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ Caused by: java.lang.IllegalArgumentException: Compression codec com.hadoop.comp
Caused by: java.lang.ClassNotFoundException: Class com.hadoop.compression.lzo.LzoCodec not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2273)
at org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses(CompressionCodecFactory.java:132)
... 33 more
... 33 more
fail to init zk handler with hosts

0 comments on commit cfc86ce

Please sign in to comment.