diff --git a/docs/zh/maintain/diagnose.md b/docs/zh/maintain/diagnose.md index e4de2fb7fb5..9cd2f124599 100644 --- a/docs/zh/maintain/diagnose.md +++ b/docs/zh/maintain/diagnose.md @@ -94,7 +94,7 @@ killed | 被终止的任务 更多state信息详见[Spark State]( https://spark.apache.org/docs/3.2.1/api/java/org/apache/spark/launcher/SparkAppHandle.State.html),[Yarn State](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html) -使用`inspect job --id `查询指定任务的log日志,其结果会筛选出主要错误信息。如果需要详细的日志信息,可以添加`--detail`获取详细信息。 +使用`inspect job --id `查询指定任务的log日志,其结果会使用配置文件筛选出主要错误信息。如需更新配置文件,可以添加`--conf-update`,并且可以使用`--conf-url`配置镜像源,例如使用`--conf-url https://openmldb.ai/download/diag/common_err.yml`配置国内镜像。如果需要完整的日志信息,可以添加`--detail`获取详细信息。 ### test 测试 diff --git a/python/openmldb_sdk/setup.py b/python/openmldb_sdk/setup.py index d24682e4a5e..196841116ba 100644 --- a/python/openmldb_sdk/setup.py +++ b/python/openmldb_sdk/setup.py @@ -38,7 +38,7 @@ ]}, include_package_data=True, package_data={'': ['*.so']}, - packages=find_packages(), + packages=find_packages(exclude=['tests']), entry_points={ 'sqlalchemy.dialects': [ 'openmldb = openmldb.sqlalchemy_openmldb.openmldb_dialect:OpenmldbDialect', diff --git a/python/openmldb_sdk/openmldb/test/conftest.py b/python/openmldb_sdk/tests/conftest.py similarity index 100% rename from python/openmldb_sdk/openmldb/test/conftest.py rename to python/openmldb_sdk/tests/conftest.py diff --git a/python/openmldb_sdk/openmldb/test/sqlalchemy_standardtest_example.py b/python/openmldb_sdk/tests/sqlalchemy_standardtest_example.py similarity index 100% rename from python/openmldb_sdk/openmldb/test/sqlalchemy_standardtest_example.py rename to python/openmldb_sdk/tests/sqlalchemy_standardtest_example.py diff --git a/python/openmldb_tool/diagnostic_tool/common_err.yml b/python/openmldb_tool/diagnostic_tool/common_err.yml new file mode 100644 index 00000000000..2f45945fbcf --- /dev/null +++ 
b/python/openmldb_tool/diagnostic_tool/common_err.yml @@ -0,0 +1,15 @@ +errors: + other: + patterns: + - "at com.*openmldb" + - "At .*OpenMLDB" + - "Caused by" + - "java.*Exception" + - "Exception in" + - "ERROR" + description: "just print errs" + zk_conn_err: + patterns: + - "fail to init zk handler with hosts" + description: "Error: fail to init zk handler with hosts" + solution: "zk_conn_err" diff --git a/python/openmldb_tool/diagnostic_tool/diagnose.py b/python/openmldb_tool/diagnostic_tool/diagnose.py index 1b06ff162d4..6b2c742f03e 100644 --- a/python/openmldb_tool/diagnostic_tool/diagnose.py +++ b/python/openmldb_tool/diagnostic_tool/diagnose.py @@ -15,6 +15,7 @@ # limitations under the License. import argparse +import os import textwrap import time @@ -28,7 +29,7 @@ from diagnostic_tool.collector import Collector import diagnostic_tool.server_checker as checker from diagnostic_tool.table_checker import TableChecker -from diagnostic_tool.parser import log_parser +from diagnostic_tool.parser import LogParser from absl import app from absl import flags @@ -130,7 +131,7 @@ def inspect_offline(args): print(f"inspect {total} offline jobs") if num: failed_jobs_str = "\n".join(jobs) - AssertionError(f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}") + raise AssertionError(f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}") print("all offline final jobs are finished") @@ -168,8 +169,10 @@ def inspect_job(args): if args.detail: print(detailed_log) else: - err_messages = log_parser(detailed_log) - print(*err_messages, sep="\n") + parser = LogParser() + if args.conf_update or not os.path.exists(parser.conf_file): + parser.update_conf_file(args.conf_url) + parser.parse_log(detailed_log) def test_sql(args): @@ -264,6 +267,7 @@ def parse_arg(argv): "offline", help="only inspect offline jobs." 
# Default location of the error-pattern configuration, next to this module.
CONF_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "common_err.yml")


class LogParser:
    """Filter a job log down to its error rows and suggest solutions.

    The error definitions come from a YAML file whose top-level ``errors``
    mapping has the form ``{name: {"patterns": [regex, ...],
    "description": str, "solution": str (optional)}}``.
    """

    def __init__(self, log_conf_file=CONF_FILE) -> None:
        """Remember the configuration path and load it if it exists.

        Args:
            log_conf_file: path of the YAML configuration file.
        """
        self.conf_file = log_conf_file
        self._load_conf()

    def _load_conf(self):
        """(Re)load ``self.errs`` from ``self.conf_file``.

        A missing file yields an empty rule set instead of an exception, so
        that a parser can be constructed first and the configuration
        downloaded afterwards with ``update_conf_file``.
        """
        try:
            # Context manager: the handle is closed even if parsing fails.
            with open(self.conf_file, encoding="utf-8") as conf:
                loaded = yaml.safe_load(conf)
        except FileNotFoundError:
            self.errs = {}
            return
        self.errs = loaded["errors"]

    def _match_row(self, row):
        """Return the name of the first error whose pattern matches ``row``.

        Errors are tried in configuration order; ``None`` means no match.
        """
        for name, value in self.errs.items():
            for pattern in value['patterns']:
                if re.search(pattern, row):
                    return name
        return None

    def parse_log(self, log: str):
        """Print the rows of ``log`` that match a known error pattern.

        Each run of matching rows is followed by a ``...`` line once a
        non-matching row is met. Solutions are printed at the end under a
        ``Solutions`` banner; each error's solution is computed only once
        per log, however many rows match it.

        Args:
            log: the complete log text, rows separated by ``\\n``.
        """
        # error name -> solution text, in order of first appearance
        solutions = {}
        in_matched_run = False
        for row in log.split("\n"):
            name = self._match_row(row)
            if name is None:
                # print "..." if some lines are skipped
                if in_matched_run:
                    print("...")
                    in_matched_run = False
                continue
            print(row)
            in_matched_run = True
            err = self.errs[name]
            if "solution" in err and name not in solutions:
                solutions[name] = ErrSolution(err)()
        if solutions:
            print("Solutions".center(50, "="))
            print(*solutions.values(), sep="\n")

    def _parse_row(self, row):
        """Match a single row, print it on a hit and run its solution.

        Returns:
            ``None`` when no pattern matches, the string ``"null"`` when the
            matching error has no solution, otherwise the solution text.
        """
        name = self._match_row(row)
        if name is None:
            return None
        print(row)
        err = self.errs[name]
        if "solution" in err:
            return ErrSolution(err)()
        return "null"

    def update_conf_file(self, log_conf_url, timeout=10):
        """Download the configuration from ``log_conf_url`` and reload it.

        The local file is only overwritten when the download succeeds and
        the body parses to a mapping containing an ``errors`` mapping;
        otherwise a warning is emitted and the existing file is kept.

        Args:
            log_conf_url: URL of the YAML configuration.
            timeout: seconds passed to ``requests.get`` so that a stalled
                connection cannot block forever.
        """
        try:
            response = requests.get(log_conf_url, timeout=timeout)
            response.raise_for_status()
            downloaded = yaml.safe_load(response.text)
        except (requests.RequestException, yaml.YAMLError) as e:
            warnings.warn(f"log parser configuration update failed: {e}")
        else:
            if isinstance(downloaded, dict) and isinstance(downloaded.get("errors"), dict):
                with open(self.conf_file, "w", encoding="utf-8") as f:
                    f.write(response.text)
            else:
                warnings.warn("log parser configuration update failed: unexpected content")
        self._load_conf()


class ErrSolution:
    """Callable that produces the solution text for one configured error."""

    # Methods that may be selected by the ``solution`` field. The field is
    # read from a configuration file that can be downloaded, so it must not
    # be allowed to name an arbitrary attribute.
    _HANDLERS = ("zk_conn_err",)

    def __init__(self, err) -> None:
        """Store the description and the solution name of ``err``.

        Args:
            err: one entry of the ``errors`` mapping; must contain the keys
                ``description`` and ``solution``.
        """
        self.desc = err["description"]
        self.solution = err["solution"]
        self.result = ""

    def __call__(self, *args, **kwargs):
        """Run the configured solution and return its text.

        A solution name that this version does not implement falls back to
        the description plus a short note instead of raising.
        """
        # Start from scratch so that repeated calls do not accumulate text.
        self.result = ""
        if self.solution in self._HANDLERS:
            getattr(self, self.solution)()
        else:
            self.result += "\n" + self.desc
            self.result += f"\nNo automatic check is available for `{self.solution}` in this version."
        return self.result

    def zk_conn_err(self):
        """Check the zk connection and append the outcome to the result.

        Raises:
            AssertionError: if the component query returns a falsy value.
        """
        self.result += "\n" + self.desc
        self.result += "\nChecking zk connection..."
        conn = Connector()
        checker = StatusChecker(conn)
        # Explicit raise: an `assert` statement is removed under `python -O`.
        if not checker._get_components(show=False):
            raise AssertionError("Failed to connect to zk")
        self.result += "\nSuccessfully checked zk connection. It may be caused by `Too many connections` in zk server. Please check zk server log."
diff --git a/python/openmldb_tool/diagnostic_tool/table_checker.py b/python/openmldb_tool/diagnostic_tool/table_checker.py index e8901f541be..969e7d110e4 100644 --- a/python/openmldb_tool/diagnostic_tool/table_checker.py +++ b/python/openmldb_tool/diagnostic_tool/table_checker.py @@ -50,7 +50,7 @@ def check_distribution(self, dbs: list): t = {} t['name'] = table['db'] + "." + table['name'] parts = table['table_partition'] - part_dist = self._collect(parts,'') + part_dist = self._collect(parts, '') count_dist = self._collect(parts, 'record_cnt') mem_dist = self._collect(parts, 'record_byte_size') dused_dist = self._collect(parts, 'diskused') @@ -125,4 +125,4 @@ def _add_merge(self, dist, dist2): def _get_nameserver(self): component_list = self.conn.execfetch("SHOW COMPONENTS") - return list(filter(lambda l: l[1]=="nameserver", component_list))[0][0] + return list(filter(lambda l: l[1] == "nameserver", component_list))[0][0] diff --git a/python/openmldb_tool/setup.py b/python/openmldb_tool/setup.py index b111b647b25..fafaeae3a88 100644 --- a/python/openmldb_tool/setup.py +++ b/python/openmldb_tool/setup.py @@ -38,7 +38,10 @@ extras_require={'test': [ "pytest", ]}, - packages=find_packages(), + packages=find_packages(exclude=['tests']), + exclude_package_data={ + 'openmldb-tool': ['diagnostic_tool/common_err.yml'] + }, entry_points={ 'console_scripts': ['openmldb_tool = diagnostic_tool.diagnose:run'], }, diff --git a/python/openmldb_tool/tests/log_parser_test.py b/python/openmldb_tool/tests/log_parser_test.py index 99936003c04..b9dd71b5df6 100644 --- a/python/openmldb_tool/tests/log_parser_test.py +++ b/python/openmldb_tool/tests/log_parser_test.py @@ -1,16 +1,21 @@ import os -import re +from absl import flags import pytest -from diagnostic_tool.parser import log_parser +from diagnostic_tool.parser import LogParser +from .case_conf import OpenMLDB_ZK_CLUSTER -err_log_list = [os.path.join("off_err_logs", err_log) for err_log in os.listdir("off_err_logs")] +logs_path = 
os.path.join(os.path.dirname(__file__), "off_err_logs") +err_log_list = [os.path.join(logs_path, err_log) for err_log in os.listdir(logs_path)] @pytest.mark.parametrize("err_log", err_log_list) def test_pattern_logs(err_log): + flags.FLAGS['cluster'].parse(OpenMLDB_ZK_CLUSTER) + flags.FLAGS['sdk_log'].parse(False) print("in", err_log) with open(err_log, "r") as f: log = f.read() - err_lines = log_parser(log) - print(*err_lines, sep="\n") + parser = LogParser() + parser.update_conf_file("https://openmldb.ai/download/diag/common_err.yml") + parser.parse_log(log) diff --git a/python/openmldb_tool/tests/off_err_logs/no_class.logfile b/python/openmldb_tool/tests/off_err_logs/no_class.logfile index ca3cffb3661..8e62ebe15cd 100644 --- a/python/openmldb_tool/tests/off_err_logs/no_class.logfile +++ b/python/openmldb_tool/tests/off_err_logs/no_class.logfile @@ -41,4 +41,5 @@ Caused by: java.lang.IllegalArgumentException: Compression codec com.hadoop.comp Caused by: java.lang.ClassNotFoundException: Class com.hadoop.compression.lzo.LzoCodec not found at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2273) at org.apache.hadoop.io.compress.CompressionCodecFactory.getCodecClasses(CompressionCodecFactory.java:132) - ... 33 more \ No newline at end of file + ... 33 more +fail to init zk handler with hosts \ No newline at end of file