Projects
Eulaceura:Factory
sysSentry
_service:obs_scm:feature-add-avg_block_io-plugi...
Sign Up
Log In
Username
Password
Sorry, you are not authorized to perform this action.
×
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:obs_scm:feature-add-avg_block_io-plugin.patch of Package sysSentry
From acb77d6a69aa9269b0f691613bef53efd0c01e53 Mon Sep 17 00:00:00 2001 From: gaoruoshu <gaoruoshu@huawei.com> Date: Thu, 12 Sep 2024 11:31:34 +0800 Subject: [PATCH 2/2] add avg_block_io plugin --- config/plugins/avg_block_io.ini | 21 ++ config/tasks/avg_block_io.mod | 5 + src/python/sentryPlugins/__init__.py | 0 .../sentryPlugins/avg_block_io/__init__.py | 0 .../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++ .../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++ .../avg_block_io/stage_window.py | 47 ++++ .../sentryPlugins/avg_block_io/utils.py | 86 ++++++ 8 files changed, 502 insertions(+) create mode 100644 config/plugins/avg_block_io.ini create mode 100644 config/tasks/avg_block_io.mod create mode 100644 src/python/sentryPlugins/__init__.py create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini new file mode 100644 index 0000000..bc33dde --- /dev/null +++ b/config/plugins/avg_block_io.ini @@ -0,0 +1,21 @@ +[common] +disk=default +stage=default +iotype=read,write +period_time=1 + +[algorithm] +win_size=30 +win_threshold=6 + +[latency] +read_avg_lim=10 +write_avg_lim=10 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50 +write_tot_lim=50 + +[iodump] +read_iodump_lim=0 +write_iodump_lim=0 diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod new file mode 100644 index 0000000..814c483 --- /dev/null +++ b/config/tasks/avg_block_io.mod @@ -0,0 +1,5 @@ +[common] +enabled=yes +task_start=/usr/bin/python3 /usr/bin/avg_block_io +task_stop=pkill avg_block_io +type=oneshot \ No newline at end of file diff --git a/src/python/sentryPlugins/__init__.py 
b/src/python/sentryPlugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py new file mode 100644 index 0000000..ff2071d --- /dev/null +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -0,0 +1,257 @@ +# coding: utf-8 +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# sysSentry is licensed under the Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +# PURPOSE. +# See the Mulan PSL v2 for more details. 
import logging
import signal
import configparser
import time

from .stage_window import IoWindow, IoDumpWindow
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
from .utils import update_avg_and_check_abnormal

CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"

# I/O types accepted in the common.iotype option.
_VALID_IOTYPES = ('read', 'write', 'flush', 'discard')
# Maximum number of explicitly configured disks that are kept.
_MAX_DISKS = 10


def _split_names(raw):
    """Split a comma-separated option value into stripped, non-empty names."""
    return [name.strip() for name in raw.split(",") if name.strip()]


def _read_name_list(section, option):
    """Return the names listed in *option*; an absent option or 'default' yields []."""
    raw = section.get(option)
    if raw is None or raw.strip() == 'default':
        return []
    return _split_names(raw)


def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
    """Log rejected configured names, or that the default list is in use.

    Args:
        not_in_list: configured names that were not accepted.
        keys_name: option name used in the message ('disk' or 'stage').
        config_list: names taken from the config file ([] means default).
        default_list: names that will actually be used.
    """
    if not config_list or config_list == ["default"]:
        # read_config_common maps 'default' to [], so the empty list is the
        # default case; the literal ["default"] is kept for compatibility.
        logging.warning("Default {} use {}".format(keys_name, default_list))
    elif not_in_list and default_list:
        # Only warn when something was actually rejected.
        logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))


def read_config_common(config):
    """Read the [common] section.

    Reports failure through report_alarm_fail when the section is missing.

    Returns:
        (period_time, disk, stage, iotype_list); disk and stage are [] when
        the option is absent or set to 'default'.
    """
    # Mapping access on ConfigParser raises KeyError (not NoSectionError),
    # so test for the section explicitly.
    if not config.has_section("common"):
        report_alarm_fail("Cannot find common section in config file")
    common_sec = config["common"]

    try:
        period_time = int(common_sec.get("period_time", 1))
        if not 1 <= period_time <= 300:
            raise ValueError("Invalid period_time")
    except ValueError:
        period_time = 1
        logging.warning("Invalid period_time, set to 1s")

    disk = _read_name_list(common_sec, 'disk')
    stage = _read_name_list(common_sec, 'stage')

    if len(disk) > _MAX_DISKS:
        logging.warning("Too many disks, record only max 10 disks")
        disk = disk[:_MAX_DISKS]

    iotype = _split_names(common_sec.get('iotype', 'read,write'))
    iotype_list = [rw.lower() for rw in iotype if rw.lower() in _VALID_IOTYPES]
    err_iotype = [rw for rw in iotype if rw.lower() not in _VALID_IOTYPES]

    if err_iotype:
        logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))

    return period_time, disk, stage, iotype_list


def read_config_algorithm(config):
    """Read win_size and win_threshold from the [algorithm] section.

    Both must be integers in 1..300 and win_threshold must not exceed
    win_size; any violation (including a missing option) is reported through
    report_alarm_fail.

    Returns:
        (win_size, win_threshold)
    """
    if not config.has_section("algorithm"):
        report_alarm_fail("Cannot find algorithm section in config file")

    try:
        win_size = int(config.get("algorithm", "win_size"))
        if not 1 <= win_size <= 300:
            raise ValueError("Invalid win_size")
        win_threshold = int(config.get("algorithm", "win_threshold"))
        if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
            raise ValueError("Invalid win_threshold")
    except (ValueError, configparser.Error):
        # configparser.Error covers NoOptionError for a missing option.
        report_alarm_fail("Invalid win_threshold or win_size")

    return win_size, win_threshold


def read_config_lat_iodump(io_dic, config):
    """Read per-iotype limits from the [latency] and [iodump] sections.

    Only options whose value is a decimal string are kept; a missing section
    contributes nothing.

    Returns:
        dict mapping each io type in io_dic["iotype_list"] to a dict of
        {option_name: int}.
    """
    latency_sec = config["latency"] if config.has_section("latency") else {}
    iodump_sec = config["iodump"] if config.has_section("iodump") else {}

    common_param = {}
    for io_type in io_dic["iotype_list"]:
        params = {}
        for suffix in ("avg_lim", "avg_time", "tot_lim"):
            option = "{}_{}".format(io_type, suffix)
            value = latency_sec.get(option)
            if value is not None and value.isdecimal():
                params[option] = int(value)

        iodump_key = "{}_iodump_lim".format(io_type)
        value = iodump_sec.get(iodump_key)
        if value is not None and value.isdecimal():
            params[iodump_key] = int(value)

        common_param[io_type] = params

    return common_param


def read_config_stage(config, stage, iotype_list):
    """Read the optional per-stage section named *stage*.

    Returns:
        dict of {option_name: int} for every option with a decimal value, or
        {} when the section does not exist. iotype_list is accepted for
        interface compatibility and is not used.
    """
    if stage not in config:
        return {}
    return {key: int(value) for key, value in config[stage].items() if value.isdecimal()}


def init_io_win(io_dic, config, common_param):
    """Create latency/iodump windows and the running-average table.

    Returns:
        (io_data, io_avg_value), both nested as [disk][stage][iotype].
        io_data leaves hold the optional "latency" and "iodump" windows;
        io_avg_value leaves start as [0, 0].
    """
    iotype_list = io_dic["iotype_list"]
    win_size = io_dic["win_size"]
    win_threshold = io_dic["win_threshold"]

    # Per-stage overrides do not depend on the disk, so read them once.
    stage_params = {
        stage_name: read_config_stage(config, stage_name, iotype_list)
        for stage_name in io_dic["stage_list"]
    }

    io_data = {}
    io_avg_value = {}
    for disk_name in io_dic["disk_list"]:
        io_data[disk_name] = {}
        io_avg_value[disk_name] = {}
        for stage_name in io_dic["stage_list"]:
            io_data[disk_name][stage_name] = {}
            io_avg_value[disk_name][stage_name] = {}
            curr_stage_param = stage_params[stage_name]
            for rw in iotype_list:
                wins = {}
                io_data[disk_name][stage_name][rw] = wins
                io_avg_value[disk_name][stage_name][rw] = [0, 0]

                rw_common = common_param.get(rw, {})

                def pick(option):
                    # The stage section takes precedence over the common value.
                    return curr_stage_param.get(option, rw_common.get(option))

                avg_lim_value = pick("{}_avg_lim".format(rw))
                avg_time_value = pick("{}_avg_time".format(rw))
                tot_lim_value = pick("{}_tot_lim".format(rw))
                iodump_lim_value = pick("{}_iodump_lim".format(rw))

                # A latency window needs all three values; a missing or zero
                # value means no latency window for this key.
                if avg_lim_value and avg_time_value and tot_lim_value:
                    wins["latency"] = IoWindow(window_size=win_size, window_threshold=win_threshold, abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)

                # Zero is a valid iodump limit, so only None disables it.
                if iodump_lim_value is not None:
                    wins["iodump"] = IoDumpWindow(window_size=win_size, window_threshold=win_threshold, abnormal_time=iodump_lim_value)
    return io_data, io_avg_value


def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
    """Resolve the disks and stages to monitor via avg_is_iocollect_valid.

    Configured names that are not returned by the collector are dropped. If
    none of the configured names is valid, the query is repeated with an
    empty list (the default). Failure to obtain any disk or stage is reported
    through report_alarm_fail.

    Returns:
        (disk_list, stage_list)
    """
    json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage)

    all_disk_set = json_data.keys()
    all_stage_set = set()
    for disk_stage_list in json_data.values():
        all_stage_set.update(disk_stage_list)

    disk_list = [key for key in config_disk if key in all_disk_set]
    not_in_disk_list = [key for key in config_disk if key not in all_disk_set]

    stage_list = [key for key in config_stage if key in all_stage_set]
    not_in_stage_list = [key for key in config_stage if key not in all_stage_set]

    if not config_disk:
        disk_list = list(all_disk_set)

    if not config_stage:
        stage_list = list(all_stage_set)

    if config_disk and not disk_list:
        logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
        disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)

    if config_stage and not stage_list:
        logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
        disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])

    if not stage_list or not disk_list:
        report_alarm_fail("Cannot get valid disk name or stage name.")

    log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
    log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)

    return disk_list, stage_list


def main_loop(io_dic, io_data, io_avg_value):
    """Run the endless collect / update / report cycle of avg_block_io."""
    period_time = io_dic["period_time"]
    disk_list = io_dic["disk_list"]
    stage_list = io_dic["stage_list"]
    iotype_list = io_dic["iotype_list"]
    win_size = io_dic["win_size"]

    while True:
        # Wait one period.
        time.sleep(period_time)

        # Fetch this period's data from the collector.
        curr_period_data = avg_get_io_data(io_dic)

        # Feed every (disk, stage, iotype) key that has data this period.
        # reach_size keeps the result of the last key that was updated.
        reach_size = False
        for disk_name in disk_list:
            if disk_name not in curr_period_data:
                continue
            disk_data = curr_period_data[disk_name]
            for stage_name in stage_list:
                if stage_name not in disk_data:
                    continue
                for rw in iotype_list:
                    if rw not in disk_data[stage_name]:
                        continue
                    io_key = (disk_name, stage_name, rw)
                    reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data)

        # No alarm evaluation until the averaging window has filled.
        if not reach_size:
            continue

        # Evaluate abnormal windows and report.
        for disk_name in disk_list:
            for rw in iotype_list:
                process_report_data(disk_name, rw, io_data)


def main():
    """Entry point: read the configuration, build the windows, run the loop."""
    # Register the stop handler for SIGINT (2) and SIGTERM (15).
    signal.signal(signal.SIGINT, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)

    # Load the configuration file.
    config = configparser.ConfigParser(comment_prefixes=('#', ';'))
    try:
        loaded = config.read(CONFIG_FILE)
    except configparser.Error:
        loaded = []
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it parsed, so an empty result means nothing was loaded.
    if not loaded:
        report_alarm_fail("Failed to read config file")

    io_dic = {}

    # [common] section.
    io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config)

    # Ask the collector which of the configured disks/stages are available.
    io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)

    if "bio" not in io_dic["stage_list"]:
        report_alarm_fail("Cannot run avg_block_io without bio stage")

    # Window initialisation, matching what the collector reported.
    # Step 1: shared [algorithm] settings.
    io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)

    # Step 2: create the windows (step 3, per-stage overrides, happens inside
    # init_io_win).
    common_param = read_config_lat_iodump(io_dic, config)
    io_data, io_avg_value = init_io_win(io_dic, config, common_param)

    main_loop(io_dic, io_data, io_avg_value)


# diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
# new file mode 100644
# index 0000000..caa0191
# --- /dev/null
# +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
# @@ -0,0 +1,86 @@
# coding: utf-8
# Copyright (c) 2024 Huawei Technologies Co., Ltd.
# sysSentry is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
import json
import logging
import sys
import time

from .utils import is_abnormal
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
from syssentry.result import ResultLevel, report_result


TASK_NAME = "avg_block_io"


def sig_handler(signum, _f):
    """Signal handler: report a PASS result with an empty payload, log, exit 0."""
    empty_payload = json.dumps({})
    report_result(TASK_NAME, ResultLevel.PASS, empty_payload)
    logging.info("Finished avg_block_io plugin running.")
    sys.exit(0)


def avg_get_io_data(io_dic):
    """Call get_io_data with the values held in io_dic and return the decoded message."""
    raw = get_io_data(
        io_dic["period_time"],
        io_dic["disk_list"],
        io_dic["stage_list"],
        io_dic["iotype_list"],
    )
    return check_result_validation(raw, 'get io data')


def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
    """Call is_iocollect_valid for the configured disks/stages and return the decoded message."""
    raw = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
    return check_result_validation(raw, 'check config validation')


def check_result_validation(res, reason):
    """Validate a collector reply and return its JSON-decoded 'message'.

    A reply lacking 'ret' or 'message', a non-zero 'ret', or a 'message' that
    is not valid JSON is reported through report_alarm_fail; *reason* is
    inserted into the error text.
    """
    if 'ret' not in res or 'message' not in res:
        report_alarm_fail("Failed to {}: Cannot connect to sentryCollector.".format(reason))

    ret_code = res['ret']
    if ret_code != 0:
        report_alarm_fail("Failed to {}: {}".format(reason, Result_Messages[ret_code]))

    try:
        json_data = json.loads(res['message'])
    except json.JSONDecodeError:
        report_alarm_fail("Failed to {}: invalid return message".format(reason))

    return json_data


def report_alarm_fail(alarm_info):
    """Report a FAIL result carrying alarm_info, log it as an error, and exit with status 1."""
    payload = json.dumps({"msg": alarm_info})
    report_result(TASK_NAME, ResultLevel.FAIL, payload)
    logging.error(alarm_info)
    sys.exit(1)


def process_report_data(disk_name, rw, io_data):
    """Classify an abnormal 'bio' window for (disk_name, rw) and log the verdict.

    Nothing is logged unless the 'bio' stage is abnormal. Otherwise the first
    matching group decides the message: control stages -> "IO press",
    'rq_driver' -> "driver", kernel stages -> "kernel", no match -> "IO press".
    """
    def stage_hit(stage_name):
        return is_abnormal((disk_name, stage_name, rw), io_data)

    if not stage_hit('bio'):
        return

    if any(stage_hit(name) for name in ('throtl', 'wbt', 'iocost', 'bfq')):
        logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
    elif stage_hit('rq_driver'):
        logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw))
    elif any(stage_hit(name) for name in ('gettag', 'plug', 'deadline', 'hctx', 'requeue')):
        logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw))
    else:
        logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))


# diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
# new file mode 100644
# index 0000000..9b0ce79
# --- /dev/null
# +++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
# @@ -0,0 +1,47 @@
# coding: utf-8
# Copyright (c) 2024 Huawei Technologies Co., Ltd.
# sysSentry is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.


class AbnormalWindowBase:
    """Fixed-length sliding window of per-period abnormal flags.

    The window starts filled with False. Subclasses supply
    is_abnormal_period(), which decides whether one period is abnormal.
    """

    def __init__(self, window_size=10, window_threshold=7):
        self.window_size = window_size
        self.window_threshold = window_threshold
        self.abnormal_window = [False] * window_size

    def is_abnormal_period(self, value, avg_val=0):
        """Return True when a single period value is abnormal (subclass hook)."""
        raise NotImplementedError("subclasses must implement is_abnormal_period")

    def append_new_period(self, ab_res, avg_val=0):
        """Drop the oldest flag and append the flag for the new period value."""
        self.abnormal_window.pop(0)
        self.abnormal_window.append(bool(self.is_abnormal_period(ab_res, avg_val)))

    def is_abnormal_window(self):
        """Return True when the abnormal-period count reaches window_threshold.

        The comparison is inclusive so that window_threshold == window_size
        (which the configuration check accepts) can still trigger.
        """
        return sum(self.abnormal_window) >= self.window_threshold


class IoWindow(AbnormalWindowBase):
    """Latency window.

    A period is abnormal when the value exceeds both avg_val *
    abnormal_multiple and abnormal_multiple_lim, or when it exceeds
    abnormal_time on its own.
    """

    def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
        super().__init__(window_size, window_threshold)
        self.abnormal_multiple = abnormal_multiple
        self.abnormal_multiple_lim = abnormal_multiple_lim
        self.abnormal_time = abnormal_time

    def is_abnormal_period(self, value, avg_val):
        """Return True when *value* is abnormal relative to *avg_val* or absolutely."""
        over_average = value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim
        return over_average or value > self.abnormal_time


class IoDumpWindow(AbnormalWindowBase):
    """Iodump window: a period is abnormal when the value exceeds abnormal_time."""

    def __init__(self, window_size=10, window_threshold=7, abnormal_time=40):
        super().__init__(window_size, window_threshold)
        self.abnormal_time = abnormal_time

    def is_abnormal_period(self, value, avg_val=0):
        """Return True when *value* exceeds abnormal_time; avg_val is ignored."""
        return value > self.abnormal_time


# diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
# new file mode 100644
# index 0000000..54ed080
# --- /dev/null
# +++ b/src/python/sentryPlugins/avg_block_io/utils.py
# @@ -0,0 +1,86 @@
# coding: utf-8
# Copyright (c) 2024 Huawei Technologies Co., Ltd.
# sysSentry is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.

# Index of the running average inside an [avg, count] pair.
AVG_VALUE = 0
# Index of the sample count inside an [avg, count] pair.
AVG_COUNT = 1


def get_nested_value(data, keys):
    """Follow *keys* through nested containers; return None if a key is missing."""
    for key in keys:
        if key not in data:
            return None
        data = data[key]
    return data


def set_nested_value(data, keys, value):
    """Store *value* at the nested path *keys*.

    Returns:
        True on success, False when an intermediate key does not exist
        (nothing is created in that case).
    """
    for key in keys[:-1]:
        if key not in data:
            return False
        data = data[key]
    data[keys[-1]] = value
    return True


def is_abnormal(io_key, io_data):
    """Return True when the latency or iodump window stored at *io_key* is abnormal."""
    all_wins = get_nested_value(io_data, io_key)
    if not all_wins:
        return False
    for win_name in ('latency', 'iodump'):
        if win_name in all_wins:
            win = all_wins[win_name]
            if win and win.is_abnormal_window():
                return True
    return False


def update_io_avg(old_avg, period_value, win_size):
    """Return the new [avg, count] pair after adding period_value[0].

    While count < win_size the plain mean grows by one sample; afterwards the
    count stays fixed and the new sample replaces one average-sized share.
    """
    count = old_avg[AVG_COUNT]
    if count < win_size:
        new_count = count + 1
        carried = old_avg[AVG_VALUE] * count
    else:
        new_count = count
        carried = old_avg[AVG_VALUE] * (count - 1)
    return [(carried + period_value[0]) / new_count, new_count]


def update_io_data(old_avg, period_value, win_size, io_data, io_key):
    """Append this period to the latency and iodump windows stored at *io_key*.

    period_value[0] goes to the latency window together with the current
    average; period_value[1] goes to the iodump window. win_size is accepted
    for interface compatibility and is not used.
    """
    all_wins = get_nested_value(io_data, io_key)
    if not all_wins:
        return
    if "latency" in all_wins:
        all_wins["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
    if "iodump" in all_wins:
        all_wins["iodump"].append_new_period(period_value[1])


def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
    """Update the average and windows for *io_key*; return True once win_size samples exist.

    While fewer than win_size samples have been averaged, only the average is
    updated and False is returned. Afterwards the windows are fed, and the
    average is updated only when a latency window exists and does not judge
    this period abnormal (or when no windows are configured at all).
    Returns False when *io_key* is absent from *data* or *io_avg_value*.
    """
    period_value = get_nested_value(data, io_key)
    old_avg = get_nested_value(io_avg_value, io_key)
    if period_value is None or old_avg is None:
        # Nothing to update for a key that one of the tables does not hold.
        return False

    # Warm-up: keep building the average until win_size samples were seen.
    if old_avg[AVG_COUNT] < win_size:
        set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
        return False

    # Feed the windows, which judge whether this period is abnormal.
    update_io_data(old_avg, period_value, win_size, io_data, io_key)
    all_wins = get_nested_value(io_data, io_key)
    if all_wins and 'latency' not in all_wins:
        return True
    latency_win = get_nested_value(io_data, io_key + ("latency",))
    if latency_win and latency_win.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
        # Abnormal periods are kept out of the average.
        return True
    set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
    return True

# --
# 2.33.0
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2