From 9479e4325ba2a3940e51dbe3755fa686d9da85e3 Mon Sep 17 00:00:00 2001 From: "M. Rehan" Date: Tue, 21 Jan 2025 13:57:16 +0500 Subject: [PATCH] Refactor CPU reporting --- .../usr/lib/netdata/python.d/cputemp.chart.py | 8 +- .../python.d/truenas_cpu_usage.chart.py | 21 ++-- .../etc_files/netdata/netdata.conf.mako | 12 +-- .../middlewared/plugins/reporting/events.py | 3 - .../plugins/reporting/netdata/graphs.py | 11 ++- .../reporting/realtime_reporting/cpu.py | 30 +++--- .../middlewared/plugins/reporting/utils.py | 3 +- .../reporting/test_netdata_approximation.py | 18 ++-- .../reporting/test_netdata_stats_func.py | 97 ++++++++++--------- .../pytest/unit/utils/test_cpu_usage_util.py | 11 ++- .../middlewared/utils/metrics/cpu_usage.py | 27 ++++-- 11 files changed, 128 insertions(+), 113 deletions(-) diff --git a/src/freenas/usr/lib/netdata/python.d/cputemp.chart.py b/src/freenas/usr/lib/netdata/python.d/cputemp.chart.py index e6f8edc42f923..dcf80b95b38ac 100755 --- a/src/freenas/usr/lib/netdata/python.d/cputemp.chart.py +++ b/src/freenas/usr/lib/netdata/python.d/cputemp.chart.py @@ -76,10 +76,14 @@ def get_data(self): cpu_temps = {} data = {} + total_temp = 0 for core, temp in cpu_temps.items(): - data[str(core)] = temp + data[f'cpu{core}'] = temp + total_temp += temp - return data or {str(i): 0 for i in range(cpu_info()['core_count'])} + if total_temp: + data['cpu'] = total_temp / len(data.keys()) + return data or ({f'cpu{i}': 0 for i in range(cpu_info()['core_count'])} | {'cpu': 0}) def check(self): try: diff --git a/src/freenas/usr/lib/netdata/python.d/truenas_cpu_usage.chart.py b/src/freenas/usr/lib/netdata/python.d/truenas_cpu_usage.chart.py index b4ccdd14aea2e..f4cf69900fd9f 100644 --- a/src/freenas/usr/lib/netdata/python.d/truenas_cpu_usage.chart.py +++ b/src/freenas/usr/lib/netdata/python.d/truenas_cpu_usage.chart.py @@ -6,21 +6,24 @@ class Service(SimpleService): def __init__(self, configuration=None, name=None): SimpleService.__init__(self, configuration=configuration, name=name) + self.old_stats = {} def check(self): self.add_cpu_usage_to_charts() return True def get_data(self): - return get_cpu_usage() + data, self.old_stats = get_cpu_usage(self.old_stats) + return data def add_cpu_usage_to_charts(self): - for cpu_name in get_cpu_usage().keys(): - self.charts.add_chart([ - cpu_name, cpu_name, cpu_name, 'CPU USAGE%', - 'cpu.usage', - 'Cpu usage', - 'line', - ]) + data, self.old_stats = get_cpu_usage() + self.charts.add_chart([ + 'cpu', 'cpu', 'cpu', 'CPU USAGE%', + 'cpu.usage', + 'Cpu usage', + 'line', + ]) - self.charts[cpu_name].add_dimension([cpu_name, 'usage', 'absolute']) + for cpu_name in filter(lambda s: s.startswith('cpu'), data.keys()): + self.charts['cpu'].add_dimension([f'{cpu_name}', f'{cpu_name}', 'absolute']) diff --git a/src/middlewared/middlewared/etc_files/netdata/netdata.conf.mako b/src/middlewared/middlewared/etc_files/netdata/netdata.conf.mako index a484cb0a539c4..dedd9d05f772e 100644 --- a/src/middlewared/middlewared/etc_files/netdata/netdata.conf.mako +++ b/src/middlewared/middlewared/etc_files/netdata/netdata.conf.mako @@ -64,9 +64,7 @@ /proc/meminfo = no /proc/net/dev = yes /proc/pagetypeinfo = no - # /proc/stat = yes - we keep this uncommented as by default that enables it, for some reason - # if we explicitly set it to yes it still does not has the desired affect and we are not able - # to retrieve system.cpu stats + /proc/stat = no /proc/uptime = yes /proc/loadavg = yes /proc/sys/kernel/random/entropy_avail = no @@ -120,11 +118,3 @@ [plugin:cgroups] enable by default cgroups names matching = !*udev* * - -[plugin:proc:/proc/stat] - per cpu core utilization = no - context switches = no - cpu interrupts = no - processes started = no - processes running = no - cpu idle states = no diff --git a/src/middlewared/middlewared/plugins/reporting/events.py b/src/middlewared/middlewared/plugins/reporting/events.py index 63a6b3b2587a6..42fea2211841e 100644 --- a/src/middlewared/middlewared/plugins/reporting/events.py +++ b/src/middlewared/middlewared/plugins/reporting/events.py @@ -104,9 +104,6 @@ def run_sync(self): 'failed_to_connect': False, } - # CPU temperature - data['cpu']['temperature_celsius'] = self.middleware.call_sync('reporting.cpu_temperatures') or None - self.send_event('ADDED', fields=data) time.sleep(interval) diff --git a/src/middlewared/middlewared/plugins/reporting/netdata/graphs.py b/src/middlewared/middlewared/plugins/reporting/netdata/graphs.py index 651b698ab3055..4c2665f7511bf 100644 --- a/src/middlewared/middlewared/plugins/reporting/netdata/graphs.py +++ b/src/middlewared/middlewared/plugins/reporting/netdata/graphs.py @@ -1,5 +1,6 @@ import typing +from middlewared.utils.cpu import cpu_info from middlewared.utils.disk_temperatures import get_disks_for_temperature_reading from .graph_base import GraphBase @@ -13,12 +14,12 @@ class CPUPlugin(GraphBase): vertical_label = '%CPU' def get_chart_name(self, identifier: typing.Optional[str] = None) -> str: - return 'system.cpu' + return 'truenas_cpu_usage.cpu' - def query_parameters(self) -> dict: - return super().query_parameters() | { - 'dimensions': 'system|user|idle|softirq|nice|iowait', - } + async def get_identifiers(self) -> typing.Optional[list]: + cpu_usage = [f'cpu{i}' for i in cpu_info()['core']] + cpu_usage.append('cpu') + return cpu_usage class CPUTempPlugin(GraphBase): diff --git a/src/middlewared/middlewared/plugins/reporting/realtime_reporting/cpu.py b/src/middlewared/middlewared/plugins/reporting/realtime_reporting/cpu.py index c82d98a8ff7b6..bcaafcaddc7a0 100644 --- a/src/middlewared/middlewared/plugins/reporting/realtime_reporting/cpu.py +++ b/src/middlewared/middlewared/plugins/reporting/realtime_reporting/cpu.py @@ -2,18 +2,24 @@ from .utils import safely_retrieve_dimension -def calculate_usage(cpu_stats: dict) -> float: - cp_total = sum(cpu_stats.values()) - return ((cp_total - cpu_stats['idle'] - cpu_stats['iowait']) / cp_total) * 100 if cp_total else 0 - - def get_cpu_stats(netdata_metrics: dict) -> dict: - metric_name = 'system.cpu' - fields = ['user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq', 'steal', 'guest', 'guest_nice'] - data = {field: safely_retrieve_dimension(netdata_metrics, metric_name, field, 0) for field in fields} - data['aggregated_usage'] = safely_retrieve_dimension(netdata_metrics, 'truenas_cpu_usage.cpu', 'cpu', 0) + data = { + 'cpu': { + 'usage': safely_retrieve_dimension( + netdata_metrics, f'truenas_cpu_usage.cpu', 'cpu', 0 + ), + 'temp': safely_retrieve_dimension( + netdata_metrics, 'cputemp.temperatures', 'cpu', + ) or None + }} for core_index in range(cpu_info()['core_count']): - data[f'core{core_index}_usage'] = safely_retrieve_dimension( - netdata_metrics, f'truenas_cpu_usage.cpu{core_index}', f'cpu{core_index}', 0 - ) + data[f'cpu{core_index}'] = { + 'usage': safely_retrieve_dimension( + netdata_metrics, f'truenas_cpu_usage.cpu', f'cpu{core_index}', 0 + ), + 'temp': safely_retrieve_dimension( + netdata_metrics, 'cputemp.temperatures', f'cpu{core_index}', + ) or None + } + return data diff --git a/src/middlewared/middlewared/plugins/reporting/utils.py b/src/middlewared/middlewared/plugins/reporting/utils.py index ba68faaf9fa18..fa7d6361ae1e6 100644 --- a/src/middlewared/middlewared/plugins/reporting/utils.py +++ b/src/middlewared/middlewared/plugins/reporting/utils.py @@ -59,7 +59,6 @@ def get_metrics_approximation( ) -> dict: data = { 1: { - 'system.cpu': 10, 'system.clock_sync_state': 1, 'system.clock_status': 2, 'system.clock_sync_offset': 1, @@ -122,7 +121,7 @@ def get_metrics_approximation( 'cpu.usage': core_count + 1, # cputemp - 'cputemp.temperatures': core_count, + 'cputemp.temperatures': core_count + 1, # ups 'nut_ups.charge': 1, diff --git a/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_approximation.py b/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_approximation.py index 4b1a82b48bac4..cb6ee3a4f2d12 100644 --- a/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_approximation.py +++ b/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_approximation.py @@ -4,9 +4,9 @@ @pytest.mark.parametrize('disk_count,core_count,interface_count,services_count,vms_count,expected_output', [ - (4, 2, 1, 10, 2, {1: 708, 60: 4}), - (1600, 32, 4, 10, 1, {1: 8763, 60: 1600}), - (10, 16, 2, 12, 3, {1: 847, 60: 10}), + (4, 2, 1, 10, 2, {1: 699, 60: 4}), + (1600, 32, 4, 10, 1, {1: 8754, 60: 1600}), + (10, 16, 2, 12, 3, {1: 838, 60: 10}), ]) def test_netdata_metrics_count_approximation( disk_count, core_count, interface_count, services_count, vms_count, expected_output @@ -19,14 +19,14 @@ def test_netdata_metrics_count_approximation( @pytest.mark.parametrize( 'disk_count,core_count,interface_count,services_count,vms_count,days,' 'bytes_per_point,tier_interval,expected_output', [ - (4, 2, 1, 10, 2, 7, 1, 1, 408), + (4, 2, 1, 10, 2, 7, 1, 1, 403), (4, 2, 1, 10, 1, 7, 4, 60, 25), - (1600, 32, 4, 2, 4, 4, 1, 1, 2928), - (1600, 32, 4, 1, 4, 4, 4, 900, 13), - (10, 16, 2, 12, 1, 3, 1, 1, 185), + (1600, 32, 4, 2, 4, 4, 1, 1, 2925), + (1600, 32, 4, 1, 4, 4, 4, 900, 12), + (10, 16, 2, 12, 1, 3, 1, 1, 183), (10, 16, 2, 10, 3, 3, 4, 60, 13), - (1600, 32, 4, 12, 3, 18, 1, 1, 13196), - (1600, 32, 4, 12, 1, 18, 4, 900, 58), + (1600, 32, 4, 12, 3, 18, 1, 1, 13183), + (1600, 32, 4, 12, 1, 18, 4, 900, 57), ], ) def test_netdata_disk_space_approximation( diff --git a/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_stats_func.py b/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_stats_func.py index 2a47636f4a916..630894100223e 100644 --- a/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_stats_func.py +++ b/src/middlewared/middlewared/pytest/unit/plugins/reporting/test_netdata_stats_func.py @@ -8,52 +8,53 @@ NETDATA_ALL_METRICS = { - 'system.cpu': { - 'name': 'system.cpu', - 'family': 'cpu', - 'context': 'system.cpu', - 'units': 'percentage', - 'last_updated': 1691150349, - 'dimensions': { - 'guest_nice': { - 'name': 'guest_nice', - 'value': 0.5375124 - }, - 'guest': { - 'name': 'guest', - 'value': 0.5375124 - }, - 'steal': { - 'name': 'steal', - 'value': 0.5275124 + 'cputemp.temperatures': { + 'name': 'cputemp.temperatures', + 'family': 'temperature', + 'context': 'sensors.temperature', + 'units': 'Celsius', + 'last_updated': 1737452189, + 'dimensions': { + 'cpu0': { + 'name': 'cpu0', + 'value': 22 }, - 'softirq': { - 'name': 'softirq', - 'value': 0.5175124 + 'cpu1': { + 'name': 'cpu1', + 'value': 21 }, - 'irq': { - 'name': 'irq', - 'value': 0.4975124 - }, - 'user': { - 'name': 'user', - 'value': 0.4975124 + 'cpu2': { + 'name': 'cpu2', + 'value': 10 }, - 'system': { - 'name': 'system', - 'value': 0.4975124 + 'cpu': { + 'name': 'cpu', + 'value': 40 + } + } + }, + 'truenas_cpu_usage.cpu': { + 'name': 'truenas_cpu_usage.cpu', + 'family': 'cpu.usage', + 'context': 'Cpu usage', + 'units': 'CPU USAGE%', + 'last_updated': 1737452189, + 'dimensions': { + 'cpu': { + 'name': 'cpu', + 'value': 4 }, - 'nice': { - 'name': 'nice', - 'value': 49.75124 + 'cpu0': { + 'name': 'cpu0', + 'value': 2 }, - 'iowait': { - 'name': 'iowait', - 'value': 4.75124 + 'cpu1': { + 'name': 'cpu1', + 'value': 3 }, - 'idle': { - 'name': 'idle', - 'value': 99.0049751 + 'cpu2': { + 'name': 'cpu2', + 'value': 4 } } }, @@ -729,13 +730,17 @@ def test_arc_stats(): def test_cpu_stats(): cpu_stat = get_cpu_stats(NETDATA_ALL_METRICS) - total_sum = 0 for metric, value in cpu_stat.items(): - if metric == 'usage': - assert value == ((total_sum - cpu_stat['idle'] - cpu_stat['iowait']) / total_sum) * 100 - else: - assert value == safely_retrieve_dimension(NETDATA_ALL_METRICS, 'system.cpu', metric, 0) - total_sum += value + assert value == { + 'usage': safely_retrieve_dimension( + NETDATA_ALL_METRICS, + f'truenas_cpu_usage.cpu', + f'{metric}', 0 + ), + 'temp': safely_retrieve_dimension( + NETDATA_ALL_METRICS, 'cputemp.temperatures', metric, + ) or None + } def test_disk_stats(): diff --git a/src/middlewared/middlewared/pytest/unit/utils/test_cpu_usage_util.py b/src/middlewared/middlewared/pytest/unit/utils/test_cpu_usage_util.py index c3f820b73bd17..cec72deaea39d 100644 --- a/src/middlewared/middlewared/pytest/unit/utils/test_cpu_usage_util.py +++ b/src/middlewared/middlewared/pytest/unit/utils/test_cpu_usage_util.py @@ -19,9 +19,10 @@ def test_memory_stats(): with patch('builtins.open', mock_open(read_data=STAT)): - assert get_cpu_usage() == { - 'cpu': calculate_cpu_usage([269140, 3068, 35283, 1952826, 2648, 0, 1978, 1126, 12492, 0]), - 'cpu0': calculate_cpu_usage([89904, 989, 11999, 512042, 887, 0, 997, 361, 4042, 0]), - 'cpu1': calculate_cpu_usage([93084, 950, 12594, 716357, 845, 0, 604, 369, 4842, 0]), - 'cpu2': calculate_cpu_usage([86151, 1129, 10690, 724426, 914, 0, 376, 396, 3608, 0]), + + assert get_cpu_usage({cpu: [0] * 9 for cpu in ('cpu_usage', 'cpu0_usage', 'cpu1_usage', 'cpu2_usage')})[0] == { + 'cpu': calculate_cpu_usage([269140, 3068, 35283, 1952826, 2648, 0, 1978, 1126, 12492, 0], [0] * 9), + 'cpu0': calculate_cpu_usage([89904, 989, 11999, 512042, 887, 0, 997, 361, 4042, 0], [0] * 9), + 'cpu1': calculate_cpu_usage([93084, 950, 12594, 716357, 845, 0, 604, 369, 4842, 0], [0] * 9), + 'cpu2': calculate_cpu_usage([86151, 1129, 10690, 724426, 914, 0, 376, 396, 3608, 0], [0] * 9), } diff --git a/src/middlewared/middlewared/utils/metrics/cpu_usage.py b/src/middlewared/middlewared/utils/metrics/cpu_usage.py index 5a2ec05a8cdba..5734a9bf43e1a 100644 --- a/src/middlewared/middlewared/utils/metrics/cpu_usage.py +++ b/src/middlewared/middlewared/utils/metrics/cpu_usage.py @@ -1,24 +1,26 @@ -def calculate_cpu_usage(cpu_times: list[int]) -> float: +def calculate_cpu_usage(cur_cpu_times: list[int], old_cpu_times: list[int]) -> float: """ Calculate CPU usage as a percentage. Excludes 'idle' and 'iowait' times from the calculation. Args: - cpu_times (list[int]): List of CPU time values. + cur_cpu_times (list[int]): List of CPU time values. + old_cpu_times (list[int]): List of CPU time values. Returns: float: CPU usage percentage, rounded to two decimal places. """ - total_time = sum(cpu_times) + delta_time = list(map(lambda args: args[0] - args[1], zip(cur_cpu_times, old_cpu_times))) + total_time = sum(delta_time) if total_time: - idle_time = cpu_times[3] # Idle - iowait_time = cpu_times[4] # I/O Wait + idle_time = delta_time[3] # Idle + iowait_time = delta_time[4] # I/O Wait active_time = total_time - idle_time - iowait_time return round((active_time / total_time) * 100, 2) return 0.0 -def get_cpu_usage() -> dict[str, float]: +def get_cpu_usage(old_stats: dict | None = None) -> (dict[str, float], dict[str, list]): """ Retrieve CPU usage statistics from /proc/stat. @@ -26,7 +28,11 @@ def get_cpu_usage() -> dict[str, float]: dict[str, float]: Dictionary containing CPU usage percentages for the aggregate ('cpu') and each individual core ('cpu0', 'cpu1', ...). """ + # Calculation is inspired by how htop does it + # https://github.com/htop-dev/htop/blob/3a9f468c626b9261dc3a5234fc362303aeb5103d/linux/Platform.c#L320 + old_stats = old_stats or {} cpu_usage_data = {} + cached_values = {} with open('/proc/stat') as f: # Process only CPU-related lines for line in filter(lambda x: x.startswith('cpu'), f): @@ -35,7 +41,10 @@ def get_cpu_usage() -> dict[str, float]: # all cpu cores and the later representing the # cpu core specific values core, values = line.split(' ', 1) + cpu_stats = list(map(int, values.strip().split())) cpu_usage_data[core] = calculate_cpu_usage( - list(map(int, values.strip().split())) - ) - return cpu_usage_data + cpu_stats, old_stats.get(core, [0] * len(values)) + ) if old_stats else 0 + cached_values[core] = cpu_stats + + return cpu_usage_data, cached_values