Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-133683 / 25.04 / Refactor CPU reporting #15445

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/freenas/usr/lib/netdata/python.d/cputemp.chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,14 @@ def get_data(self):
cpu_temps = {}

data = {}
total_temp = 0
for core, temp in cpu_temps.items():
data[str(core)] = temp
data[f'cpu{core}'] = temp
total_temp += temp

return data or {str(i): 0 for i in range(cpu_info()['core_count'])}
if total_temp:
data['cpu'] = total_temp / len(data.keys())
return data or ({f'cpu{i}': 0 for i in range(cpu_info()['core_count'])} | {'cpu': 0})

def check(self):
try:
Expand Down
21 changes: 12 additions & 9 deletions src/freenas/usr/lib/netdata/python.d/truenas_cpu_usage.chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,24 @@
class Service(SimpleService):
def __init__(self, configuration=None, name=None):
SimpleService.__init__(self, configuration=configuration, name=name)
self.old_stats = {}

def check(self):
self.add_cpu_usage_to_charts()
return True

def get_data(self):
return get_cpu_usage()
data, self.old_stats = get_cpu_usage(self.old_stats)
return data

def add_cpu_usage_to_charts(self):
for cpu_name in get_cpu_usage().keys():
self.charts.add_chart([
cpu_name, cpu_name, cpu_name, 'CPU USAGE%',
'cpu.usage',
'Cpu usage',
'line',
])
data, self.old_stats = get_cpu_usage()
self.charts.add_chart([
'cpu', 'cpu', 'cpu', 'CPU USAGE%',
'cpu.usage',
'Cpu usage',
'line',
])

self.charts[cpu_name].add_dimension([cpu_name, 'usage', 'absolute'])
for cpu_name in filter(lambda s: s.startswith('cpu'), data.keys()):
self.charts['cpu'].add_dimension([f'{cpu_name}', f'{cpu_name}', 'absolute'])
12 changes: 1 addition & 11 deletions src/middlewared/middlewared/etc_files/netdata/netdata.conf.mako
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@
/proc/meminfo = no
/proc/net/dev = yes
/proc/pagetypeinfo = no
# /proc/stat = yes - we keep this uncommented as by default that enables it, for some reason
# if we explicitly set it to yes it still does not has the desired affect and we are not able
# to retrieve system.cpu stats
/proc/stat = no
/proc/uptime = yes
/proc/loadavg = yes
/proc/sys/kernel/random/entropy_avail = no
Expand Down Expand Up @@ -120,11 +118,3 @@

[plugin:cgroups]
enable by default cgroups names matching = !*udev* *

[plugin:proc:/proc/stat]
per cpu core utilization = no
context switches = no
cpu interrupts = no
processes started = no
processes running = no
cpu idle states = no
3 changes: 0 additions & 3 deletions src/middlewared/middlewared/plugins/reporting/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ def run_sync(self):
'failed_to_connect': False,
}

# CPU temperature
data['cpu']['temperature_celsius'] = self.middleware.call_sync('reporting.cpu_temperatures') or None

self.send_event('ADDED', fields=data)
time.sleep(interval)

Expand Down
11 changes: 6 additions & 5 deletions src/middlewared/middlewared/plugins/reporting/netdata/graphs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import typing

from middlewared.utils.cpu import cpu_info
from middlewared.utils.disk_temperatures import get_disks_for_temperature_reading

from .graph_base import GraphBase
Expand All @@ -13,12 +14,12 @@ class CPUPlugin(GraphBase):
vertical_label = '%CPU'

def get_chart_name(self, identifier: typing.Optional[str] = None) -> str:
return 'system.cpu'
return 'truenas_cpu_usage.cpu'

def query_parameters(self) -> dict:
return super().query_parameters() | {
'dimensions': 'system|user|idle|softirq|nice|iowait',
}
async def get_identifiers(self) -> typing.Optional[list]:
cpu_usage = [f'cpu{i}' for i in cpu_info()['core']]
cpu_usage.append('cpu')
return cpu_usage


class CPUTempPlugin(GraphBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,24 @@
from .utils import safely_retrieve_dimension


def calculate_usage(cpu_stats: dict) -> float:
cp_total = sum(cpu_stats.values())
return ((cp_total - cpu_stats['idle'] - cpu_stats['iowait']) / cp_total) * 100 if cp_total else 0


def get_cpu_stats(netdata_metrics: dict) -> dict:
metric_name = 'system.cpu'
fields = ['user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq', 'steal', 'guest', 'guest_nice']
data = {field: safely_retrieve_dimension(netdata_metrics, metric_name, field, 0) for field in fields}
data['aggregated_usage'] = safely_retrieve_dimension(netdata_metrics, 'truenas_cpu_usage.cpu', 'cpu', 0)
data = {
'cpu': {
'usage': safely_retrieve_dimension(
netdata_metrics, f'truenas_cpu_usage.cpu', 'cpu', 0
),
'temp': safely_retrieve_dimension(
netdata_metrics, 'cputemp.temperatures', 'cpu',
) or None
}}
for core_index in range(cpu_info()['core_count']):
data[f'core{core_index}_usage'] = safely_retrieve_dimension(
netdata_metrics, f'truenas_cpu_usage.cpu{core_index}', f'cpu{core_index}', 0
)
data[f'cpu{core_index}'] = {
'usage': safely_retrieve_dimension(
netdata_metrics, f'truenas_cpu_usage.cpu', f'cpu{core_index}', 0
),
'temp': safely_retrieve_dimension(
netdata_metrics, 'cputemp.temperatures', f'cpu{core_index}',
) or None
}

return data
3 changes: 1 addition & 2 deletions src/middlewared/middlewared/plugins/reporting/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def get_metrics_approximation(
) -> dict:
data = {
1: {
'system.cpu': 10,
'system.clock_sync_state': 1,
'system.clock_status': 2,
'system.clock_sync_offset': 1,
Expand Down Expand Up @@ -122,7 +121,7 @@ def get_metrics_approximation(
'cpu.usage': core_count + 1,

# cputemp
'cputemp.temperatures': core_count,
'cputemp.temperatures': core_count + 1,

# ups
'nut_ups.charge': 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@


@pytest.mark.parametrize('disk_count,core_count,interface_count,services_count,vms_count,expected_output', [
(4, 2, 1, 10, 2, {1: 708, 60: 4}),
(1600, 32, 4, 10, 1, {1: 8763, 60: 1600}),
(10, 16, 2, 12, 3, {1: 847, 60: 10}),
(4, 2, 1, 10, 2, {1: 699, 60: 4}),
(1600, 32, 4, 10, 1, {1: 8754, 60: 1600}),
(10, 16, 2, 12, 3, {1: 838, 60: 10}),
])
def test_netdata_metrics_count_approximation(
disk_count, core_count, interface_count, services_count, vms_count, expected_output
Expand All @@ -19,14 +19,14 @@ def test_netdata_metrics_count_approximation(
@pytest.mark.parametrize(
'disk_count,core_count,interface_count,services_count,vms_count,days,'
'bytes_per_point,tier_interval,expected_output', [
(4, 2, 1, 10, 2, 7, 1, 1, 408),
(4, 2, 1, 10, 2, 7, 1, 1, 403),
(4, 2, 1, 10, 1, 7, 4, 60, 25),
(1600, 32, 4, 2, 4, 4, 1, 1, 2928),
(1600, 32, 4, 1, 4, 4, 4, 900, 13),
(10, 16, 2, 12, 1, 3, 1, 1, 185),
(1600, 32, 4, 2, 4, 4, 1, 1, 2925),
(1600, 32, 4, 1, 4, 4, 4, 900, 12),
(10, 16, 2, 12, 1, 3, 1, 1, 183),
(10, 16, 2, 10, 3, 3, 4, 60, 13),
(1600, 32, 4, 12, 3, 18, 1, 1, 13196),
(1600, 32, 4, 12, 1, 18, 4, 900, 58),
(1600, 32, 4, 12, 3, 18, 1, 1, 13183),
(1600, 32, 4, 12, 1, 18, 4, 900, 57),
],
)
def test_netdata_disk_space_approximation(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,52 +8,53 @@


NETDATA_ALL_METRICS = {
'system.cpu': {
'name': 'system.cpu',
'family': 'cpu',
'context': 'system.cpu',
'units': 'percentage',
'last_updated': 1691150349,
'dimensions': {
'guest_nice': {
'name': 'guest_nice',
'value': 0.5375124
},
'guest': {
'name': 'guest',
'value': 0.5375124
},
'steal': {
'name': 'steal',
'value': 0.5275124
'cputemp.temperatures': {
'name': 'cputemp.temperatures',
'family': 'temperature',
'context': 'sensors.temperature',
'units': 'Celsius',
'last_updated': 1737452189,
'dimensions': {
'cpu0': {
'name': 'cpu0',
'value': 22
},
'softirq': {
'name': 'softirq',
'value': 0.5175124
'cpu1': {
'name': 'cpu1',
'value': 21
},
'irq': {
'name': 'irq',
'value': 0.4975124
},
'user': {
'name': 'user',
'value': 0.4975124
'cpu2': {
'name': 'cpu2',
'value': 10
},
'system': {
'name': 'system',
'value': 0.4975124
'cpu': {
'name': 'cpu',
'value': 40
}
}
},
'truenas_cpu_usage.cpu': {
'name': 'truenas_cpu_usage.cpu',
'family': 'cpu.usage',
'context': 'Cpu usage',
'units': 'CPU USAGE%',
'last_updated': 1737452189,
'dimensions': {
'cpu': {
'name': 'cpu',
'value': 4
},
'nice': {
'name': 'nice',
'value': 49.75124
'cpu0': {
'name': 'cpu0',
'value': 2
},
'iowait': {
'name': 'iowait',
'value': 4.75124
'cpu1': {
'name': 'cpu1',
'value': 3
},
'idle': {
'name': 'idle',
'value': 99.0049751
'cpu2': {
'name': 'cpu2',
'value': 4
}
}
},
Expand Down Expand Up @@ -729,13 +730,17 @@ def test_arc_stats():

def test_cpu_stats():
cpu_stat = get_cpu_stats(NETDATA_ALL_METRICS)
total_sum = 0
for metric, value in cpu_stat.items():
if metric == 'usage':
assert value == ((total_sum - cpu_stat['idle'] - cpu_stat['iowait']) / total_sum) * 100
else:
assert value == safely_retrieve_dimension(NETDATA_ALL_METRICS, 'system.cpu', metric, 0)
total_sum += value
assert value == {
'usage': safely_retrieve_dimension(
NETDATA_ALL_METRICS,
f'truenas_cpu_usage.cpu',
f'{metric}', 0
),
'temp': safely_retrieve_dimension(
NETDATA_ALL_METRICS, 'cputemp.temperatures', metric,
) or None
}


def test_disk_stats():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@

def test_memory_stats():
with patch('builtins.open', mock_open(read_data=STAT)):
assert get_cpu_usage() == {
'cpu': calculate_cpu_usage([269140, 3068, 35283, 1952826, 2648, 0, 1978, 1126, 12492, 0]),
'cpu0': calculate_cpu_usage([89904, 989, 11999, 512042, 887, 0, 997, 361, 4042, 0]),
'cpu1': calculate_cpu_usage([93084, 950, 12594, 716357, 845, 0, 604, 369, 4842, 0]),
'cpu2': calculate_cpu_usage([86151, 1129, 10690, 724426, 914, 0, 376, 396, 3608, 0]),

assert get_cpu_usage({cpu: [0] * 9 for cpu in ('cpu_usage', 'cpu0_usage', 'cpu1_usage', 'cpu2_usage')})[0] == {
'cpu': calculate_cpu_usage([269140, 3068, 35283, 1952826, 2648, 0, 1978, 1126, 12492, 0], [0] * 9),
'cpu0': calculate_cpu_usage([89904, 989, 11999, 512042, 887, 0, 997, 361, 4042, 0], [0] * 9),
'cpu1': calculate_cpu_usage([93084, 950, 12594, 716357, 845, 0, 604, 369, 4842, 0], [0] * 9),
'cpu2': calculate_cpu_usage([86151, 1129, 10690, 724426, 914, 0, 376, 396, 3608, 0], [0] * 9),
}
27 changes: 18 additions & 9 deletions src/middlewared/middlewared/utils/metrics/cpu_usage.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,38 @@
def calculate_cpu_usage(cpu_times: list[int]) -> float:
def calculate_cpu_usage(cur_cpu_times: list[int], old_cpu_times: list[int]) -> float:
"""
Calculate CPU usage as a percentage.
Excludes 'idle' and 'iowait' times from the calculation.
Args:
cpu_times (list[int]): List of CPU time values.
cur_cpu_times (list[int]): List of CPU time values.
old_cpu_times (list[int]): List of CPU time values.
Returns:
float: CPU usage percentage, rounded to two decimal places.
"""
total_time = sum(cpu_times)
delta_time = list(map(lambda args: args[0] - args[1], zip(cur_cpu_times, old_cpu_times)))
total_time = sum(delta_time)
if total_time:
idle_time = cpu_times[3] # Idle
iowait_time = cpu_times[4] # I/O Wait
idle_time = delta_time[3] # Idle
iowait_time = delta_time[4] # I/O Wait
active_time = total_time - idle_time - iowait_time
return round((active_time / total_time) * 100, 2)
return 0.0


def get_cpu_usage() -> dict[str, float]:
def get_cpu_usage(old_stats: dict | None = None) -> (dict[str, float], dict[str, list]):
"""
Retrieve CPU usage statistics from /proc/stat.
Returns:
dict[str, float]: Dictionary containing CPU usage percentages for
the aggregate ('cpu') and each individual core ('cpu0', 'cpu1', ...).
"""
# Calculation is inspired by how htop does it
# https://github.com/htop-dev/htop/blob/3a9f468c626b9261dc3a5234fc362303aeb5103d/linux/Platform.c#L320
old_stats = old_stats or {}
cpu_usage_data = {}
cached_values = {}
with open('/proc/stat') as f:
# Process only CPU-related lines
for line in filter(lambda x: x.startswith('cpu'), f):
Expand All @@ -35,7 +41,10 @@ def get_cpu_usage() -> dict[str, float]:
# all cpu cores and the later representing the
# cpu core specific values
core, values = line.split(' ', 1)
cpu_stats = list(map(int, values.strip().split()))
cpu_usage_data[core] = calculate_cpu_usage(
list(map(int, values.strip().split()))
)
return cpu_usage_data
cpu_stats, old_stats.get(core, [0] * len(values))
) if old_stats else 0
cached_values[core] = cpu_stats

return cpu_usage_data, cached_values
Loading