Skip to content

Commit 5602edd

Browse files
committed
🔔 added health check alerts
1 parent 5a5164d commit 5602edd

4 files changed

Lines changed: 73 additions & 3 deletions

File tree

‎simvue/metrics.py‎

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
import contextlib
1010
import logging
1111
import psutil
12-
12+
import os
13+
import typing
1314

1415
from .pynvml import (
1516
nvmlDeviceGetComputeRunningProcesses,
@@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]:
158159
_metrics: dict[str, float] = {
159160
f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent,
160161
f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory,
162+
f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent,
163+
f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent,
161164
}
162165

163166
for i, gpu in enumerate(self.gpus or []):
@@ -177,3 +180,11 @@ def gpu_percent(self) -> float:
177180
@property
178181
def gpu_memory(self) -> float:
179182
return sum(m[1] for m in self.gpus or []) / (len(self.gpus or []) or 1)
183+
184+
@property
def memory_available_percent(self) -> float:
    """Percentage of virtual memory that is not in use.

    Returns
    -------
    float
        100 minus the usage percentage reported by
        ``psutil.virtual_memory()``.
    """
    # The cast only informs the type checker; it is a no-op at runtime.
    _used_percent = typing.cast("float", psutil.virtual_memory().percent)
    return 100 - _used_percent
187+
188+
@property
def disk_available_percent(self) -> float:
    """Percentage of disk space that is not in use.

    The usage statistics are queried from ``psutil.disk_usage`` for the
    current working directory of the process.

    Returns
    -------
    float
        100 minus the reported disk usage percentage.
    """
    _usage = psutil.disk_usage(os.getcwd())
    return 100 - _usage.percent

‎simvue/run.py‎

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,27 @@ def _dispatch_callback(
499499

500500
return _dispatch_callback
501501

502+
def _define_system_health_alerts(self, terminate_on_alert: bool) -> None:
    """Register the metric threshold alerts used to monitor system health.

    Two alerts are created, in this order: one on the available virtual
    memory percentage and one on the available disk percentage. Each is
    configured with a threshold of 5, rule "is below", aggregation
    "at least one" and a window of 2.

    Parameters
    ----------
    terminate_on_alert : bool
        passed as ``trigger_abort`` to every alert created here.
    """
    # (alert name, resource metric name) pairs, in registration order.
    _health_alerts: tuple[tuple[str, str], ...] = (
        (
            "low_available_virtual_memory",
            f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage",
        ),
        (
            "low_disk_space",
            f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage",
        ),
    )

    for _alert_name, _metric_name in _health_alerts:
        # The return value of the alert creation is deliberately discarded.
        _ = self.create_metric_threshold_alert(
            name=_alert_name,
            metric=_metric_name,
            threshold=5,
            aggregation="at least one",
            window=2,
            rule="is below",
            trigger_abort=terminate_on_alert,
        )
522+
502523
def _start(self) -> bool:
503524
"""Start a run
504525
@@ -627,6 +648,7 @@ def init(
627648
retention_period: str | None = None,
628649
timeout: int | None = 180,
629650
visibility: typing.Literal["public", "tenant"] | list[str] | None = None,
651+
terminate_on_low_system_health: bool = True,
630652
no_color: bool = False,
631653
record_shell_vars: set[str] | None = None,
632654
) -> bool:
@@ -664,6 +686,10 @@ def init(
664686
* public - run viewable to all.
665687
* tenant - run viewable to all within the current tenant.
666688
* A list of usernames with which to share this run
689+
terminate_on_low_system_health : bool, optional
690+
whether to terminate this run if the resource metrics are
691+
registering unhealthy values, e.g. very low available memory
692+
default is True
667693
no_color : bool, optional
668694
disable terminal colors. Default False.
669695
record_shell_vars : list[str] | None,
@@ -774,6 +800,8 @@ def init(
774800
if self._status == "running":
775801
self._start()
776802

803+
self._define_system_health_alerts(terminate_on_low_system_health)
804+
777805
if self._user_config.run.mode == "online":
778806
click.secho(
779807
f"[simvue] Run {self.name} created",

‎simvue/system.py‎

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1+
"""
2+
System Information
3+
==================
4+
5+
Retrieve and assemble information on the current system.
6+
"""
7+
18
import os
29
import platform
310
import socket
411
import subprocess
512
import shutil
613
import sys
714
import contextlib
15+
import psutil
816
import typing
917

1018

@@ -60,6 +68,14 @@ def get_gpu_info():
6068
return _gpu_info
6169

6270

71+
def get_memory_info() -> dict[str, int]:
    """Get the total virtual and swap memory sizes of this system.

    Returns
    -------
    dict[str, int]
        the totals reported by psutil under the keys ``virtual`` and
        ``swap``, each floor-divided by ``1024**3``.
    """
    _divisor: int = 1024**3

    # The cast only informs the type checker; it is a no-op at runtime.
    _virtual_total = typing.cast("int", psutil.virtual_memory().total)
    _swap_total = psutil.swap_memory().total

    return {
        "virtual": _virtual_total // _divisor,
        "swap": _swap_total // _divisor,
    }
77+
78+
6379
def get_system() -> dict[str, typing.Any]:
6480
"""
6581
Get system details
@@ -76,6 +92,7 @@ def get_system() -> dict[str, typing.Any]:
7692
system["platform"]["system"] = platform.system()
7793
system["platform"]["release"] = platform.release()
7894
system["platform"]["version"] = platform.version()
95+
system["memory"] = {k: f"{v}GB" for k, v in get_memory_info().items()}
7996
system["cpu"] = {}
8097
system["cpu"]["arch"] = cpu[1]
8198
system["cpu"]["processor"] = cpu[0]

‎tests/functional/test_run_class.py‎

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@
1818
import concurrent.futures
1919
import random
2020
import datetime
21+
import json
2122
import simvue
22-
from simvue.api.objects import Alert, Metrics
23+
from simvue.api.objects import Alert, Metrics, Folder
2324
from simvue.api.objects.grids import GridMetrics
2425
from simvue.exception import ObjectNotFoundError, SimvueRunError
2526
from simvue.sender import Sender
2627
import simvue.run as sv_run
2728
import simvue.client as sv_cl
28-
import simvue.config.user as sv_cfg
2929

3030
from simvue.api.objects import Run as RunObject
3131

@@ -1052,6 +1052,7 @@ def test_update_tags_offline(
10521052

10531053

10541054
@pytest.mark.run
1055+
@pytest.mark.online
10551056
@pytest.mark.parametrize("object_type", ("DataFrame", "ndarray"))
10561057
def test_save_object(
10571058
create_plain_run: tuple[sv_run.Run, dict], object_type: str
@@ -1074,6 +1075,7 @@ def test_save_object(
10741075

10751076

10761077
@pytest.mark.run
1078+
@pytest.mark.online
10771079
def test_add_alerts() -> None:
10781080
_uuid = f"{uuid.uuid4()}".split("-")[0]
10791081

@@ -1259,6 +1261,7 @@ def test_add_alerts_offline(monkeypatch) -> None:
12591261

12601262

12611263
@pytest.mark.run
1264+
@pytest.mark.online
12621265
def test_log_alert() -> None:
12631266
_uuid = f"{uuid.uuid4()}".split("-")[0]
12641267

@@ -1309,6 +1312,7 @@ def test_log_alert() -> None:
13091312

13101313

13111314
@pytest.mark.run
1315+
@pytest.mark.online
13121316
def test_abort_on_alert_process(mocker: pytest_mock.MockerFixture) -> None:
13131317
def testing_exit(status: int) -> None:
13141318
raise SystemExit(status)
@@ -1362,6 +1366,7 @@ def abort_callback(abort_run=trigger) -> None:
13621366

13631367

13641368
@pytest.mark.run
1369+
@pytest.mark.online
13651370
def test_abort_on_alert_python(
13661371
speedy_heartbeat, create_plain_run: tuple[sv_run.Run, dict], mocker: pytest_mock.MockerFixture
13671372
) -> None:
@@ -1382,6 +1387,7 @@ def test_abort_on_alert_python(
13821387

13831388

13841389
@pytest.mark.run
1390+
@pytest.mark.online
13851391
def test_abort_on_alert_raise(
13861392
create_plain_run: tuple[sv_run.Run, dict]
13871393
) -> None:
@@ -1406,6 +1412,7 @@ def test_abort_on_alert_raise(
14061412

14071413

14081414
@pytest.mark.run
1415+
@pytest.mark.online
14091416
def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
14101417
run, _ = create_plain_run
14111418
run.config(system_metrics_interval=1)
@@ -1421,6 +1428,7 @@ def test_kill_all_processes(create_plain_run: tuple[sv_run.Run, dict]) -> None:
14211428

14221429

14231430
@pytest.mark.run
1431+
@pytest.mark.online
14241432
def test_run_created_with_no_timeout() -> None:
14251433
_uuid = f"{uuid.uuid4()}".split("-")[0]
14261434
with simvue.Run() as run:
@@ -1443,6 +1451,7 @@ def test_run_created_with_no_timeout() -> None:
14431451

14441452
@pytest.mark.parametrize("mode", ("online", "offline"), ids=("online", "offline"))
14451453
@pytest.mark.run
1454+
@pytest.mark.online
14461455
def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
14471456
temp_d: tempfile.TemporaryDirectory | None = None
14481457
_uuid = f"{uuid.uuid4()}".split("-")[0]
@@ -1486,6 +1495,7 @@ def test_reconnect_functionality(mode, monkeypatch: pytest.MonkeyPatch) -> None:
14861495

14871496

14881497
@pytest.mark.run
1498+
@pytest.mark.online
14891499
def test_env_var_metadata() -> None:
14901500
# Add some environment variables to glob
14911501
_recorded_env = {
@@ -1506,6 +1516,7 @@ def test_env_var_metadata() -> None:
15061516
assert all(key in _recorded_meta.get("shell") for key in _recorded_env)
15071517

15081518
@pytest.mark.run
1519+
@pytest.mark.online
15091520
def test_reconnect_with_process() -> None:
15101521
_uuid = f"{uuid.uuid4()}".split("-")[0]
15111522
with simvue.Run() as run:
@@ -1537,6 +1548,8 @@ def test_reconnect_with_process() -> None:
15371548
@pytest.mark.parametrize(
15381549
"environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs")
15391550
)
1551+
@pytest.mark.run
1552+
@pytest.mark.online
15401553
def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None:
15411554
"""Tests that the environment information is compatible with the server."""
15421555
from simvue.config.user import SimvueConfiguration
@@ -1558,3 +1571,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi
15581571
)
15591572
run.update_metadata(env_func(_target_dir))
15601573

1574+

0 commit comments

Comments
 (0)