152 lines
9.6 KiB
Python
152 lines
9.6 KiB
Python
import logging
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── In-memory threshold cache ──────────────────────────────────────────────────
|
|
# Loaded from DB on first use; invalidated by settings API after updates.
|
|
# Falls back to hard-coded defaults if DB has no rows yet (pre-seed).
|
|
|
|
# Sites whose cached rules are stale and must be re-read from the database.
_dirty_sites: set[str] = {"sg-01"}  # start dirty so first request loads from DB

# Threshold rules per site: site_id -> list of rule dicts (one dict per rule).
_caches: dict[str, list[dict]] = {}
|
|
|
|
|
|
def invalidate_threshold_cache(site_id: str = "sg-01") -> None:
    """Flag the cached threshold rules of ``site_id`` as stale.

    Called by the settings API after threshold changes. The cached rules are
    not touched here; the site is only recorded in ``_dirty_sites`` so that
    the next cache check reloads it.
    """
    _dirty_sites.update((site_id,))
|
|
|
|
|
|
async def _ensure_cache(session: AsyncSession, site_id: str) -> None:
    """Make sure ``_caches[site_id]`` holds up-to-date threshold rules.

    Returns immediately when the site is already cached and not marked dirty.
    Otherwise the enabled rows of ``alarm_thresholds`` for the site are read
    (ordered by ``id``) and stored as a list of plain dicts. When the query
    returns no rows, a copy of ``_FALLBACK_RULES`` is cached instead.

    Args:
        session: Async SQLAlchemy session used to run the query.
        site_id: Site whose rules should be cached.

    Raises:
        Any exception raised while running the query is propagated; the site
        is re-marked dirty first so that a later call retries the load.
    """
    if site_id not in _dirty_sites and site_id in _caches:
        return

    # Clear the dirty flag *before* awaiting the query. If an invalidation
    # arrives while the query is in flight it re-adds the site, forcing one
    # more reload; discarding the flag after the await would erase that
    # invalidation and leave stale rules cached.
    _dirty_sites.discard(site_id)
    try:
        result = await session.execute(text("""
            SELECT sensor_type, threshold_value, direction, severity, message_template
            FROM alarm_thresholds
            WHERE site_id = :site_id AND enabled = true
            ORDER BY id
        """), {"site_id": site_id})
        rows = result.mappings().all()
    except BaseException:
        # BaseException so that task cancellation also restores the flag.
        _dirty_sites.add(site_id)
        raise

    # No rows means the DB is not yet seeded: fall back to hard-coded defaults.
    # Every rule is copied so the per-site cache never aliases the shared
    # fallback list (or its dicts).
    rules = [dict(rule) for rule in (rows or _FALLBACK_RULES)]
    _caches[site_id] = rules
    logger.info("Loaded %d threshold rules for %s", len(rules), site_id)
|
|
|
|
|
|
def _first_breach_per_severity(
    rules: list[dict],
    sensor_type: str,
    value: float,
) -> dict[str, dict | None]:
    """Map each severity configured for ``sensor_type`` to its first breached rule.

    A severity maps to ``None`` when none of its rules is breached by
    ``value``. Severities appear in the order their first rule occurs in
    ``rules``. A rule whose direction is neither "above" nor "below" is
    never considered breached.
    """
    outcome: dict[str, dict | None] = {}
    for rule in rules:
        if rule["sensor_type"] != sensor_type:
            continue

        threshold = rule["threshold_value"]
        direction = rule["direction"]
        breached = (
            (direction == "above" and value > threshold)
            or (direction == "below" and value < threshold)
        )

        severity = rule["severity"]
        # Keep the first breached rule; never let a later, non-breached rule
        # of the same severity overwrite it.
        if outcome.get(severity) is None:
            outcome[severity] = rule if breached else None
    return outcome


async def check_and_update_alarms(
    session: AsyncSession,
    sensor_id: str,
    sensor_type: str,
    site_id: str,
    room_id: str | None,
    rack_id: str | None,
    value: float,
) -> None:
    """Raise or resolve alarms for one sensor reading.

    The site's threshold rules are loaded (via ``_ensure_cache``) and every
    rule matching ``sensor_type`` is evaluated against ``value``. Alarms are
    tracked per (sensor, severity), so the decision is made once per severity:

    * if any rule of that severity is breached and no active alarm exists for
      the sensor at that severity, an alarm row is inserted using the message
      template of the first breached rule;
    * if no rule of that severity is breached, all active alarms for the
      sensor at that severity are marked resolved.

    Deciding per severity rather than per rule matters when several rules
    share a severity (e.g. two "critical" thresholds for one sensor type):
    evaluating them one by one would let a non-breached rule resolve the
    alarm that a breached sibling rule had just raised.

    Args:
        session: Async SQLAlchemy session; statements are executed on it but
            nothing is committed here.
        sensor_id: Identifier of the sensor that produced the reading.
        sensor_type: Rule category to match against ``rule["sensor_type"]``.
        site_id: Site whose threshold rules apply.
        room_id: Stored on a newly inserted alarm row; may be ``None``.
        rack_id: Stored on a newly inserted alarm row; may be ``None``.
        value: The reading compared against each rule's threshold.
    """
    await _ensure_cache(session, site_id)

    outcome = _first_breach_per_severity(
        _caches.get(site_id, []), sensor_type, value
    )

    for severity, rule in outcome.items():
        if rule is None:
            # Nothing of this severity is breached: clear any active alarm.
            await session.execute(text("""
                UPDATE alarms
                SET state = 'resolved', resolved_at = NOW()
                WHERE sensor_id = :sid AND severity = :sev AND state = 'active'
            """), {"sid": sensor_id, "sev": severity})
            continue

        existing = await session.execute(text("""
            SELECT id FROM alarms
            WHERE sensor_id = :sid AND severity = :sev AND state = 'active'
            LIMIT 1
        """), {"sid": sensor_id, "sev": severity})
        if existing.fetchone():
            # An alarm of this severity is already active; do not duplicate it.
            continue

        message = rule["message_template"].format(value=value, sensor_id=sensor_id)
        await session.execute(text("""
            INSERT INTO alarms
                (sensor_id, site_id, room_id, rack_id, severity, message, state, triggered_at)
            VALUES
                (:sensor_id, :site_id, :room_id, :rack_id, :severity, :message, 'active', NOW())
        """), {
            "sensor_id": sensor_id, "site_id": site_id,
            "room_id": room_id, "rack_id": rack_id,
            "severity": severity, "message": message,
        })
        logger.info("Alarm raised [%s]: %s", severity, message)
|
|
|
|
|
|
# ── Hard-coded fallback (used before DB seed runs) ─────────────────────────────
|
|
|
|
_FALLBACK_RULES: list[dict] = [
|
|
{"sensor_type": st, "threshold_value": tv, "direction": d, "severity": s, "message_template": m}
|
|
for st, tv, d, s, m in [
|
|
("temperature", 28.0, "above", "warning", "Temperature elevated at {sensor_id}: {value:.1f}°C"),
|
|
("temperature", 32.0, "above", "critical", "Temperature critical at {sensor_id}: {value:.1f}°C"),
|
|
("humidity", 65.0, "above", "warning", "Humidity elevated at {sensor_id}: {value:.0f}%"),
|
|
("power_kw", 7.5, "above", "warning", "PDU load elevated at {sensor_id}: {value:.1f} kW"),
|
|
("power_kw", 9.5, "above", "critical", "PDU load critical at {sensor_id}: {value:.1f} kW"),
|
|
("ups_charge", 80.0, "below", "warning", "UPS battery low at {sensor_id}: {value:.0f}%"),
|
|
("ups_charge", 50.0, "below", "critical", "UPS battery critical at {sensor_id}: {value:.0f}%"),
|
|
("ups_state", 0.5, "above", "critical", "UPS switched to battery at {sensor_id} — mains power lost"),
|
|
("ups_state", 1.5, "above", "critical", "UPS overloaded at {sensor_id} — immediate risk of failure"),
|
|
("ups_load", 85.0, "above", "warning", "UPS load high at {sensor_id}: {value:.0f}%"),
|
|
("ups_load", 95.0, "above", "critical", "UPS load critical at {sensor_id}: {value:.0f}% — overload"),
|
|
("ups_runtime", 15.0, "below", "warning", "UPS runtime low at {sensor_id}: {value:.0f} min remaining"),
|
|
("ups_runtime", 5.0, "below", "critical", "UPS runtime critical at {sensor_id}: {value:.0f} min — imminent shutdown"),
|
|
("leak", 0.5, "above", "critical", "Water leak detected at {sensor_id}!"),
|
|
("cooling_cap_pct", 90.0, "above", "warning", "CRAC near capacity limit at {sensor_id}: {value:.1f}%"),
|
|
("cooling_cop", 1.5, "below", "warning", "CRAC running inefficiently at {sensor_id}: COP {value:.2f}"),
|
|
("cooling_comp_load", 95.0, "above", "warning", "CRAC compressor overloaded at {sensor_id}: {value:.1f}%"),
|
|
("cooling_high_press", 22.0, "above", "critical", "CRAC high refrigerant pressure at {sensor_id}: {value:.1f} bar"),
|
|
("cooling_low_press", 3.0, "below", "critical", "CRAC low refrigerant pressure at {sensor_id}: {value:.1f} bar — possible leak"),
|
|
("cooling_superheat", 16.0, "above", "warning", "CRAC discharge superheat high at {sensor_id}: {value:.1f}°C"),
|
|
("cooling_filter_dp", 80.0, "above", "warning", "CRAC filter requires attention at {sensor_id}: {value:.0f} Pa"),
|
|
("cooling_filter_dp", 120.0, "above", "critical", "CRAC filter critically blocked at {sensor_id}: {value:.0f} Pa — replace now"),
|
|
("cooling_return", 36.0, "above", "warning", "CRAC return air temperature high at {sensor_id}: {value:.1f}°C"),
|
|
("cooling_return", 42.0, "above", "critical", "CRAC return air temperature critical at {sensor_id}: {value:.1f}°C"),
|
|
("gen_fuel_pct", 25.0, "below", "warning", "Generator fuel low at {sensor_id}: {value:.1f}%"),
|
|
("gen_fuel_pct", 10.0, "below", "critical", "Generator fuel critical at {sensor_id}: {value:.1f}%"),
|
|
("gen_state", 0.5, "above", "warning", "Generator running at {sensor_id} — site is on standby power"),
|
|
("gen_state", -0.5, "below", "critical", "Generator fault at {sensor_id} — no standby power available"),
|
|
("gen_load_pct", 85.0, "above", "warning", "Generator load high at {sensor_id}: {value:.1f}%"),
|
|
("gen_load_pct", 95.0, "above", "critical", "Generator overloaded at {sensor_id}: {value:.1f}%"),
|
|
("gen_coolant_c", 95.0, "above", "warning", "Generator coolant temperature high at {sensor_id}: {value:.1f}°C"),
|
|
("gen_coolant_c", 105.0, "above", "critical", "Generator coolant critical at {sensor_id}: {value:.1f}°C — risk of shutdown"),
|
|
("gen_oil_press", 2.0, "below", "critical", "Generator oil pressure low at {sensor_id}: {value:.1f} bar"),
|
|
("pdu_imbalance", 5.0, "above", "warning", "PDU phase imbalance at {sensor_id}: {value:.1f}%"),
|
|
("pdu_imbalance", 15.0, "above", "critical", "PDU phase imbalance critical at {sensor_id}: {value:.1f}%"),
|
|
("ats_active", 1.5, "above", "warning", "ATS transferred to generator at {sensor_id} — utility power lost"),
|
|
("ats_ua_v", 50.0, "below", "critical", "Utility A power failure at {sensor_id} — supply lost"),
|
|
("chiller_state", 0.5, "below", "critical", "Chiller fault at {sensor_id} — CHW supply lost"),
|
|
("chiller_cop", 2.5, "below", "warning", "Chiller running inefficiently at {sensor_id}: COP {value:.2f}"),
|
|
("vesda_level", 0.5, "above", "warning", "VESDA smoke detected at {sensor_id}: level elevated"),
|
|
("vesda_level", 1.5, "above", "warning", "VESDA action threshold reached at {sensor_id}"),
|
|
("vesda_level", 2.5, "above", "critical", "VESDA FIRE ALARM at {sensor_id}!"),
|
|
("vesda_flow", 0.5, "below", "critical", "VESDA aspirator flow fault at {sensor_id} — detector may be compromised"),
|
|
("vesda_det1", 0.5, "below", "warning", "VESDA detector 1 fault at {sensor_id}"),
|
|
("vesda_det2", 0.5, "below", "warning", "VESDA detector 2 fault at {sensor_id}"),
|
|
("net_state", 0.5, "above", "warning", "Network switch degraded at {sensor_id}"),
|
|
("net_state", 1.5, "above", "critical", "Network switch down at {sensor_id} — connectivity lost"),
|
|
("net_pkt_loss_pct", 1.0, "above", "warning", "Packet loss detected at {sensor_id}: {value:.1f}%"),
|
|
("net_pkt_loss_pct", 5.0, "above", "critical", "High packet loss at {sensor_id}: {value:.1f}%"),
|
|
("net_temp_c", 65.0, "above", "warning", "Switch temperature high at {sensor_id}: {value:.1f}°C"),
|
|
("net_temp_c", 75.0, "above", "critical", "Switch temperature critical at {sensor_id}: {value:.1f}°C"),
|
|
]
|
|
]
|