diff --git a/demos/README.md b/demos/README.md index 13a00ebea..1384e3884 100644 --- a/demos/README.md +++ b/demos/README.md @@ -24,6 +24,18 @@ Interactive demonstrations of NVSentinel's core capabilities that run locally on **Best for:** Understanding how NVSentinel's node-drainer can delegate pod eviction to external controllers for custom drain workflows coordinated with HPC schedulers. +### [Fabric Manager Monitor](fabric-manager-monitor/) + +**What it shows:** Standalone DaemonSet that detects Fabric Manager failures, PCIe link degradation, NVLink fabric issues, GPU clock throttling, and CUDA context failures — all invisible to DCGM-based monitoring. + +**Requirements:** Docker, kubectl, Kubernetes cluster with GPU nodes, Prometheus Operator + +**Best for:** Catching GPU infrastructure failures that NVSentinel's existing health monitors miss. Validated on P4d.24xlarge (A100-SXM4) with Amazon Linux 2023. + +**Related issue:** [#883](https://github.com/NVIDIA/NVSentinel/issues/883) + +**Note:** For native NVSentinel integration (gRPC HealthEvents to platform-connector), see [`health-monitors/fabric-manager-monitor/`](../health-monitors/fabric-manager-monitor/). + ## Coming Soon diff --git a/demos/fabric-manager-monitor/Dockerfile b/demos/fabric-manager-monitor/Dockerfile new file mode 100644 index 000000000..017a65c17 --- /dev/null +++ b/demos/fabric-manager-monitor/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.11-slim + +# nsenter is in util-linux (already in slim), but ensure it's available +RUN apt-get update && apt-get install -y --no-install-recommends \ + util-linux \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY config.py metrics.py monitor.py ./ +COPY checks/ ./checks/ + +# nsenter requires root to enter host namespaces. +# The DaemonSet securityContext controls the actual privilege level. 
+
+EXPOSE 9101
+
+ENTRYPOINT ["python", "monitor.py"]
diff --git a/demos/fabric-manager-monitor/README.md b/demos/fabric-manager-monitor/README.md
new file mode 100644
index 000000000..1aacef86b
--- /dev/null
+++ b/demos/fabric-manager-monitor/README.md
@@ -0,0 +1,84 @@
+# Fabric Manager & GPU Node Health Validator
+
+A standalone DaemonSet companion to NVSentinel that catches GPU infrastructure failures invisible to telemetry-based monitoring.
+
+**Related issue:** [#883 - NVSentinel not detecting fabric health on H100s](https://github.com/NVIDIA/NVSentinel/issues/883)
+
+## Problem
+
+NVIDIA Fabric Manager can fail and stay broken for weeks, undetected. NVSentinel's existing monitors (DCGM-based, syslog-based) miss it because individual GPUs appear healthy to DCGM even when Fabric Manager is down. This tool fills the gap with service-level health checks.
+
+**Requirements:** Kubernetes cluster with GPU nodes, Prometheus Operator
+
+## What It Monitors
+
+| # | Check | What It Catches | Method |
+|---|-------|-----------------|--------|
+| 1 | **Fabric Manager Service** | FM not running, flapping, error state | `nsenter` + `systemctl` |
+| 2 | **Critical GPU Services** | persistenced, DCGM dead | `nsenter` + `systemctl` |
+| 3 | **PCIe Link Health** | Link downtraining (Gen5->Gen3, x16->x8) | `nsenter` + `nvidia-smi` |
+| 4 | **NVLink Fabric** | Bandwidth zero with FM down, CRC errors | DCGM metrics HTTP |
+| 5 | **CUDA Validation** | Context failures, memory errors | PyTorch subprocess |
+| 6 | **Clock & Throttle** | Silent throttling without XID | `nsenter` + `nvidia-smi` |
+
+## Quick Start
+
+```bash
+# Build (the tag must match the image pinned in k8s/daemonset.yaml)
+docker build -t fabric-manager-monitor:0.1.0 .
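+# Note (assumption: your cluster pulls images from a registry rather than
+# using preloaded node images): the DaemonSet references
+# fabric-manager-monitor:0.1.0, so the built image must be pullable by the
+# GPU nodes. The registry host below is a placeholder, not a real endpoint:
+#   docker tag fabric-manager-monitor:<local-tag> registry.example.com/fabric-manager-monitor:0.1.0
+#   docker push registry.example.com/fabric-manager-monitor:0.1.0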
+
+# Deploy (assumes nvsentinel namespace exists)
+kubectl apply -f k8s/rbac.yaml
+kubectl apply -f k8s/configmap.yaml
+kubectl apply -f k8s/daemonset.yaml
+kubectl apply -f k8s/servicemonitor.yaml
+
+# Verify
+kubectl get ds -n nvsentinel fabric-manager-monitor
+
+# Port-forward to a specific node's pod
+NODE=
+POD=$(kubectl get pod -n nvsentinel -o wide --field-selector spec.nodeName=${NODE} \
+  -l app.kubernetes.io/name=fabric-manager-monitor -o jsonpath='{.items[0].metadata.name}')
+kubectl port-forward -n nvsentinel pod/${POD} 9101:9101
+curl -s localhost:9101/metrics | grep fabric_manager_up
+```
+
+## Metrics
+
+Metrics are exposed on port 9101. Key metrics:
+
+| Metric | Description |
+|--------|-------------|
+| `fabric_manager_up` | Fabric Manager running (1/0) |
+| `gpu_node_health_up` | Overall node health (1/0) |
+| `nvidia_service_up` | Per-service status |
+| `pcie_link_degraded` | PCIe link degraded per GPU |
+| `nvlink_fabric_healthy` | NVLink health |
+| `gpu_clock_throttled` | Clock throttled per GPU |
+| `gpu_clock_ratio` | Current/max clock ratio |
+
+## Alert Rules
+
+`k8s/servicemonitor.yaml` also defines a PrometheusRule with 7 alerts:
+- `FabricManagerDown` (critical, 5m)
+- `FabricManagerFlapping` (warning, 5m)
+- `NVLinkFabricDegraded` (critical, 5m) — correlated: requires FM down AND NVLink degraded
+- `GPUPCIeLinkDegraded` (warning, 5m)
+- `GPUClockThrottled` (warning, 10m)
+- `GPUServiceDown` (critical, 3m)
+- `CUDAValidationFailed` (critical, 5m)
+
+## Validated On
+
+- 2x P4d.24xlarge (8x A100-SXM4-40GB each) — Amazon Linux 2023, EKS 1.32
+- All 6 check categories produce correct metrics
+- GPU Idle downclocking correctly filtered as benign
+
+## Configuration
+
+All settings are configured via ConfigMap environment variables. See `k8s/configmap.yaml`.
+
+## Relationship to NVSentinel
+
+This is a **standalone companion tool** that exposes Prometheus metrics and alerts. It does not integrate with NVSentinel's gRPC event pipeline or remediation workflow.
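Because the integration surface is plain Prometheus exposition text, any scraper or ad-hoc script can consume the monitor's output. A minimal parsing sketch in Python; the `parse_gauges` helper and the sample payload are illustrative, not part of the monitor:

```python
# Hypothetical helper: parse Prometheus exposition text, such as the output of
# `curl localhost:9101/metrics`. The sample payload below is made up.
def parse_gauges(text: str) -> dict:
    """Return {metric_name: [values]} from Prometheus text format."""
    metrics: dict = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # Label values may contain spaces, so split on the closing brace
        # when labels are present.
        if "}" in line:
            name = line.split("{", 1)[0]
            value = float(line.rpartition("}")[2].split()[0])
        else:
            name, raw = line.split()[:2]
            value = float(raw)
        metrics.setdefault(name, []).append(value)
    return metrics

sample = (
    'fabric_manager_up{node="gpu-node-1"} 1\n'
    'gpu_clock_ratio{node="gpu-node-1",gpu="0"} 0.987\n'
    'gpu_clock_ratio{node="gpu-node-1",gpu="1"} 1.0\n'
)
gauges = parse_gauges(sample)
print(gauges["fabric_manager_up"])  # [1.0]
```

The same approach works against a live capture of the `/metrics` endpoint.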
See the native `health-monitors/fabric-manager-monitor/` for an integrated version that emits HealthEvents to platform-connector. diff --git a/demos/fabric-manager-monitor/checks/__init__.py b/demos/fabric-manager-monitor/checks/__init__.py new file mode 100644 index 000000000..45e38d98c --- /dev/null +++ b/demos/fabric-manager-monitor/checks/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Health check modules for GPU Node Health Validator.""" diff --git a/demos/fabric-manager-monitor/checks/clock_check.py b/demos/fabric-manager-monitor/checks/clock_check.py new file mode 100644 index 000000000..ac1ca94b7 --- /dev/null +++ b/demos/fabric-manager-monitor/checks/clock_check.py @@ -0,0 +1,157 @@ +"""Check 6: Clock and throttle detection. + +Detects silent GPU throttling by comparing current clocks against maximum +and querying active throttle reasons. Catches performance degradation +that doesn't generate XID errors. 
+""" + +import logging +import subprocess +from dataclasses import dataclass +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class ClockStatus: + """Clock and throttle status for a single GPU.""" + gpu_index: int + graphics_clock_current: int # MHz + graphics_clock_max: int # MHz + mem_clock_current: int # MHz + mem_clock_max: int # MHz + clock_ratio: float # current/max (graphics) + throttled: bool + throttle_reasons: str = "" + error: Optional[str] = None + + +class ClockChecker: + """Detects GPU clock throttling.""" + + def __init__(self, throttle_ratio: float = 0.85): + self._throttle_ratio = throttle_ratio + + # Throttle reasons that are benign (not actual degradation) + _BENIGN_REASONS = { + "Not Active", + "0x0000000000000000", # No throttle + "0x0000000000000001", # GPU Idle — normal when no workload running + } + + def check(self) -> List[ClockStatus]: + """Query clocks and throttle reasons for all GPUs.""" + clocks = self._query_clocks() + reasons = self._query_throttle_reasons() + + # Merge throttle reasons into clock results + reason_map = {r["gpu_index"]: r["reasons"] for r in reasons} + for status in clocks: + reason_str = reason_map.get(status.gpu_index, "") + status.throttle_reasons = reason_str + + # GPU Idle causes low clock ratio but isn't a real throttle. + # Only flag as throttled for non-benign reasons. 
+ if reason_str in self._BENIGN_REASONS: + status.throttled = False + elif reason_str: + status.throttled = True + + return clocks + + def _query_clocks(self) -> List[ClockStatus]: + """Get current vs max clocks from nvidia-smi.""" + try: + result = subprocess.run( + [ + "nsenter", "-t", "1", "-m", "--", + "nvidia-smi", + "--query-gpu=index,clocks.current.graphics,clocks.max.graphics," + "clocks.current.memory,clocks.max.memory", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + logger.error("nvidia-smi clock query failed: %s", result.stderr.strip()) + return [] + + return self._parse_clocks(result.stdout) + + except subprocess.TimeoutExpired: + logger.error("nvidia-smi clock query timed out") + return [] + except FileNotFoundError: + logger.error("nvidia-smi not found") + return [] + except Exception as e: + logger.error("Clock check failed: %s", e) + return [] + + def _parse_clocks(self, output: str) -> List[ClockStatus]: + results = [] + for line in output.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) != 5: + continue + try: + idx = int(parts[0]) + gfx_cur = int(parts[1]) + gfx_max = int(parts[2]) + mem_cur = int(parts[3]) + mem_max = int(parts[4]) + + ratio = gfx_cur / gfx_max if gfx_max > 0 else 0.0 + throttled = ratio < self._throttle_ratio + + results.append(ClockStatus( + gpu_index=idx, + graphics_clock_current=gfx_cur, + graphics_clock_max=gfx_max, + mem_clock_current=mem_cur, + mem_clock_max=mem_max, + clock_ratio=round(ratio, 3), + throttled=throttled, + )) + except (ValueError, IndexError, ZeroDivisionError) as e: + logger.warning("Failed to parse clock line '%s': %s", line, e) + + return results + + def _query_throttle_reasons(self) -> List[dict]: + """Get active throttle reasons from nvidia-smi.""" + try: + result = subprocess.run( + [ + "nsenter", "-t", "1", "-m", "--", + "nvidia-smi", + 
"--query-gpu=index,clocks_throttle_reasons.active", + "--format=csv,noheader", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + return [] + + reasons = [] + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",", 1)] + if len(parts) == 2: + try: + reasons.append({ + "gpu_index": int(parts[0]), + "reasons": parts[1], + }) + except ValueError: + pass + return reasons + + except Exception: + return [] diff --git a/demos/fabric-manager-monitor/checks/cuda_validation.py b/demos/fabric-manager-monitor/checks/cuda_validation.py new file mode 100644 index 000000000..7c1e0f8ec --- /dev/null +++ b/demos/fabric-manager-monitor/checks/cuda_validation.py @@ -0,0 +1,104 @@ +"""Check 5: CUDA validation — context creation and memory test. + +Runs a minimal CUDA test on each GPU: allocate memory, write a pattern, +read back, verify. Optional P2P test copies data between GPU pairs. +This check runs at a slower cadence (default 10 minutes) since it +consumes GPU resources. +""" + +import logging +import subprocess +import sys +import textwrap +from dataclasses import dataclass, field +from typing import List, Optional + +logger = logging.getLogger(__name__) + +# Inline Python script executed as a subprocess so that a PyTorch import +# failure doesn't crash the main monitor process. 
+_CUDA_TEST_SCRIPT = textwrap.dedent("""\ + import sys + import json + + results = {"passed": True, "gpu_count": 0, "errors": []} + + try: + import torch + except ImportError: + results["errors"].append("PyTorch not available") + results["passed"] = False + print(json.dumps(results)) + sys.exit(0) + + gpu_count = torch.cuda.device_count() + results["gpu_count"] = gpu_count + + if gpu_count == 0: + results["errors"].append("No CUDA devices found") + results["passed"] = False + print(json.dumps(results)) + sys.exit(0) + + for i in range(gpu_count): + try: + torch.cuda.set_device(i) + # Allocate, write, read back, verify + t = torch.randn(1024, device="cuda") + assert t.sum().isfinite(), f"GPU {i}: non-finite sum" + del t + torch.cuda.empty_cache() + except Exception as e: + results["errors"].append(f"GPU {i}: {e}") + results["passed"] = False + + print(json.dumps(results)) +""") + + +@dataclass +class CUDAValidationResult: + """Result of CUDA validation across all GPUs.""" + passed: bool + gpu_count: int = 0 + errors: List[str] = field(default_factory=list) + error: Optional[str] = None # check-level error (couldn't run at all) + + +class CUDAValidator: + """Validates CUDA context creation and memory on each GPU.""" + + def check(self) -> CUDAValidationResult: + """Run CUDA validation script as a subprocess.""" + try: + result = subprocess.run( + [sys.executable, "-c", _CUDA_TEST_SCRIPT], + capture_output=True, + text=True, + timeout=120, # generous timeout for multi-GPU test + ) + + if result.returncode != 0: + return CUDAValidationResult( + passed=False, + error=f"CUDA test script failed: {result.stderr.strip()}", + ) + + import json + data = json.loads(result.stdout.strip()) + return CUDAValidationResult( + passed=data.get("passed", False), + gpu_count=data.get("gpu_count", 0), + errors=data.get("errors", []), + ) + + except subprocess.TimeoutExpired: + return CUDAValidationResult( + passed=False, + error="CUDA validation timed out", + ) + except Exception as e: + 
return CUDAValidationResult( + passed=False, + error=str(e), + ) diff --git a/demos/fabric-manager-monitor/checks/fabric_check.py b/demos/fabric-manager-monitor/checks/fabric_check.py new file mode 100644 index 000000000..2fa5fc1b5 --- /dev/null +++ b/demos/fabric-manager-monitor/checks/fabric_check.py @@ -0,0 +1,105 @@ +"""Check 4: NVLink fabric health via DCGM metrics. + +Queries the DCGM exporter's Prometheus endpoint for NVLink bandwidth and +error counters. False-positive mitigation: NVLink bandwidth is normally +zero when no multi-GPU workload is running, so this check alone doesn't +flag unhealthy -- monitor.py correlates with Fabric Manager status. +""" + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional + +import requests + +logger = logging.getLogger(__name__) + + +@dataclass +class NVLinkStatus: + """NVLink fabric health summary for the node.""" + healthy: bool + total_tx_bytes: float = 0.0 + total_rx_bytes: float = 0.0 + crc_error_count: float = 0.0 + bandwidth_zero: bool = True + error: Optional[str] = None + + +class NVLinkFabricChecker: + """Checks NVLink fabric health via DCGM exporter metrics.""" + + # DCGM metric names we care about + _TX_METRIC = "DCGM_FI_PROF_NVLINK_TX_BYTES" + _RX_METRIC = "DCGM_FI_PROF_NVLINK_RX_BYTES" + _BW_METRIC = "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL" + _CRC_METRIC = "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL" + + def __init__(self, dcgm_url: str = "http://localhost:9400"): + self._dcgm_url = dcgm_url.rstrip("/") + + def check(self) -> NVLinkStatus: + """Query DCGM exporter and assess NVLink health.""" + try: + metrics = self._fetch_metrics() + except Exception as e: + return NVLinkStatus( + healthy=True, # can't determine -- assume healthy + error=f"Failed to fetch DCGM metrics: {e}", + ) + + tx = self._sum_metric(metrics, self._TX_METRIC) + rx = self._sum_metric(metrics, self._RX_METRIC) + bw = self._sum_metric(metrics, self._BW_METRIC) + crc = self._sum_metric(metrics, 
self._CRC_METRIC)
+
+        bandwidth_zero = (tx + rx + bw) == 0.0
+        has_errors = crc > 0
+
+        # NVLink bandwidth being zero is normal when idle.
+        # We only flag unhealthy if CRC errors are accumulating.
+        # The correlation with Fabric Manager down is done in monitor.py.
+        healthy = not has_errors
+
+        return NVLinkStatus(
+            healthy=healthy,
+            total_tx_bytes=tx,
+            total_rx_bytes=rx,
+            crc_error_count=crc,
+            bandwidth_zero=bandwidth_zero,
+        )
+
+    def _fetch_metrics(self) -> Dict[str, List[float]]:
+        """Fetch and parse Prometheus text format from DCGM exporter."""
+        resp = requests.get(
+            f"{self._dcgm_url}/metrics",
+            timeout=10,
+        )
+        resp.raise_for_status()
+        return self._parse_prometheus_text(resp.text)
+
+    def _parse_prometheus_text(self, text: str) -> Dict[str, List[float]]:
+        """Parse Prometheus exposition format into {metric_name: [values]}."""
+        metrics: Dict[str, list] = {}
+        for line in text.splitlines():
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            # Lines: metric{labels} value [timestamp]
+            try:
+                # Label values may contain spaces (e.g. modelName="NVIDIA A100-SXM4-40GB"),
+                # so split on the closing brace when labels are present instead of
+                # assuming the value is the second space-separated field.
+                if "}" in line:
+                    name = line.split("{", 1)[0]
+                    value = float(line.rpartition("}")[2].split()[0])
+                else:
+                    name, value_str = line.split()[:2]
+                    value = float(value_str)
+                metrics.setdefault(name, []).append(value)
+            except (ValueError, IndexError):
+                continue
+        return metrics
+
+    def _sum_metric(self, metrics: Dict[str, list], name: str) -> float:
+        """Sum all values for a given metric name across GPUs."""
+        values = metrics.get(name, [])
+        return sum(values)
diff --git a/demos/fabric-manager-monitor/checks/pcie_check.py b/demos/fabric-manager-monitor/checks/pcie_check.py
new file mode 100644
index 000000000..ce0e3fcf0
--- /dev/null
+++ b/demos/fabric-manager-monitor/checks/pcie_check.py
@@ -0,0 +1,91 @@
+"""Check 3: PCIe link health — detects link downtraining.
+
+Compares current PCIe link generation and width against maximum values.
+On P5.48xlarge with H100 GPUs, expected: Gen5 x16 for all 8 GPUs. +On P4d.24xlarge with A100 GPUs, expected: Gen4 x16 for all 8 GPUs. +A drop (e.g. Gen5->Gen3 or x16->x8) indicates hardware degradation. +""" + +import logging +import subprocess +from dataclasses import dataclass +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class PCIeStatus: + """PCIe link status for a single GPU.""" + gpu_index: int + link_gen_current: int + link_gen_max: int + link_width_current: int + link_width_max: int + degraded: bool + error: Optional[str] = None + + +class PCIeChecker: + """Checks PCIe link width and generation for all GPUs.""" + + def check(self) -> List[PCIeStatus]: + """Query nvidia-smi for PCIe link status on all GPUs via nsenter.""" + try: + result = subprocess.run( + [ + "nsenter", "-t", "1", "-m", "--", + "nvidia-smi", + "--query-gpu=index,pcie.link.gen.current,pcie.link.gen.max," + "pcie.link.width.current,pcie.link.width.max", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + logger.error("nvidia-smi PCIe query failed: %s", result.stderr.strip()) + return [] + + return self._parse_output(result.stdout) + + except subprocess.TimeoutExpired: + logger.error("nvidia-smi PCIe query timed out") + return [] + except FileNotFoundError: + logger.error("nvidia-smi not found") + return [] + except Exception as e: + logger.error("PCIe check failed: %s", e) + return [] + + def _parse_output(self, output: str) -> List[PCIeStatus]: + """Parse nvidia-smi CSV output into PCIeStatus objects.""" + results = [] + for line in output.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) != 5: + continue + try: + idx = int(parts[0]) + gen_cur = int(parts[1]) + gen_max = int(parts[2]) + width_cur = int(parts[3]) + width_max = int(parts[4]) + + degraded = (gen_cur < gen_max) or (width_cur < width_max) + + results.append(PCIeStatus( + 
gpu_index=idx, + link_gen_current=gen_cur, + link_gen_max=gen_max, + link_width_current=width_cur, + link_width_max=width_max, + degraded=degraded, + )) + except (ValueError, IndexError) as e: + logger.warning("Failed to parse PCIe line '%s': %s", line, e) + + return results diff --git a/demos/fabric-manager-monitor/checks/service_check.py b/demos/fabric-manager-monitor/checks/service_check.py new file mode 100644 index 000000000..b6179afda --- /dev/null +++ b/demos/fabric-manager-monitor/checks/service_check.py @@ -0,0 +1,229 @@ +"""Checks 1 & 2: Systemd service health for Fabric Manager and GPU services. + +Uses nsenter to inspect host systemd services from within a container. +Includes flap detection (rapid restart cycling) and journal error parsing. +""" + +import logging +import subprocess +import time +from collections import deque +from dataclasses import dataclass, field +from enum import Enum +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class ErrorCategory(Enum): + NVSWITCH_ERROR = "nvswitch_error" + INITIALIZATION_FAILED = "initialization_failed" + TIMEOUT = "timeout" + GENERAL_ERROR = "general_error" + + +# Journal patterns that indicate specific failure modes +_ERROR_PATTERNS = { + ErrorCategory.NVSWITCH_ERROR: [ + "nvswitch", + "NVSwitch", + "fabric error", + ], + ErrorCategory.INITIALIZATION_FAILED: [ + "initialization failed", + "failed to initialize", + "Init Failed", + "unable to start", + ], + ErrorCategory.TIMEOUT: [ + "timed out", + "timeout", + "deadline exceeded", + ], +} + + +@dataclass +class ServiceStatus: + """Result of a single systemd service check.""" + name: str + active: bool # True if ActiveState == "active" + sub_state: str = "" # e.g. 
"running", "dead", "failed" + main_pid: int = 0 + n_restarts: int = 0 + start_timestamp: str = "" + error: Optional[str] = None # non-None if the check itself failed + + +@dataclass +class FabricManagerStatus(ServiceStatus): + """Extended status for Fabric Manager with journal analysis.""" + journal_errors: List[ErrorCategory] = field(default_factory=list) + flapping: bool = False + + +class ServiceChecker: + """Checks host systemd services via nsenter.""" + + def __init__(self, flap_window: int = 600, flap_threshold: int = 3): + self._flap_window = flap_window + self._flap_threshold = flap_threshold + # Track restart timestamps per service for flap detection + self._restart_history: Dict[str, deque] = {} + # Track last-seen restart count to detect new restarts + self._last_restart_count: Dict[str, int] = {} + + def _run_host_cmd(self, cmd: List[str], timeout: int = 10) -> subprocess.CompletedProcess: + """Run a command on the host via nsenter into PID 1's mount namespace.""" + full_cmd = ["nsenter", "-t", "1", "-m", "--"] + cmd + return subprocess.run( + full_cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + + def check_service(self, service_name: str) -> ServiceStatus: + """Check a single systemd service via nsenter. + + Parses systemctl show output for ActiveState, SubState, MainPID, + and ExecMainStartTimestamp. NRestarts is queried separately since + older systemd versions don't support it. 
+ """ + try: + result = self._run_host_cmd([ + "systemctl", "show", service_name, + "--property=ActiveState,SubState,MainPID,ExecMainStartTimestamp", + ]) + + if result.returncode != 0 and not result.stdout.strip(): + return ServiceStatus( + name=service_name, + active=False, + error=f"systemctl show failed: {result.stderr.strip()}", + ) + + props = {} + for line in result.stdout.strip().splitlines(): + if "=" in line: + key, _, value = line.partition("=") + props[key.strip()] = value.strip() + + active_state = props.get("ActiveState", "unknown") + + # NRestarts isn't available on older systemd; query separately + n_restarts = self._get_restart_count(service_name) + + # Flap detection + self._update_flap_tracking(service_name, n_restarts) + + return ServiceStatus( + name=service_name, + active=(active_state == "active"), + sub_state=props.get("SubState", ""), + main_pid=int(props.get("MainPID", "0")), + n_restarts=n_restarts, + start_timestamp=props.get("ExecMainStartTimestamp", ""), + ) + + except subprocess.TimeoutExpired: + return ServiceStatus( + name=service_name, + active=False, + error="systemctl show timed out", + ) + except Exception as e: + return ServiceStatus( + name=service_name, + active=False, + error=str(e), + ) + + def _get_restart_count(self, service_name: str) -> int: + """Get NRestarts from systemd, returning 0 if unsupported.""" + try: + result = self._run_host_cmd([ + "systemctl", "show", service_name, "--property=NRestarts", + ]) + if result.returncode == 0 and result.stdout.strip(): + _, _, val = result.stdout.strip().partition("=") + return int(val) + except Exception: + pass + return 0 + + def _update_flap_tracking(self, service_name: str, current_restarts: int) -> None: + """Track restart events for flap detection.""" + if service_name not in self._restart_history: + self._restart_history[service_name] = deque() + self._last_restart_count[service_name] = current_restarts + return + + last_count = self._last_restart_count[service_name] + 
if current_restarts > last_count: + # New restarts detected — record timestamp for each + now = time.monotonic() + for _ in range(current_restarts - last_count): + self._restart_history[service_name].append(now) + self._last_restart_count[service_name] = current_restarts + + # Prune entries outside the flap window + cutoff = time.monotonic() - self._flap_window + history = self._restart_history[service_name] + while history and history[0] < cutoff: + history.popleft() + + def is_flapping(self, service_name: str) -> bool: + """Return True if the service has restarted too many times within the window.""" + history = self._restart_history.get(service_name, deque()) + return len(history) >= self._flap_threshold + + def check_fabric_manager(self) -> FabricManagerStatus: + """Check Fabric Manager with journal error analysis.""" + base = self.check_service("nvidia-fabricmanager") + + journal_errors = self._parse_journal_errors("nvidia-fabricmanager") + flapping = self.is_flapping("nvidia-fabricmanager") + + return FabricManagerStatus( + name=base.name, + active=base.active, + sub_state=base.sub_state, + main_pid=base.main_pid, + n_restarts=base.n_restarts, + start_timestamp=base.start_timestamp, + error=base.error, + journal_errors=journal_errors, + flapping=flapping, + ) + + def _parse_journal_errors(self, service_name: str) -> List[ErrorCategory]: + """Scan recent journal entries for known error patterns.""" + try: + result = self._run_host_cmd([ + "journalctl", "-u", service_name, + "--since", "5 minutes ago", + "--no-pager", "-q", + ], timeout=15) + + if result.returncode != 0 or not result.stdout.strip(): + return [] + + found: List[ErrorCategory] = [] + text = result.stdout.lower() + for category, patterns in _ERROR_PATTERNS.items(): + if any(p.lower() in text for p in patterns): + found.append(category) + + return found + + except (subprocess.TimeoutExpired, Exception) as e: + logger.warning("Journal parsing failed for %s: %s", service_name, e) + return [] + + def 
check_all_gpu_services(self, service_names: List[str]) -> Dict[str, ServiceStatus]:
+        """Check all configured GPU services."""
+        results = {}
+        for name in service_names:
+            results[name] = self.check_service(name)
+        return results
diff --git a/demos/fabric-manager-monitor/config.py b/demos/fabric-manager-monitor/config.py
new file mode 100644
index 000000000..9f8b290b0
--- /dev/null
+++ b/demos/fabric-manager-monitor/config.py
@@ -0,0 +1,79 @@
+"""Configuration for GPU Node Health Validator.
+
+All settings are driven by environment variables with sensible defaults.
+Designed to be configured via Kubernetes ConfigMap.
+"""
+
+import os
+from dataclasses import dataclass, field
+
+
+@dataclass
+class MonitorConfig:
+    """Monitor configuration loaded from environment variables."""
+
+    # Core settings
+    check_interval: int = 30  # seconds between check cycles
+    metrics_port: int = 9101  # Prometheus metrics port (avoids NVSentinel 2112)
+    log_level: str = "INFO"
+    node_name: str = ""  # populated from HOSTNAME or NODE_NAME
+
+    # Boot grace period - don't flag unhealthy during startup
+    boot_grace_period: int = 300  # seconds
+
+    # Flap detection
+    flap_window: int = 600  # seconds window for counting restarts
+    flap_threshold: int = 3  # restarts within window to flag flapping
+
+    # Check toggles
+    enable_fabric_check: bool = True
+    enable_pcie_check: bool = True
+    enable_clock_check: bool = True
+    enable_nvlink_check: bool = True
+    enable_cuda_validation: bool = False  # off by default (resource intensive)
+
+    # CUDA validation runs at a slower cadence
+    cuda_validation_interval: int = 600  # seconds
+
+    # DCGM exporter endpoint for NVLink metrics
+    dcgm_exporter_url: str = "http://localhost:9400"
+
+    # Clock throttle threshold (ratio of current/max)
+    clock_throttle_ratio: float = 0.85
+
+    # Systemd services to monitor; nvidia-fabricmanager is listed here for its
+    # generic status and also gets a dedicated check with journal analysis
+    gpu_services: list = field(default_factory=lambda: [
+        "nvidia-fabricmanager",
+        "nvidia-persistenced",
+        "nv-hostengine",
+    ])
+
@classmethod + def from_env(cls) -> "MonitorConfig": + """Load configuration from environment variables.""" + def _bool(val: str) -> bool: + return val.lower() in ("true", "1", "yes") + + config = cls( + check_interval=int(os.environ.get("CHECK_INTERVAL", "30")), + metrics_port=int(os.environ.get("METRICS_PORT", "9101")), + log_level=os.environ.get("LOG_LEVEL", "INFO"), + node_name=os.environ.get("NODE_NAME", os.environ.get("HOSTNAME", "")), + boot_grace_period=int(os.environ.get("BOOT_GRACE_PERIOD", "300")), + flap_window=int(os.environ.get("FLAP_WINDOW", "600")), + flap_threshold=int(os.environ.get("FLAP_THRESHOLD", "3")), + enable_fabric_check=_bool(os.environ.get("ENABLE_FABRIC_CHECK", "true")), + enable_pcie_check=_bool(os.environ.get("ENABLE_PCIE_CHECK", "true")), + enable_clock_check=_bool(os.environ.get("ENABLE_CLOCK_CHECK", "true")), + enable_nvlink_check=_bool(os.environ.get("ENABLE_NVLINK_CHECK", "true")), + enable_cuda_validation=_bool(os.environ.get("ENABLE_CUDA_VALIDATION", "false")), + cuda_validation_interval=int(os.environ.get("CUDA_VALIDATION_INTERVAL", "600")), + dcgm_exporter_url=os.environ.get("DCGM_EXPORTER_URL", "http://localhost:9400"), + clock_throttle_ratio=float(os.environ.get("CLOCK_THROTTLE_RATIO", "0.85")), + ) + + services_env = os.environ.get("GPU_SERVICES") + if services_env: + config.gpu_services = [s.strip() for s in services_env.split(",") if s.strip()] + + return config diff --git a/demos/fabric-manager-monitor/k8s/configmap.yaml b/demos/fabric-manager-monitor/k8s/configmap.yaml new file mode 100644 index 000000000..ceb8685af --- /dev/null +++ b/demos/fabric-manager-monitor/k8s/configmap.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: fabric-manager-monitor + namespace: nvsentinel + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics +data: + CHECK_INTERVAL: "30" + METRICS_PORT: "9101" + BOOT_GRACE_PERIOD: "300" + 
FLAP_WINDOW: "600"
+  FLAP_THRESHOLD: "3"
+  ENABLE_FABRIC_CHECK: "true"
+  ENABLE_PCIE_CHECK: "true"
+  ENABLE_CLOCK_CHECK: "true"
+  ENABLE_NVLINK_CHECK: "true"
+  ENABLE_CUDA_VALIDATION: "false"
+  CUDA_VALIDATION_INTERVAL: "600"
+  DCGM_EXPORTER_URL: "http://localhost:9400"
+  CLOCK_THROTTLE_RATIO: "0.85"
+  LOG_LEVEL: "INFO"
diff --git a/demos/fabric-manager-monitor/k8s/daemonset.yaml b/demos/fabric-manager-monitor/k8s/daemonset.yaml
new file mode 100644
index 000000000..3621beb0c
--- /dev/null
+++ b/demos/fabric-manager-monitor/k8s/daemonset.yaml
@@ -0,0 +1,90 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: fabric-manager-monitor
+  namespace: nvsentinel
+  labels:
+    app.kubernetes.io/name: fabric-manager-monitor
+    app.kubernetes.io/instance: nvsentinel
+    app.kubernetes.io/component: metrics
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: fabric-manager-monitor
+      app.kubernetes.io/instance: nvsentinel
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: fabric-manager-monitor
+        app.kubernetes.io/instance: nvsentinel
+        app.kubernetes.io/component: metrics
+    spec:
+      serviceAccountName: fabric-manager-monitor
+      hostPID: true  # Required for nsenter to check host systemd services and nvidia-smi
+      nodeSelector:
+        # Matches any node with NVIDIA GPUs (label set by Node Feature Discovery)
+        feature.node.kubernetes.io/pci-10de.present: "true"
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      containers:
+        - name: monitor
+          image: fabric-manager-monitor:0.1.0  # pin to specific version
+          envFrom:
+            - configMapRef:
+                name: fabric-manager-monitor
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          ports:
+            - name: metrics
+              containerPort: 9101
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /
+              port: metrics
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 100m
+              memory: 128Mi
+          securityContext:
+            privileged: true  # Required for
nsenter into host mount namespace + volumeMounts: + - name: host-dbus + mountPath: /var/run/dbus + readOnly: true + volumes: + - name: host-dbus + hostPath: + path: /var/run/dbus + type: Directory + +--- +apiVersion: v1 +kind: Service +metadata: + name: fabric-manager-monitor + namespace: nvsentinel + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics +spec: + selector: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + ports: + - name: metrics + port: 9101 + targetPort: 9101 + protocol: TCP + clusterIP: None # Headless — each pod scraped individually diff --git a/demos/fabric-manager-monitor/k8s/rbac.yaml b/demos/fabric-manager-monitor/k8s/rbac.yaml new file mode 100644 index 000000000..1afebcfee --- /dev/null +++ b/demos/fabric-manager-monitor/k8s/rbac.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fabric-manager-monitor + namespace: nvsentinel + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fabric-manager-monitor + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fabric-manager-monitor + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fabric-manager-monitor +subjects: + - kind: ServiceAccount + name: fabric-manager-monitor + namespace: nvsentinel diff --git a/demos/fabric-manager-monitor/k8s/servicemonitor.yaml 
b/demos/fabric-manager-monitor/k8s/servicemonitor.yaml new file mode 100644 index 000000000..c16de7704 --- /dev/null +++ b/demos/fabric-manager-monitor/k8s/servicemonitor.yaml @@ -0,0 +1,134 @@ +# ServiceMonitor for Prometheus Operator auto-discovery +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: fabric-manager-monitor + namespace: nvsentinel + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics + release: prometheus # Required for Prometheus Operator discovery +spec: + selector: + matchLabels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + app.kubernetes.io/component: metrics + namespaceSelector: + matchNames: + - nvsentinel + endpoints: + - port: metrics + interval: 30s + scrapeTimeout: 10s + path: /metrics + scheme: http + +--- +# PrometheusRule with alert definitions +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: fabric-manager-monitor + namespace: nvsentinel + labels: + app.kubernetes.io/name: fabric-manager-monitor + app.kubernetes.io/instance: nvsentinel + release: prometheus +spec: + groups: + - name: gpu-node-health + rules: + # Critical: Fabric Manager down for 5 minutes + - alert: FabricManagerDown + expr: fabric_manager_up == 0 + for: 5m + labels: + severity: critical + component: fabric-manager-monitor + annotations: + summary: "Fabric Manager down on {{ $labels.node }}" + description: >- + Fabric Manager has been down for more than 5 minutes on node + {{ $labels.node }}. NVLink/NVSwitch fabric is non-functional. + Multi-GPU workloads will fail silently or hang. 
+ runbook: "ssh to node and run: sudo systemctl restart nvidia-fabricmanager" + + # Warning: Fabric Manager flapping (restarting repeatedly) + - alert: FabricManagerFlapping + expr: increase(fabric_manager_restarts_total[10m]) > 3 + for: 5m + labels: + severity: warning + component: fabric-manager-monitor + annotations: + summary: "Fabric Manager flapping on {{ $labels.node }}" + description: >- + Fabric Manager is restarting repeatedly on {{ $labels.node }}. + Check journalctl -u nvidia-fabricmanager for root cause. + + # Critical: NVLink degraded AND Fabric Manager down (correlated) + - alert: NVLinkFabricDegraded + expr: nvlink_fabric_healthy == 0 and on(node) fabric_manager_up == 0 + for: 5m + labels: + severity: critical + component: fabric-manager-monitor + annotations: + summary: "NVLink fabric degraded on {{ $labels.node }}" + description: >- + NVLink fabric is degraded AND Fabric Manager is down on + {{ $labels.node }}. All multi-GPU communication is broken. + + # Warning: PCIe link downtraining + - alert: GPUPCIeLinkDegraded + expr: pcie_link_degraded == 1 + for: 5m + labels: + severity: warning + component: fabric-manager-monitor + annotations: + summary: "PCIe link degraded on {{ $labels.node }} GPU {{ $labels.gpu }}" + description: >- + GPU {{ $labels.gpu }} on {{ $labels.node }} has a degraded PCIe link. + Current link may be running at reduced width or generation. + + # Warning: GPU clock throttled for 10 minutes + - alert: GPUClockThrottled + expr: gpu_clock_throttled == 1 + for: 10m + labels: + severity: warning + component: fabric-manager-monitor + annotations: + summary: "GPU {{ $labels.gpu }} throttled on {{ $labels.node }}" + description: >- + GPU {{ $labels.gpu }} on {{ $labels.node }} has been clock-throttled + for more than 10 minutes. Check thermal and power conditions. 
+ + # Critical: GPU systemd service down + - alert: GPUServiceDown + expr: nvidia_service_up == 0 + for: 3m + labels: + severity: critical + component: fabric-manager-monitor + annotations: + summary: "{{ $labels.service_name }} down on {{ $labels.node }}" + description: >- + NVIDIA service {{ $labels.service_name }} has been down for more + than 3 minutes on {{ $labels.node }}. + + # Critical: CUDA validation failed + - alert: CUDAValidationFailed + expr: cuda_validation_passed == 0 + for: 5m + labels: + severity: critical + component: fabric-manager-monitor + annotations: + summary: "CUDA validation failed on {{ $labels.node }}" + description: >- + CUDA context creation or memory test failed on {{ $labels.node }}. + One or more GPUs may be in a bad state requiring driver reload or reboot. diff --git a/demos/fabric-manager-monitor/metrics.py b/demos/fabric-manager-monitor/metrics.py new file mode 100644 index 000000000..5c7e863dc --- /dev/null +++ b/demos/fabric-manager-monitor/metrics.py @@ -0,0 +1,95 @@ +"""Prometheus metric definitions for GPU Node Health Validator. + +All metrics are defined in one place for consistency. +Port 9101 to avoid conflict with NVSentinel's 2112. 
+""" + +from prometheus_client import Gauge, Counter, Histogram, Info + + +# --- Overall node health --- +gpu_node_health_up = Gauge( + "gpu_node_health_up", + "Overall GPU node health (1=healthy, 0=unhealthy)", + ["node"], +) + +# --- Fabric Manager --- +fabric_manager_up = Gauge( + "fabric_manager_up", + "Fabric Manager service status (1=running, 0=down)", + ["node"], +) +fabric_manager_restarts_total = Counter( + "fabric_manager_restarts_total", + "Total Fabric Manager restart count observed", + ["node"], +) +fabric_manager_last_healthy_seconds = Gauge( + "fabric_manager_last_healthy_seconds", + "Unix timestamp of last healthy Fabric Manager observation", + ["node"], +) + +# --- GPU systemd services --- +nvidia_service_up = Gauge( + "nvidia_service_up", + "NVIDIA systemd service status (1=running, 0=down)", + ["node", "service_name"], +) + +# --- PCIe link health --- +pcie_link_width = Gauge( + "pcie_link_width", + "Current PCIe link width", + ["node", "gpu"], +) +pcie_link_gen = Gauge( + "pcie_link_gen", + "Current PCIe link generation", + ["node", "gpu"], +) +pcie_link_degraded = Gauge( + "pcie_link_degraded", + "PCIe link degraded (1=degraded, 0=normal)", + ["node", "gpu"], +) + +# --- NVLink fabric --- +nvlink_fabric_healthy = Gauge( + "nvlink_fabric_healthy", + "NVLink fabric health (1=healthy, 0=degraded)", + ["node"], +) + +# --- CUDA validation --- +cuda_validation_passed = Gauge( + "cuda_validation_passed", + "CUDA validation result (1=passed, 0=failed)", + ["node"], +) + +# --- Clock throttling --- +gpu_clock_throttled = Gauge( + "gpu_clock_throttled", + "GPU clock throttled (1=throttled, 0=normal)", + ["node", "gpu"], +) +gpu_clock_ratio = Gauge( + "gpu_clock_ratio", + "GPU clock ratio (current/max, 1.0=full speed)", + ["node", "gpu"], +) + +# --- Check infrastructure --- +health_check_duration_seconds = Histogram( + "health_check_duration_seconds", + "Duration of health check execution", + ["check_name"], + buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 
10.0, 30.0), +) +health_check_errors_total = Counter( + "health_check_errors_total", + "Total errors encountered during health checks", + ["check_name"], +) diff --git a/demos/fabric-manager-monitor/monitor.py b/demos/fabric-manager-monitor/monitor.py new file mode 100644 index 000000000..858808b03 --- /dev/null +++ b/demos/fabric-manager-monitor/monitor.py @@ -0,0 +1,244 @@ +"""GPU Node Health Validator — main entry point. + +Runs a periodic check loop across all enabled health checks, exposes +Prometheus metrics on the configured port, and computes overall node health. +""" + +import logging +import signal +import sys +import time +from threading import Event + +from prometheus_client import start_http_server + +from config import MonitorConfig +from metrics import ( + gpu_node_health_up, + fabric_manager_up, + fabric_manager_restarts_total, + fabric_manager_last_healthy_seconds, + nvidia_service_up, + pcie_link_width, + pcie_link_gen, + pcie_link_degraded, + nvlink_fabric_healthy, + cuda_validation_passed, + gpu_clock_throttled, + gpu_clock_ratio, + health_check_duration_seconds, + health_check_errors_total, +) +from checks.service_check import ServiceChecker +from checks.pcie_check import PCIeChecker +from checks.clock_check import ClockChecker +from checks.fabric_check import NVLinkFabricChecker +from checks.cuda_validation import CUDAValidator + +logger = logging.getLogger(__name__) + + +class FabricManagerMonitor: + """Orchestrates all health checks and exposes Prometheus metrics.""" + + def __init__(self, config: MonitorConfig): + self.config = config + self._shutdown = Event() + self._start_time = time.monotonic() + self._last_cuda_check = 0.0 + + # Initialize checkers + self._service_checker = ServiceChecker( + flap_window=config.flap_window, + flap_threshold=config.flap_threshold, + ) + self._pcie_checker = PCIeChecker() + self._clock_checker = ClockChecker(throttle_ratio=config.clock_throttle_ratio) + self._nvlink_checker = 
NVLinkFabricChecker(dcgm_url=config.dcgm_exporter_url) + self._cuda_validator = CUDAValidator() + + # Track state for cross-check correlation + self._fabric_manager_down = False + self._nvlink_bandwidth_zero = False + + def run(self): + """Start metrics server and enter the check loop.""" + logging.basicConfig( + level=getattr(logging, self.config.log_level, logging.INFO), + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + + logger.info("Starting GPU Node Health Validator on node=%s port=%d interval=%ds", + self.config.node_name, self.config.metrics_port, self.config.check_interval) + + # Register signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, self._handle_signal) + signal.signal(signal.SIGINT, self._handle_signal) + + # Start Prometheus HTTP server + start_http_server(self.config.metrics_port) + logger.info("Prometheus metrics server started on :%d", self.config.metrics_port) + + while not self._shutdown.is_set(): + try: + self.run_check_cycle() + except Exception: + logger.exception("Unexpected error in check cycle") + self._shutdown.wait(timeout=self.config.check_interval) + + logger.info("Shutting down GPU Node Health Validator") + + def _handle_signal(self, signum, frame): + logger.info("Received signal %d, initiating shutdown", signum) + self._shutdown.set() + + def _in_grace_period(self) -> bool: + return (time.monotonic() - self._start_time) < self.config.boot_grace_period + + def run_check_cycle(self): + """Execute all enabled checks and update metrics.""" + node = self.config.node_name + overall_healthy = True + + # --- Check 1 & 2: Services --- + if self.config.enable_fabric_check: + with health_check_duration_seconds.labels("services").time(): + try: + fm_status = self._service_checker.check_fabric_manager() + self._fabric_manager_down = not fm_status.active + + fabric_manager_up.labels(node).set(1 if fm_status.active else 0) + if fm_status.active: + fabric_manager_last_healthy_seconds.labels(node).set(time.time()) 
+ + # Counter only goes up: increment by the delta of newly observed restarts + delta = fm_status.n_restarts - getattr(self, "_last_fm_restarts", 0) + if delta > 0: + fabric_manager_restarts_total.labels(node).inc(delta) + self._last_fm_restarts = fm_status.n_restarts + + if fm_status.flapping: + logger.warning("Fabric Manager is flapping on %s", node) + + if fm_status.journal_errors: + logger.warning("Fabric Manager journal errors on %s: %s", + node, [e.value for e in fm_status.journal_errors]) + + if not fm_status.active and not self._in_grace_period(): + logger.error("Fabric Manager DOWN on %s (sub_state=%s)", + node, fm_status.sub_state) + overall_healthy = False + + # All GPU services + svc_results = self._service_checker.check_all_gpu_services( + self.config.gpu_services + ) + for svc_name, status in svc_results.items(): + nvidia_service_up.labels(node, svc_name).set(1 if status.active else 0) + if not status.active and not self._in_grace_period(): + logger.error("Service %s DOWN on %s", svc_name, node) + overall_healthy = False + + except Exception: + logger.exception("Service check failed") + health_check_errors_total.labels("services").inc() + + # --- Check 3: PCIe --- + if self.config.enable_pcie_check: + with health_check_duration_seconds.labels("pcie").time(): + try: + pcie_results = self._pcie_checker.check() + for pcie in pcie_results: + gpu = str(pcie.gpu_index) + pcie_link_width.labels(node, gpu).set(pcie.link_width_current) + pcie_link_gen.labels(node, gpu).set(pcie.link_gen_current) + pcie_link_degraded.labels(node, gpu).set(1 if pcie.degraded else 0) + if pcie.degraded: + logger.warning( + "PCIe degraded on %s GPU %s: Gen%d x%d (max Gen%d x%d)", + node, gpu, pcie.link_gen_current, pcie.link_width_current, + pcie.link_gen_max, pcie.link_width_max, + ) + overall_healthy = False + except Exception: + logger.exception("PCIe check failed") + health_check_errors_total.labels("pcie").inc() + + # --- Check 6: Clocks --- + if self.config.enable_clock_check: + with 
health_check_duration_seconds.labels("clocks").time(): + try: + clock_results = self._clock_checker.check() + for clk in clock_results: + gpu = str(clk.gpu_index) + gpu_clock_throttled.labels(node, gpu).set(1 if clk.throttled else 0) + gpu_clock_ratio.labels(node, gpu).set(clk.clock_ratio) + if clk.throttled: + logger.warning( + "GPU %s throttled on %s: %d/%d MHz (ratio=%.2f, reasons=%s)", + gpu, node, clk.graphics_clock_current, + clk.graphics_clock_max, clk.clock_ratio, clk.throttle_reasons, + ) + except Exception: + logger.exception("Clock check failed") + health_check_errors_total.labels("clocks").inc() + + # --- Check 4: NVLink --- + if self.config.enable_nvlink_check: + with health_check_duration_seconds.labels("nvlink").time(): + try: + nvlink_status = self._nvlink_checker.check() + self._nvlink_bandwidth_zero = nvlink_status.bandwidth_zero + + # False-positive mitigation: only flag unhealthy when + # NVLink has CRC errors OR bandwidth is zero AND FM is down + fabric_nvlink_degraded = ( + not nvlink_status.healthy + or (nvlink_status.bandwidth_zero and self._fabric_manager_down) + ) + nvlink_fabric_healthy.labels(node).set(0 if fabric_nvlink_degraded else 1) + + if fabric_nvlink_degraded and not self._in_grace_period(): + logger.error( + "NVLink fabric degraded on %s (crc_errors=%.0f, bw_zero=%s, fm_down=%s)", + node, nvlink_status.crc_error_count, + nvlink_status.bandwidth_zero, self._fabric_manager_down, + ) + overall_healthy = False + + except Exception: + logger.exception("NVLink check failed") + health_check_errors_total.labels("nvlink").inc() + + # --- Check 5: CUDA validation (slower cadence) --- + if self.config.enable_cuda_validation: + now = time.monotonic() + if (now - self._last_cuda_check) >= self.config.cuda_validation_interval: + self._last_cuda_check = now + with health_check_duration_seconds.labels("cuda").time(): + try: + cuda_result = self._cuda_validator.check() + cuda_validation_passed.labels(node).set(1 if cuda_result.passed else 0) + 
if not cuda_result.passed: + logger.error("CUDA validation FAILED on %s: %s", + node, cuda_result.errors or cuda_result.error) + overall_healthy = False + except Exception: + logger.exception("CUDA validation failed") + health_check_errors_total.labels("cuda").inc() + + # --- Overall health --- + if self._in_grace_period(): + gpu_node_health_up.labels(node).set(1) + logger.debug("In boot grace period, reporting healthy") + else: + gpu_node_health_up.labels(node).set(1 if overall_healthy else 0) + + +def main(): + config = MonitorConfig.from_env() + monitor = FabricManagerMonitor(config) + monitor.run() + + +if __name__ == "__main__": + main() diff --git a/demos/fabric-manager-monitor/requirements.txt b/demos/fabric-manager-monitor/requirements.txt new file mode 100644 index 000000000..1dcd34366 --- /dev/null +++ b/demos/fabric-manager-monitor/requirements.txt @@ -0,0 +1,3 @@ +prometheus_client>=0.20.0 +requests>=2.32.2 +pyyaml>=6.0 diff --git a/demos/fabric-manager-monitor/tests/__init__.py b/demos/fabric-manager-monitor/tests/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/demos/fabric-manager-monitor/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
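The flap-detection behavior configured by `FLAP_WINDOW` / `FLAP_THRESHOLD` above, and exercised by the `test_flap_detection_*` cases in the tests that follow, amounts to a sliding-window count of observed service restarts. The following is a minimal standalone sketch of that idea; `FlapTracker` is a hypothetical name for illustration, not the `ServiceChecker` implementation in this PR:

```python
import time
from collections import deque


class FlapTracker:
    """Sliding-window flap detector (illustrative sketch, not the PR's code).

    A service is considered flapping when its systemd NRestarts counter
    grows by `threshold` or more restarts within `window` seconds.
    """

    def __init__(self, window: float = 600.0, threshold: int = 3):
        self.window = window
        self.threshold = threshold
        self._events = deque()  # monotonic timestamps of observed restarts
        self._last_count = 0    # last NRestarts value seen

    def update(self, restart_count: int, now=None) -> None:
        """Feed the current NRestarts value; `now` is injectable for tests."""
        now = time.monotonic() if now is None else now
        # Record one event per newly observed restart (delta since last poll).
        for _ in range(max(0, restart_count - self._last_count)):
            self._events.append(now)
        self._last_count = restart_count
        # Drop events that have aged out of the window.
        while self._events and now - self._events[0] > self.window:
            self._events.popleft()

    def is_flapping(self) -> bool:
        return len(self._events) >= self.threshold
```

With `window=600, threshold=3`, three restarts observed inside ten minutes trip the detector; once those events age past the window, a later `update()` with an unchanged count prunes them and the detector clears, which mirrors the `test_flap_detection_window_expiry` behavior below.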
diff --git a/demos/fabric-manager-monitor/tests/test_monitor.py b/demos/fabric-manager-monitor/tests/test_monitor.py new file mode 100644 index 000000000..ec4e1f477 --- /dev/null +++ b/demos/fabric-manager-monitor/tests/test_monitor.py @@ -0,0 +1,153 @@ +"""Tests for the main monitor module and config.""" + +import os +from unittest.mock import patch, MagicMock + +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from config import MonitorConfig +from monitor import FabricManagerMonitor + + +class TestMonitorConfig: + """Tests for configuration loading.""" + + def test_defaults(self): + config = MonitorConfig() + assert config.check_interval == 30 + assert config.metrics_port == 9101 + assert config.boot_grace_period == 300 + assert config.enable_fabric_check is True + assert config.enable_cuda_validation is False + assert config.clock_throttle_ratio == 0.85 + assert "nvidia-fabricmanager" in config.gpu_services + + def test_from_env(self): + env = { + "CHECK_INTERVAL": "60", + "METRICS_PORT": "9200", + "LOG_LEVEL": "DEBUG", + "NODE_NAME": "test-node", + "BOOT_GRACE_PERIOD": "120", + "FLAP_WINDOW": "300", + "FLAP_THRESHOLD": "5", + "ENABLE_FABRIC_CHECK": "true", + "ENABLE_PCIE_CHECK": "false", + "ENABLE_CLOCK_CHECK": "true", + "ENABLE_NVLINK_CHECK": "false", + "ENABLE_CUDA_VALIDATION": "true", + "CUDA_VALIDATION_INTERVAL": "1200", + "DCGM_EXPORTER_URL": "http://dcgm:9400", + "CLOCK_THROTTLE_RATIO": "0.90", + } + with patch.dict(os.environ, env, clear=False): + config = MonitorConfig.from_env() + + assert config.check_interval == 60 + assert config.metrics_port == 9200 + assert config.log_level == "DEBUG" + assert config.node_name == "test-node" + assert config.boot_grace_period == 120 + assert config.flap_threshold == 5 + assert config.enable_pcie_check is False + assert config.enable_nvlink_check is False + assert config.enable_cuda_validation is True + assert config.cuda_validation_interval == 1200 + assert 
config.dcgm_exporter_url == "http://dcgm:9400" + assert config.clock_throttle_ratio == 0.90 + + def test_custom_gpu_services(self): + with patch.dict(os.environ, {"GPU_SERVICES": "svc-a, svc-b , svc-c"}, clear=False): + config = MonitorConfig.from_env() + assert config.gpu_services == ["svc-a", "svc-b", "svc-c"] + + def test_bool_parsing(self): + for truthy in ("true", "True", "TRUE", "1", "yes", "Yes"): + with patch.dict(os.environ, {"ENABLE_CUDA_VALIDATION": truthy}, clear=False): + config = MonitorConfig.from_env() + assert config.enable_cuda_validation is True + + for falsy in ("false", "False", "0", "no", ""): + with patch.dict(os.environ, {"ENABLE_CUDA_VALIDATION": falsy}, clear=False): + config = MonitorConfig.from_env() + assert config.enable_cuda_validation is False + + +class TestFabricManagerMonitor: + """Tests for the monitor orchestrator.""" + + def _make_monitor(self, **overrides): + defaults = { + "check_interval": 30, + "metrics_port": 0, # don't bind + "node_name": "test-node", + "boot_grace_period": 0, # no grace period in tests + "enable_fabric_check": True, + "enable_pcie_check": False, + "enable_clock_check": False, + "enable_nvlink_check": False, + "enable_cuda_validation": False, + } + defaults.update(overrides) + config = MonitorConfig(**defaults) + return FabricManagerMonitor(config) + + @patch("monitor.NVLinkFabricChecker") + @patch("monitor.ClockChecker") + @patch("monitor.PCIeChecker") + @patch("monitor.ServiceChecker") + def test_check_cycle_all_healthy(self, MockSvc, MockPCIe, MockClk, MockNVL): + from checks.service_check import FabricManagerStatus, ServiceStatus + + mock_svc = MockSvc.return_value + mock_svc.check_fabric_manager.return_value = FabricManagerStatus( + name="nvidia-fabricmanager", active=True, sub_state="running", + ) + mock_svc.check_all_gpu_services.return_value = { + "nvidia-fabricmanager": ServiceStatus(name="nvidia-fabricmanager", active=True), + "nvidia-persistenced": ServiceStatus(name="nvidia-persistenced", 
active=True), + "nv-hostengine": ServiceStatus(name="nv-hostengine", active=True), + } + + monitor = self._make_monitor() + monitor._service_checker = mock_svc + monitor.run_check_cycle() + + # Verify fabric_manager_up was called (metrics are global singletons) + mock_svc.check_fabric_manager.assert_called_once() + mock_svc.check_all_gpu_services.assert_called_once() + + @patch("monitor.NVLinkFabricChecker") + @patch("monitor.ClockChecker") + @patch("monitor.PCIeChecker") + @patch("monitor.ServiceChecker") + def test_check_cycle_fabric_manager_down(self, MockSvc, MockPCIe, MockClk, MockNVL): + from checks.service_check import FabricManagerStatus, ServiceStatus + + mock_svc = MockSvc.return_value + mock_svc.check_fabric_manager.return_value = FabricManagerStatus( + name="nvidia-fabricmanager", active=False, sub_state="failed", + ) + mock_svc.check_all_gpu_services.return_value = { + "nvidia-fabricmanager": ServiceStatus(name="nvidia-fabricmanager", active=False), + } + + monitor = self._make_monitor() + monitor._service_checker = mock_svc + monitor.run_check_cycle() + + # Monitor should have flagged FM as down + assert monitor._fabric_manager_down is True + + def test_grace_period_suppresses_unhealthy(self): + monitor = self._make_monitor( + boot_grace_period=9999, # always in grace period + enable_fabric_check=False, + ) + assert monitor._in_grace_period() is True + + def test_no_grace_period(self): + monitor = self._make_monitor(boot_grace_period=0) + assert monitor._in_grace_period() is False diff --git a/demos/fabric-manager-monitor/tests/test_pcie_check.py b/demos/fabric-manager-monitor/tests/test_pcie_check.py new file mode 100644 index 000000000..5e64c185b --- /dev/null +++ b/demos/fabric-manager-monitor/tests/test_pcie_check.py @@ -0,0 +1,137 @@ +"""Tests for pcie_check and clock_check modules.""" + +import subprocess +from unittest.mock import patch + +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from 
checks.pcie_check import PCIeChecker, PCIeStatus +from checks.clock_check import ClockChecker, ClockStatus + + +class TestPCIeChecker: + """Tests for PCIeChecker.""" + + # Simulates 8x H100 GPUs on P5.48xlarge, all healthy at Gen5 x16 + _HEALTHY_OUTPUT = "\n".join( + f"{i}, 5, 5, 16, 16" for i in range(8) + ) + + # GPU 3 downtraining: Gen5->Gen3, x16->x8 + _DEGRADED_OUTPUT = "\n".join( + f"{i}, {'3' if i == 3 else '5'}, 5, {'8' if i == 3 else '16'}, 16" + for i in range(8) + ) + + @patch("checks.pcie_check.subprocess.run") + def test_all_healthy(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout=self._HEALTHY_OUTPUT, stderr="", + ) + results = PCIeChecker().check() + + assert len(results) == 8 + for r in results: + assert r.degraded is False + assert r.link_gen_current == 5 + assert r.link_width_current == 16 + + @patch("checks.pcie_check.subprocess.run") + def test_degraded_link(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout=self._DEGRADED_OUTPUT, stderr="", + ) + results = PCIeChecker().check() + + assert len(results) == 8 + degraded = [r for r in results if r.degraded] + assert len(degraded) == 1 + assert degraded[0].gpu_index == 3 + assert degraded[0].link_gen_current == 3 + assert degraded[0].link_width_current == 8 + + @patch("checks.pcie_check.subprocess.run") + def test_nvidia_smi_not_found(self, mock_run): + mock_run.side_effect = FileNotFoundError("nvidia-smi not found") + results = PCIeChecker().check() + assert results == [] + + @patch("checks.pcie_check.subprocess.run") + def test_nvidia_smi_timeout(self, mock_run): + mock_run.side_effect = subprocess.TimeoutExpired(cmd="", timeout=15) + results = PCIeChecker().check() + assert results == [] + + @patch("checks.pcie_check.subprocess.run") + def test_malformed_output(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, + stdout="0, 5, 5, 16, 16\nbad 
line\n2, 5, 5, 16, 16\n", + stderr="", + ) + results = PCIeChecker().check() + assert len(results) == 2 # skips bad line + + +class TestClockChecker: + """Tests for ClockChecker.""" + + _HEALTHY_CLOCKS = "\n".join( + f"{i}, 1980, 1980, 2619, 2619" for i in range(8) + ) + + _THROTTLED_CLOCKS = "\n".join( + f"{i}, {'1200' if i == 0 else '1980'}, 1980, 2619, 2619" + for i in range(8) + ) + + @patch("checks.clock_check.ClockChecker._query_throttle_reasons") + @patch("checks.clock_check.subprocess.run") + def test_no_throttling(self, mock_run, mock_reasons): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout=self._HEALTHY_CLOCKS, stderr="", + ) + mock_reasons.return_value = [] + results = ClockChecker(throttle_ratio=0.85).check() + + assert len(results) == 8 + for r in results: + assert r.throttled is False + assert r.clock_ratio == 1.0 + + @patch("checks.clock_check.ClockChecker._query_throttle_reasons") + @patch("checks.clock_check.subprocess.run") + def test_throttled_gpu(self, mock_run, mock_reasons): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout=self._THROTTLED_CLOCKS, stderr="", + ) + mock_reasons.return_value = [ + {"gpu_index": 0, "reasons": "SW Thermal Slowdown"}, + ] + results = ClockChecker(throttle_ratio=0.85).check() + + throttled = [r for r in results if r.throttled] + assert len(throttled) == 1 + assert throttled[0].gpu_index == 0 + assert throttled[0].clock_ratio < 0.85 + + @patch("checks.clock_check.ClockChecker._query_throttle_reasons") + @patch("checks.clock_check.subprocess.run") + def test_custom_threshold(self, mock_run, mock_reasons): + # GPU 0 at 1800/1980 = ~0.909 — above 0.85 but below 0.95 + output = "0, 1800, 1980, 2619, 2619" + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, stdout=output, stderr="", + ) + mock_reasons.return_value = [] + + # With default 0.85 threshold — not throttled + results = 
ClockChecker(throttle_ratio=0.85).check() + assert results[0].throttled is False + + # With strict 0.95 threshold — throttled + results = ClockChecker(throttle_ratio=0.95).check() + assert results[0].throttled is True diff --git a/demos/fabric-manager-monitor/tests/test_service_check.py b/demos/fabric-manager-monitor/tests/test_service_check.py new file mode 100644 index 000000000..19633a59d --- /dev/null +++ b/demos/fabric-manager-monitor/tests/test_service_check.py @@ -0,0 +1,159 @@ +"""Tests for service_check module.""" + +import subprocess +import time +from collections import deque +from unittest.mock import patch, MagicMock + +import sys +import os + +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from checks.service_check import ( + ServiceChecker, + ServiceStatus, + FabricManagerStatus, + ErrorCategory, +) + + +class TestServiceChecker: + """Tests for ServiceChecker.""" + + def _mock_systemctl_output(self, active="active", sub="running", pid=1234, restarts=0): + """Build a mock systemctl show output string.""" + return ( + f"ActiveState={active}\n" + f"SubState={sub}\n" + f"MainPID={pid}\n" + f"NRestarts={restarts}\n" + f"ExecMainStartTimestamp=Thu 2026-01-15 20:00:00 UTC\n" + ) + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_service_active(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, + stdout=self._mock_systemctl_output(active="active", sub="running"), + stderr="", + ) + checker = ServiceChecker() + status = checker.check_service("nvidia-fabricmanager") + + assert status.active is True + assert status.sub_state == "running" + assert status.main_pid == 1234 + assert status.error is None + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_service_failed(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, + 
stdout=self._mock_systemctl_output(active="failed", sub="failed"), + stderr="", + ) + checker = ServiceChecker() + status = checker.check_service("nvidia-fabricmanager") + + assert status.active is False + assert status.sub_state == "failed" + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_service_timeout(self, mock_run): + mock_run.side_effect = subprocess.TimeoutExpired(cmd="", timeout=10) + + checker = ServiceChecker() + status = checker.check_service("nvidia-fabricmanager") + + assert status.active is False + assert status.error == "systemctl show timed out" + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_service_command_failure(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=4, + stdout="", + stderr="Unit not found", + ) + checker = ServiceChecker() + status = checker.check_service("nonexistent-service") + + assert status.active is False + assert "Unit not found" in status.error + + def test_flap_detection_no_flap(self): + checker = ServiceChecker(flap_window=600, flap_threshold=3) + # Simulate 2 restarts (below threshold) + checker._restart_history["test"] = deque() + checker._last_restart_count["test"] = 0 + + checker._update_flap_tracking("test", 2) + assert not checker.is_flapping("test") + + def test_flap_detection_triggers(self): + checker = ServiceChecker(flap_window=600, flap_threshold=3) + checker._restart_history["test"] = deque() + checker._last_restart_count["test"] = 0 + + # Simulate 3 restarts (at threshold) + checker._update_flap_tracking("test", 3) + assert checker.is_flapping("test") + + def test_flap_detection_window_expiry(self): + checker = ServiceChecker(flap_window=1, flap_threshold=3) + checker._restart_history["test"] = deque() + checker._last_restart_count["test"] = 0 + + # Add restarts + checker._update_flap_tracking("test", 3) + assert checker.is_flapping("test") + + # Wait for window to expire + time.sleep(1.1) + 
checker._update_flap_tracking("test", 3) # same count, triggers prune + assert not checker.is_flapping("test") + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_fabric_manager_with_journal_errors(self, mock_run): + def side_effect(cmd, timeout=10): + if "systemctl" in cmd: + return subprocess.CompletedProcess( + args=[], returncode=0, + stdout=self._mock_systemctl_output(active="failed", sub="failed"), + stderr="", + ) + elif "journalctl" in cmd: + return subprocess.CompletedProcess( + args=[], returncode=0, + stdout="Jan 15 20:00:00 node1 fabricmanager: NVSwitch fatal error detected\n", + stderr="", + ) + return subprocess.CompletedProcess(args=[], returncode=1, stdout="", stderr="") + + mock_run.side_effect = side_effect + checker = ServiceChecker() + status = checker.check_fabric_manager() + + assert isinstance(status, FabricManagerStatus) + assert status.active is False + assert ErrorCategory.NVSWITCH_ERROR in status.journal_errors + + @patch("checks.service_check.ServiceChecker._run_host_cmd") + def test_check_all_gpu_services(self, mock_run): + mock_run.return_value = subprocess.CompletedProcess( + args=[], returncode=0, + stdout=self._mock_systemctl_output(active="active", sub="running"), + stderr="", + ) + checker = ServiceChecker() + results = checker.check_all_gpu_services([ + "nvidia-fabricmanager", + "nvidia-persistenced", + "nv-hostengine", + ]) + + assert len(results) == 3 + for name, status in results.items(): + assert status.active is True diff --git a/health-monitors/fabric-manager-monitor/Dockerfile b/health-monitors/fabric-manager-monitor/Dockerfile new file mode 100644 index 000000000..a939cb9a6 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/Dockerfile @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG VERSION="0.1.0" + +FROM public.ecr.aws/docker/library/python:3.10-bookworm AS build + +ARG VERSION + +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install poetry==1.8.2 + +ENV POETRY_NO_INTERACTION=1 \ + POETRY_VIRTUALENVS_IN_PROJECT=1 \ + POETRY_VIRTUALENVS_CREATE=1 \ + POETRY_CACHE_DIR=/tmp/poetry_cache + +WORKDIR /app + +COPY health-monitors/fabric-manager-monitor/ . +# Set package version from build arg (strip 'v' prefix for PEP 440 compliance) +RUN poetry version $(echo "${VERSION}" | sed 's/^v//') +RUN --mount=type=cache,target=/tmp/poetry_cache \ + poetry build --format wheel +RUN poetry export --format requirements.txt --output constraints.txt --without-hashes + + +FROM public.ecr.aws/docker/library/python:3.10-slim-bookworm AS runtime + +# util-linux provides nsenter for inspecting host systemd services +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + echo 'Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + util-linux + +ENV PYTHONUNBUFFERED=1 + +COPY --from=build /app/dist/*.whl ./ +COPY --from=build /app/constraints.txt ./ +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install ./fabric_manager_monitor*.whl --constraint constraints.txt + +ENTRYPOINT ["fabric_manager_monitor"] diff --git a/health-monitors/fabric-manager-monitor/README.md b/health-monitors/fabric-manager-monitor/README.md new file mode 100644 index 000000000..207c6230b --- /dev/null +++ 
b/health-monitors/fabric-manager-monitor/README.md @@ -0,0 +1,110 @@ +# Fabric Manager Monitor + +Health monitor for detecting Fabric Manager, PCIe, NVLink, and GPU infrastructure failures on NVIDIA GPU nodes. + +## Problem Statement + +The gpu-health-monitor watches DCGM health watches (XID errors, ECC memory, thermals), but several critical infrastructure failure modes exist outside DCGM's visibility: + +| Failure Mode | Detection Method | Impact | +|---|---|---| +| Fabric Manager service down | systemd service check via nsenter | NVLink fabric offline, multi-GPU workloads fail | +| PCIe link downtraining | nvidia-smi PCIe query via nsenter | Reduced GPU-host bandwidth, silent performance degradation | +| GPU clock throttling | nvidia-smi clock query via nsenter | Silent throughput loss from thermal/power throttling | +| NVLink CRC errors | DCGM exporter Prometheus metrics | NVLink fabric degradation | +| CUDA context failure | subprocess CUDA validation | GPU completely unusable | + +## Architecture + +The fabric-manager-monitor follows the same callback pattern as the gpu-health-monitor: + +1. **FabricManagerWatcher** runs a polling loop executing all enabled health checks +2. Each check returns `List[CheckResult]` with check name, health status, error codes, and impacted entities +3. Callbacks (e.g. `PlatformConnectorEventProcessor`) receive aggregated results +4. The event processor converts results to protobuf `HealthEvent` messages and sends them via gRPC to the platform-connector Unix domain socket +5. State caching prevents duplicate events -- only state changes are transmitted + +``` +┌─────────────────────────────────────────────────────────┐ +│ FabricManagerWatcher │ +│ ┌──────────────┐ ┌───────────┐ ┌────────────────────┐ │ +│ │ServiceChecker│ │PCIeChecker│ │NVLinkFabricChecker │ │ +│ └──────┬───────┘ └─────┬─────┘ └──────────┬─────────┘ │ +│ ┌──────┴───────┐ ┌─────┴─────┐ │ +│ │ ClockChecker │ │CUDAValid. 
│ │ +│ └──────┬───────┘ └─────┬─────┘ │ +│ └───────┬───────┘ │ +│ List[CheckResult] │ +└─────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ PlatformConnectorEventProcessor │ +│ • State caching (only sends on change) │ +│ • Retry with exponential backoff │ +│ • gRPC → platform-connector UDS │ +└─────────────────────────────────────────────────────────┘ +``` + +## Check Categories + +| Check | Check Name | Fatal | Entities | Detection | +|---|---|---|---|---| +| Fabric Manager down | `FabricManagerServiceDown` | Yes | NODE | nsenter systemctl show | +| GPU service down | `GpuServiceDown` | No | NODE | nsenter systemctl show | +| PCIe link degraded | `PcieLinkDegraded` | Yes | GPU | nsenter nvidia-smi | +| Clock throttled | `GpuClockThrottled` | No | GPU | nsenter nvidia-smi | +| NVLink degraded | `NvlinkFabricDegraded` | Yes | NODE | DCGM exporter HTTP | +| CUDA validation | `CudaValidationFailed` | Yes | NODE | subprocess torch test | + +## Configuration + +All options are available as CLI flags: + +``` +fabric_manager_monitor \ + --platform-connector-socket /run/nvsentinel/platform-connector.sock \ + --port 9101 \ + --poll-interval 30 \ + --boot-grace-period 300 \ + --flap-window 600 \ + --flap-threshold 3 \ + --enable-fabric-check \ + --enable-pcie-check \ + --enable-clock-check \ + --enable-nvlink-check \ + --disable-cuda-validation \ + --dcgm-exporter-url http://localhost:9400 \ + --clock-throttle-ratio 0.85 \ + --processing-strategy EXECUTE_REMEDIATION +``` + +Environment variables: +- `NODE_NAME` / `HOSTNAME` -- Node name (used as entity in health events) +- `LOG_LEVEL` -- Log level: debug, info, warn, error (default: info) + +## Deployment + +The fabric-manager-monitor runs as a DaemonSet on GPU nodes, alongside the existing gpu-health-monitor. 
It requires: + +- Host PID namespace access (`hostPID: true`) for nsenter into host systemd +- Platform-connector Unix socket mounted as a volume +- DCGM exporter accessible at the configured URL + +## Integration with NVSentinel Remediation + +Health events flow through the standard NVSentinel pipeline: + +1. **fabric-manager-monitor** detects failure, sends `HealthEvent` to platform-connector +2. **platform-connector** writes event to MongoDB and forwards to fault-quarantine +3. **fault-quarantine** cordons/labels the node based on event severity +4. **fault-remediation** executes the recommended action (RESTART_BM for fatal infra failures) +5. **node-drainer** handles workload migration off the unhealthy node + +## False-Positive Mitigations + +- **Boot grace period**: Suppresses alerts during node startup (configurable, default 300s) +- **Flap detection**: Tracks service restart frequency to distinguish transient from persistent failures +- **GPU Idle filter**: Clock throttle check ignores benign idle throttle reasons (bitmask 0x1) +- **NVLink correlation**: Bandwidth-zero is only flagged unhealthy when correlated with Fabric Manager being down +- **State caching**: Only state transitions generate events, preventing duplicate alerts diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/__init__.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/__init__.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/clock_check.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/clock_check.py new file mode 100644 index 000000000..a0d17a330 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/clock_check.py @@ -0,0 +1,219 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Clock and throttle detection. + +Detects silent GPU throttling by comparing current clocks against maximum +and querying active throttle reasons. Catches performance degradation +that doesn't generate XID errors. + +GPU Idle false-positive fix: the throttle reason bitmask 0x0000000000000001 +(GPU Idle) and the string "Not Active" are treated as benign -- low clock +ratio when no workload is running is expected and not flagged as throttling. +""" + +import logging as log +import subprocess +from dataclasses import dataclass +from typing import List, Optional + +from .types import CheckResult + + +@dataclass +class ClockStatus: + """Clock and throttle status for a single GPU.""" + + gpu_index: int + graphics_clock_current: int # MHz + graphics_clock_max: int # MHz + mem_clock_current: int # MHz + mem_clock_max: int # MHz + clock_ratio: float # current/max (graphics) + throttled: bool + throttle_reasons: str = "" + error: Optional[str] = None + + +class ClockChecker: + """Detects GPU clock throttling via nsenter nvidia-smi.""" + + def __init__(self, throttle_ratio: float = 0.85): + self._throttle_ratio = throttle_ratio + + # Throttle reasons that are benign (not actual degradation) + _BENIGN_REASONS = { + "Not Active", + "0x0000000000000000", # No throttle + "0x0000000000000001", # GPU Idle -- normal when no workload running + } + + def check(self) -> List[ClockStatus]: + """Query clocks and throttle reasons for all GPUs.""" + clocks = self._query_clocks() + reasons = self._query_throttle_reasons() + + # Merge throttle reasons into clock results + 
reason_map = {r["gpu_index"]: r["reasons"] for r in reasons} + for status in clocks: + reason_str = reason_map.get(status.gpu_index, "") + status.throttle_reasons = reason_str + + # GPU Idle causes low clock ratio but isn't a real throttle. + # Only flag as throttled for non-benign reasons. + if reason_str in self._BENIGN_REASONS: + status.throttled = False + elif reason_str: + status.throttled = True + + return clocks + + def _query_clocks(self) -> List[ClockStatus]: + """Get current vs max clocks from nvidia-smi.""" + try: + result = subprocess.run( + [ + "nsenter", + "-t", + "1", + "-m", + "--", + "nvidia-smi", + "--query-gpu=index,clocks.current.graphics,clocks.max.graphics," + "clocks.current.memory,clocks.max.memory", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + log.error(f"nvidia-smi clock query failed: {result.stderr.strip()}") + return [] + + return self._parse_clocks(result.stdout) + + except subprocess.TimeoutExpired: + log.error("nvidia-smi clock query timed out") + return [] + except FileNotFoundError: + log.error("nvidia-smi not found") + return [] + except Exception as e: + log.error(f"Clock check failed: {e}") + return [] + + def _parse_clocks(self, output: str) -> List[ClockStatus]: + results = [] + for line in output.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) != 5: + continue + try: + idx = int(parts[0]) + gfx_cur = int(parts[1]) + gfx_max = int(parts[2]) + mem_cur = int(parts[3]) + mem_max = int(parts[4]) + + ratio = gfx_cur / gfx_max if gfx_max > 0 else 0.0 + throttled = ratio < self._throttle_ratio + + results.append( + ClockStatus( + gpu_index=idx, + graphics_clock_current=gfx_cur, + graphics_clock_max=gfx_max, + mem_clock_current=mem_cur, + mem_clock_max=mem_max, + clock_ratio=round(ratio, 3), + throttled=throttled, + ) + ) + except (ValueError, IndexError, ZeroDivisionError) as e: + log.warning(f"Failed to parse clock 
line '{line}': {e}") + + return results + + def _query_throttle_reasons(self) -> List[dict]: + """Get active throttle reasons from nvidia-smi.""" + try: + result = subprocess.run( + [ + "nsenter", + "-t", + "1", + "-m", + "--", + "nvidia-smi", + "--query-gpu=index,clocks_throttle_reasons.active", + "--format=csv,noheader", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + return [] + + reasons = [] + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",", 1)] + if len(parts) == 2: + try: + reasons.append({ + "gpu_index": int(parts[0]), + "reasons": parts[1], + }) + except ValueError: + pass + return reasons + + except Exception: + return [] + + def to_check_results(self, statuses: List[ClockStatus], node_name: str) -> List[CheckResult]: + """Convert ClockStatus list to CheckResult list for the watcher.""" + results = [] + for clk in statuses: + if clk.throttled: + results.append( + CheckResult( + check_name="GpuClockThrottled", + is_healthy=False, + is_fatal=False, + error_codes=["GPU_CLOCK_THROTTLED"], + message=( + f"GPU {clk.gpu_index} throttled on {node_name}: " + f"{clk.graphics_clock_current}/{clk.graphics_clock_max} MHz " + f"(ratio={clk.clock_ratio:.2f}, reasons={clk.throttle_reasons})" + ), + entities_impacted=[{"entityType": "GPU", "entityValue": str(clk.gpu_index)}], + metadata={"throttle_reasons": clk.throttle_reasons, "clock_ratio": str(clk.clock_ratio)}, + ) + ) + else: + results.append( + CheckResult( + check_name="GpuClockThrottled", + is_healthy=True, + is_fatal=False, + error_codes=[], + message=f"GPU {clk.gpu_index} clocks healthy on {node_name}", + entities_impacted=[{"entityType": "GPU", "entityValue": str(clk.gpu_index)}], + ) + ) + return results diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/cuda_validation.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/cuda_validation.py new file mode 100644 index 
000000000..24fb7c533 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/cuda_validation.py @@ -0,0 +1,145 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CUDA validation -- context creation and memory test. + +Runs a minimal CUDA test on each GPU: allocate memory, write a pattern, +read back, verify. This check runs at a slower cadence (default disabled) +since it consumes GPU resources. The test script is executed as a subprocess +so that a PyTorch import failure doesn't crash the main monitor process. 
+""" + +import json +import logging as log +import subprocess +import sys +import textwrap +from dataclasses import dataclass, field +from typing import List, Optional + +from .types import CheckResult + +# Inline Python script executed as a subprocess +_CUDA_TEST_SCRIPT = textwrap.dedent("""\ + import sys + import json + + results = {"passed": True, "gpu_count": 0, "errors": []} + + try: + import torch + except ImportError: + results["errors"].append("PyTorch not available") + results["passed"] = False + print(json.dumps(results)) + sys.exit(0) + + gpu_count = torch.cuda.device_count() + results["gpu_count"] = gpu_count + + if gpu_count == 0: + results["errors"].append("No CUDA devices found") + results["passed"] = False + print(json.dumps(results)) + sys.exit(0) + + for i in range(gpu_count): + try: + torch.cuda.set_device(i) + # Allocate, write, read back, verify + t = torch.randn(1024, device="cuda") + assert t.sum().isfinite(), f"GPU {i}: non-finite sum" + del t + torch.cuda.empty_cache() + except Exception as e: + results["errors"].append(f"GPU {i}: {e}") + results["passed"] = False + + print(json.dumps(results)) +""") + + +@dataclass +class CUDAValidationResult: + """Result of CUDA validation across all GPUs.""" + + passed: bool + gpu_count: int = 0 + errors: List[str] = field(default_factory=list) + error: Optional[str] = None # check-level error (couldn't run at all) + + +class CUDAValidator: + """Validates CUDA context creation and memory on each GPU.""" + + def check(self) -> CUDAValidationResult: + """Run CUDA validation script as a subprocess.""" + try: + result = subprocess.run( + [sys.executable, "-c", _CUDA_TEST_SCRIPT], + capture_output=True, + text=True, + timeout=120, # generous timeout for multi-GPU test + ) + + if result.returncode != 0: + return CUDAValidationResult( + passed=False, + error=f"CUDA test script failed: {result.stderr.strip()}", + ) + + data = json.loads(result.stdout.strip()) + return CUDAValidationResult( + 
passed=data.get("passed", False), + gpu_count=data.get("gpu_count", 0), + errors=data.get("errors", []), + ) + + except subprocess.TimeoutExpired: + return CUDAValidationResult( + passed=False, + error="CUDA validation timed out", + ) + except Exception as e: + return CUDAValidationResult( + passed=False, + error=str(e), + ) + + def to_check_results(self, result: CUDAValidationResult, node_name: str) -> List[CheckResult]: + """Convert CUDAValidationResult to CheckResult list for the watcher.""" + if not result.passed: + error_msg = "; ".join(result.errors) if result.errors else (result.error or "Unknown CUDA failure") + return [ + CheckResult( + check_name="CudaValidationFailed", + is_healthy=False, + is_fatal=True, + error_codes=["CUDA_VALIDATION_FAILED"], + message=f"CUDA validation failed on {node_name}: {error_msg}", + entities_impacted=[{"entityType": "NODE", "entityValue": node_name}], + metadata={"gpu_count": str(result.gpu_count)}, + ) + ] + else: + return [ + CheckResult( + check_name="CudaValidationFailed", + is_healthy=True, + is_fatal=False, + error_codes=[], + message=f"CUDA validation passed on {node_name} ({result.gpu_count} GPUs)", + entities_impacted=[{"entityType": "NODE", "entityValue": node_name}], + ) + ] diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/fabric_check.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/fabric_check.py new file mode 100644 index 000000000..6e369273d --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/fabric_check.py @@ -0,0 +1,162 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NVLink fabric health via DCGM metrics HTTP endpoint. + +Queries the DCGM exporter's Prometheus endpoint for NVLink bandwidth and +error counters. False-positive mitigation: NVLink bandwidth is normally +zero when no multi-GPU workload is running, so this check alone doesn't +flag unhealthy -- the watcher correlates with Fabric Manager status. +""" + +import logging as log +from dataclasses import dataclass +from typing import Dict, List, Optional + +import requests + +from .types import CheckResult + + +@dataclass +class NVLinkStatus: + """NVLink fabric health summary for the node.""" + + healthy: bool + total_tx_bytes: float = 0.0 + total_rx_bytes: float = 0.0 + crc_error_count: float = 0.0 + bandwidth_zero: bool = True + error: Optional[str] = None + + +class NVLinkFabricChecker: + """Checks NVLink fabric health via DCGM exporter metrics.""" + + # DCGM metric names we care about + _TX_METRIC = "DCGM_FI_PROF_NVLINK_TX_BYTES" + _RX_METRIC = "DCGM_FI_PROF_NVLINK_RX_BYTES" + _BW_METRIC = "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL" + _CRC_METRIC = "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL" + + def __init__(self, dcgm_url: str = "http://localhost:9400"): + self._dcgm_url = dcgm_url.rstrip("/") + + def check(self) -> NVLinkStatus: + """Query DCGM exporter and assess NVLink health.""" + try: + metrics = self._fetch_metrics() + except Exception as e: + return NVLinkStatus( + healthy=True, # can't determine -- assume healthy + error=f"Failed to fetch DCGM metrics: {e}", + ) + + tx = self._sum_metric(metrics, self._TX_METRIC) + rx = self._sum_metric(metrics, 
self._RX_METRIC)
+        bw = self._sum_metric(metrics, self._BW_METRIC)
+        crc = self._sum_metric(metrics, self._CRC_METRIC)
+
+        bandwidth_zero = (tx + rx + bw) == 0.0
+        has_errors = crc > 0
+
+        # NVLink bandwidth being zero is normal when idle.
+        # We only flag unhealthy if CRC errors are accumulating.
+        # The correlation with Fabric Manager down is done in the watcher.
+        healthy = not has_errors
+
+        return NVLinkStatus(
+            healthy=healthy,
+            total_tx_bytes=tx,
+            total_rx_bytes=rx,
+            crc_error_count=crc,
+            bandwidth_zero=bandwidth_zero,
+        )
+
+    def _fetch_metrics(self) -> Dict[str, list]:
+        """Fetch and parse Prometheus text format from DCGM exporter."""
+        resp = requests.get(
+            f"{self._dcgm_url}/metrics",
+            timeout=10,
+        )
+        resp.raise_for_status()
+        return self._parse_prometheus_text(resp.text)
+
+    def _parse_prometheus_text(self, text: str) -> Dict[str, list]:
+        """Parse Prometheus exposition format into {metric_name: [values]}."""
+        metrics: Dict[str, list] = {}
+        for line in text.splitlines():
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            # Lines: metric{labels} value [timestamp]
+            # Label values may contain spaces (e.g. DCGM exporter's
+            # modelName="NVIDIA H100 80GB HBM3"), so take the sample value
+            # from after the closing brace rather than splitting on the
+            # first space.
+            try:
+                if "}" in line:
+                    name = line.split("{", 1)[0]
+                    value_str = line[line.rindex("}") + 1 :].split()[0]
+                else:
+                    name, value_str = line.split()[:2]
+                value = float(value_str)
+                metrics.setdefault(name, []).append(value)
+            except (ValueError, IndexError):
+                continue
+        return metrics
+
+    def _sum_metric(self, metrics: Dict[str, list], name: str) -> float:
+        """Sum all values for a given metric name across GPUs."""
+        values = metrics.get(name, [])
+        return sum(values)
+
+    def to_check_results(
+        self, status: NVLinkStatus, node_name: str, fabric_manager_down: bool
+    ) -> List[CheckResult]:
+        """Convert NVLinkStatus to CheckResult list for the watcher.
+
+        False-positive mitigation: only flag unhealthy when NVLink has CRC
+        errors, or when bandwidth is zero while Fabric Manager is down.
+ """ + fabric_nvlink_degraded = not status.healthy or (status.bandwidth_zero and fabric_manager_down) + + if fabric_nvlink_degraded: + return [ + CheckResult( + check_name="NvlinkFabricDegraded", + is_healthy=False, + is_fatal=True, + error_codes=["NVLINK_FABRIC_DEGRADED"], + message=( + f"NVLink fabric degraded on {node_name} " + f"(crc_errors={status.crc_error_count:.0f}, " + f"bw_zero={status.bandwidth_zero}, fm_down={fabric_manager_down})" + ), + entities_impacted=[{"entityType": "NODE", "entityValue": node_name}], + metadata={ + "crc_error_count": str(status.crc_error_count), + "bandwidth_zero": str(status.bandwidth_zero), + "fabric_manager_down": str(fabric_manager_down), + }, + ) + ] + else: + return [ + CheckResult( + check_name="NvlinkFabricDegraded", + is_healthy=True, + is_fatal=False, + error_codes=[], + message=f"NVLink fabric healthy on {node_name}", + entities_impacted=[{"entityType": "NODE", "entityValue": node_name}], + ) + ] diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/pcie_check.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/pcie_check.py new file mode 100644 index 000000000..45ddb60fb --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/pcie_check.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PCIe link health check -- detects link downtraining. 
+ +Compares current PCIe link generation and width against maximum values. +On P5.48xlarge with H100 GPUs, expected: Gen5 x16 for all 8 GPUs. +On P4d.24xlarge with A100 GPUs, expected: Gen4 x16 for all 8 GPUs. +A drop (e.g. Gen5->Gen3 or x16->x8) indicates hardware degradation. +""" + +import logging as log +import subprocess +from dataclasses import dataclass +from typing import List, Optional + +from .types import CheckResult + + +@dataclass +class PCIeStatus: + """PCIe link status for a single GPU.""" + + gpu_index: int + link_gen_current: int + link_gen_max: int + link_width_current: int + link_width_max: int + degraded: bool + error: Optional[str] = None + + +class PCIeChecker: + """Checks PCIe link width and generation for all GPUs via nsenter nvidia-smi.""" + + def check(self) -> List[PCIeStatus]: + """Query nvidia-smi for PCIe link status on all GPUs via nsenter.""" + try: + result = subprocess.run( + [ + "nsenter", + "-t", + "1", + "-m", + "--", + "nvidia-smi", + "--query-gpu=index,pcie.link.gen.current,pcie.link.gen.max," + "pcie.link.width.current,pcie.link.width.max", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode != 0: + log.error(f"nvidia-smi PCIe query failed: {result.stderr.strip()}") + return [] + + return self._parse_output(result.stdout) + + except subprocess.TimeoutExpired: + log.error("nvidia-smi PCIe query timed out") + return [] + except FileNotFoundError: + log.error("nvidia-smi not found") + return [] + except Exception as e: + log.error(f"PCIe check failed: {e}") + return [] + + def _parse_output(self, output: str) -> List[PCIeStatus]: + """Parse nvidia-smi CSV output into PCIeStatus objects.""" + results = [] + for line in output.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) != 5: + continue + try: + idx = int(parts[0]) + gen_cur = int(parts[1]) + gen_max = int(parts[2]) + width_cur = int(parts[3]) + width_max = int(parts[4]) + + 
degraded = (gen_cur < gen_max) or (width_cur < width_max) + + results.append( + PCIeStatus( + gpu_index=idx, + link_gen_current=gen_cur, + link_gen_max=gen_max, + link_width_current=width_cur, + link_width_max=width_max, + degraded=degraded, + ) + ) + except (ValueError, IndexError) as e: + log.warning(f"Failed to parse PCIe line '{line}': {e}") + + return results + + def to_check_results(self, statuses: List[PCIeStatus], node_name: str) -> List[CheckResult]: + """Convert PCIeStatus list to CheckResult list for the watcher.""" + results = [] + for pcie in statuses: + if pcie.degraded: + results.append( + CheckResult( + check_name="PcieLinkDegraded", + is_healthy=False, + is_fatal=True, + error_codes=["PCIE_LINK_DEGRADED"], + message=( + f"PCIe link degraded on {node_name} GPU {pcie.gpu_index}: " + f"Gen{pcie.link_gen_current} x{pcie.link_width_current} " + f"(max Gen{pcie.link_gen_max} x{pcie.link_width_max})" + ), + entities_impacted=[{"entityType": "GPU", "entityValue": str(pcie.gpu_index)}], + ) + ) + else: + results.append( + CheckResult( + check_name="PcieLinkDegraded", + is_healthy=True, + is_fatal=False, + error_codes=[], + message=f"PCIe link healthy on {node_name} GPU {pcie.gpu_index}", + entities_impacted=[{"entityType": "GPU", "entityValue": str(pcie.gpu_index)}], + ) + ) + return results diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/service_check.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/service_check.py new file mode 100644 index 000000000..5531f8cf1 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/service_check.py @@ -0,0 +1,272 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Systemd service health checks for Fabric Manager and GPU services. + +Uses nsenter to inspect host systemd services from within a container. +Includes flap detection (rapid restart cycling) and journal error parsing. +NRestarts is queried in a separate systemctl call for compatibility with +older systemd versions that do not support it in a combined --property list. +""" + +import logging as log +import subprocess +import time +from collections import deque +from dataclasses import dataclass, field +from enum import Enum +from typing import Dict, List, Optional + +from .types import CheckResult + + +class ErrorCategory(Enum): + NVSWITCH_ERROR = "nvswitch_error" + INITIALIZATION_FAILED = "initialization_failed" + TIMEOUT = "timeout" + GENERAL_ERROR = "general_error" + + +# Journal patterns that indicate specific failure modes +_ERROR_PATTERNS = { + ErrorCategory.NVSWITCH_ERROR: [ + "nvswitch", + "NVSwitch", + "fabric error", + ], + ErrorCategory.INITIALIZATION_FAILED: [ + "initialization failed", + "failed to initialize", + "Init Failed", + "unable to start", + ], + ErrorCategory.TIMEOUT: [ + "timed out", + "timeout", + "deadline exceeded", + ], + ErrorCategory.GENERAL_ERROR: [ + "error", + "fatal", + "failed", + ], +} + + +@dataclass +class ServiceStatus: + """Result of a single systemd service check.""" + + name: str + active: bool # True if ActiveState == "active" + sub_state: str = "" # e.g. 
"running", "dead", "failed" + main_pid: int = 0 + n_restarts: int = 0 + start_timestamp: str = "" + error: Optional[str] = None # non-None if the check itself failed + + +@dataclass +class FabricManagerStatus(ServiceStatus): + """Extended status for Fabric Manager with journal analysis.""" + + journal_errors: List[ErrorCategory] = field(default_factory=list) + flapping: bool = False + + +class ServiceChecker: + """Checks host systemd services via nsenter.""" + + # Additional GPU-related services to check alongside Fabric Manager + DEFAULT_GPU_SERVICES = [ + "nvidia-persistenced", + "nv-hostengine", + ] + + def __init__(self, flap_window: int = 600, flap_threshold: int = 3): + self._flap_window = flap_window + self._flap_threshold = flap_threshold + # Track restart timestamps per service for flap detection + self._restart_history: Dict[str, deque] = {} + # Track last-seen restart count to detect new restarts + self._last_restart_count: Dict[str, int] = {} + + def _run_host_cmd(self, cmd: List[str], timeout: int = 10) -> subprocess.CompletedProcess: + """Run a command on the host via nsenter into PID 1's mount namespace.""" + full_cmd = ["nsenter", "-t", "1", "-m", "--"] + cmd + return subprocess.run( + full_cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + + def check_service(self, service_name: str) -> ServiceStatus: + """Check a single systemd service via nsenter. + + Parses systemctl show output for ActiveState, SubState, MainPID, + and ExecMainStartTimestamp. NRestarts is queried separately since + older systemd versions don't support it in a combined property list. 
+ """ + try: + result = self._run_host_cmd([ + "systemctl", + "show", + service_name, + "--property=ActiveState,SubState,MainPID,ExecMainStartTimestamp", + ]) + + if result.returncode != 0 and not result.stdout.strip(): + return ServiceStatus( + name=service_name, + active=False, + error=f"systemctl show failed: {result.stderr.strip()}", + ) + + props = {} + for line in result.stdout.strip().splitlines(): + if "=" in line: + key, _, value = line.partition("=") + props[key.strip()] = value.strip() + + active_state = props.get("ActiveState", "unknown") + + # NRestarts isn't available on older systemd; query separately + n_restarts = self._get_restart_count(service_name) + + # Flap detection + self._update_flap_tracking(service_name, n_restarts) + + return ServiceStatus( + name=service_name, + active=(active_state == "active"), + sub_state=props.get("SubState", ""), + main_pid=int(props.get("MainPID", "0")), + n_restarts=n_restarts, + start_timestamp=props.get("ExecMainStartTimestamp", ""), + ) + + except subprocess.TimeoutExpired: + return ServiceStatus( + name=service_name, + active=False, + error="systemctl show timed out", + ) + except Exception as e: + return ServiceStatus( + name=service_name, + active=False, + error=str(e), + ) + + def _get_restart_count(self, service_name: str) -> int: + """Get NRestarts from systemd, returning 0 if unsupported.""" + try: + result = self._run_host_cmd([ + "systemctl", + "show", + service_name, + "--property=NRestarts", + ]) + if result.returncode == 0 and result.stdout.strip(): + _, _, val = result.stdout.strip().partition("=") + return int(val) + except Exception: + pass + return 0 + + def _update_flap_tracking(self, service_name: str, current_restarts: int) -> None: + """Track restart events for flap detection.""" + if service_name not in self._restart_history: + self._restart_history[service_name] = deque() + self._last_restart_count[service_name] = current_restarts + return + + last_count = 
self._last_restart_count[service_name] + if current_restarts > last_count: + # New restarts detected -- record timestamp for each + now = time.monotonic() + for _ in range(current_restarts - last_count): + self._restart_history[service_name].append(now) + self._last_restart_count[service_name] = current_restarts + + # Prune entries outside the flap window + cutoff = time.monotonic() - self._flap_window + history = self._restart_history[service_name] + while history and history[0] < cutoff: + history.popleft() + + def is_flapping(self, service_name: str) -> bool: + """Return True if the service has restarted too many times within the window.""" + history = self._restart_history.get(service_name, deque()) + return len(history) >= self._flap_threshold + + def check_fabric_manager(self) -> FabricManagerStatus: + """Check Fabric Manager with journal error analysis.""" + base = self.check_service("nvidia-fabricmanager") + + journal_errors = self._parse_journal_errors("nvidia-fabricmanager") + flapping = self.is_flapping("nvidia-fabricmanager") + + return FabricManagerStatus( + name=base.name, + active=base.active, + sub_state=base.sub_state, + main_pid=base.main_pid, + n_restarts=base.n_restarts, + start_timestamp=base.start_timestamp, + error=base.error, + journal_errors=journal_errors, + flapping=flapping, + ) + + def _parse_journal_errors(self, service_name: str) -> List[ErrorCategory]: + """Scan recent journal entries for known error patterns.""" + try: + result = self._run_host_cmd( + [ + "journalctl", + "-u", + service_name, + "--since", + "5 minutes ago", + "--no-pager", + "-q", + ], + timeout=15, + ) + + if result.returncode != 0 or not result.stdout.strip(): + return [] + + found: List[ErrorCategory] = [] + text = result.stdout.lower() + for category, patterns in _ERROR_PATTERNS.items(): + if any(p.lower() in text for p in patterns): + found.append(category) + + return found + + except (subprocess.TimeoutExpired, Exception) as e: + log.warning(f"Journal parsing 
failed for {service_name}: {e}") + return [] + + def check_all_gpu_services(self, service_names: Optional[List[str]] = None) -> Dict[str, ServiceStatus]: + """Check all configured GPU services.""" + if service_names is None: + service_names = self.DEFAULT_GPU_SERVICES + results = {} + for name in service_names: + results[name] = self.check_service(name) + return results diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/types.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/types.py new file mode 100644 index 000000000..46eaff07b --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/types.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import dataclasses +from typing import Dict, List, Optional + + +@dataclasses.dataclass +class CheckResult: + """Result of a single health check. + + Attributes: + check_name: Identifier for the check type, e.g. "FabricManagerServiceDown", "PcieLinkDegraded". + is_healthy: True if the check passed without issues. + is_fatal: True if the failure warrants immediate remediation (e.g. node restart). + error_codes: Machine-readable error codes for downstream processing. + message: Human-readable description of the check result. + entities_impacted: List of impacted entities, e.g. [{"entityType": "GPU", "entityValue": "0"}]. 
+ metadata: Optional key-value metadata attached to the health event. + """ + + check_name: str + is_healthy: bool + is_fatal: bool + error_codes: List[str] + message: str + entities_impacted: List[Dict[str, str]] + metadata: Optional[Dict[str, str]] = None + + +class CallbackInterface(abc.ABC): + @abc.abstractmethod + def health_check_completed(self, results: List[CheckResult]) -> None: + """Called after each check cycle with the aggregated results from all checkers.""" + pass diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/watcher.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/watcher.py new file mode 100644 index 000000000..7b9929a95 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/checkers/watcher.py @@ -0,0 +1,301 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Main polling loop for fabric-manager-monitor. + +Runs all enabled health checks on a configurable interval and fires +callbacks (e.g. PlatformConnectorEventProcessor) with the aggregated +results. Mirrors the DCGMWatcher pattern from gpu-health-monitor. 
+""" + +import logging as log +import time +from concurrent.futures import ThreadPoolExecutor +from functools import partial +from threading import Event +from typing import List + +from fabric_manager_monitor import metrics +from .clock_check import ClockChecker +from .cuda_validation import CUDAValidator +from .fabric_check import NVLinkFabricChecker +from .pcie_check import PCIeChecker +from .service_check import ServiceChecker +from .types import CallbackInterface, CheckResult + + +class FabricManagerWatcher: + """Orchestrates all health checks and fires callbacks with results. + + Follows the same callback pattern as DCGMWatcher: a list of CallbackInterface + implementations are invoked after each check cycle. + """ + + def __init__( + self, + poll_interval: int, + callbacks: List[CallbackInterface], + node_name: str, + boot_grace_period: int = 300, + flap_window: int = 600, + flap_threshold: int = 3, + enable_fabric_check: bool = True, + enable_pcie_check: bool = True, + enable_clock_check: bool = True, + enable_nvlink_check: bool = True, + enable_cuda_validation: bool = False, + dcgm_exporter_url: str = "http://localhost:9400", + clock_throttle_ratio: float = 0.85, + ) -> None: + self._poll_interval = poll_interval + self._callbacks = callbacks + self._node_name = node_name + self._boot_grace_period = boot_grace_period + self._start_time = time.monotonic() + self._callback_thread_pool = ThreadPoolExecutor() + + # Track cross-check state for correlation + self._fabric_manager_down = False + + # Initialize checkers and build the check list based on enabled flags + self._checkers: List[tuple[str, callable]] = [] + + if enable_fabric_check: + self._service_checker = ServiceChecker( + flap_window=flap_window, + flap_threshold=flap_threshold, + ) + self._checkers.append(("services", self._run_service_checks)) + + if enable_pcie_check: + self._pcie_checker = PCIeChecker() + self._checkers.append(("pcie", self._run_pcie_checks)) + + if enable_clock_check: + 
self._clock_checker = ClockChecker(throttle_ratio=clock_throttle_ratio)
+            self._checkers.append(("clocks", self._run_clock_checks))
+
+        if enable_nvlink_check:
+            self._nvlink_checker = NVLinkFabricChecker(dcgm_url=dcgm_exporter_url)
+            self._checkers.append(("nvlink", self._run_nvlink_checks))
+
+        if enable_cuda_validation:
+            self._cuda_validator = CUDAValidator()
+            self._checkers.append(("cuda", self._run_cuda_checks))
+
+    def _in_grace_period(self) -> bool:
+        return (time.monotonic() - self._start_time) < self._boot_grace_period
+
+    def _fire_callback_funcs(self, results: List[CheckResult]) -> None:
+        """Invoke health_check_completed on all registered callbacks."""
+
+        def done_callback(class_name: str, future):
+            e = future.exception()
+            if e is not None:
+                log.error(f"Callback failed: {e}", exc_info=e)
+                metrics.callback_failures.labels(class_name, "health_check_completed").inc()
+            else:
+                metrics.callback_success.labels(class_name, "health_check_completed").inc()
+
+        for callback in self._callbacks:
+            log.debug(f"Invoking health_check_completed on {callback.__class__.__name__}")
+            self._callback_thread_pool.submit(callback.health_check_completed, results).add_done_callback(
+                partial(done_callback, callback.__class__.__name__)
+            )
+
+    def start(self, exit: Event) -> None:
+        """Run the polling loop until exit is signaled."""
+        log.info(
+            f"Starting FabricManagerWatcher on {self._node_name} with "
+            f"{len(self._checkers)} checkers, poll_interval={self._poll_interval}s"
+        )
+
+        while not exit.is_set():
+            with metrics.overall_reconcile_loop_time.time():
+                results: List[CheckResult] = []
+
+                for name, check_func in self._checkers:
+                    with metrics.check_duration.labels(name).time():
+                        try:
+                            check_results = check_func()
+                            results.extend(check_results)
+                        except Exception as e:
+                            log.error(f"Check '{name}' failed with exception: {e}")
+                            metrics.check_errors.labels(name).inc()
+
+                # Update overall node health metric
+                if self._in_grace_period():
+                    
metrics.gpu_node_health_up.labels(self._node_name).set(1) + log.debug("In boot grace period, reporting healthy") + else: + overall_healthy = all(r.is_healthy for r in results) if results else True + metrics.gpu_node_health_up.labels(self._node_name).set(1 if overall_healthy else 0) + + # Fire callbacks with all results + if results: + self._fire_callback_funcs(results) + + log.debug("Waiting till next cycle") + exit.wait(self._poll_interval) + + # Cleanup on exit + self._callback_thread_pool.shutdown(cancel_futures=True) + + def _run_service_checks(self) -> List[CheckResult]: + """Check Fabric Manager and GPU services.""" + results: List[CheckResult] = [] + + fm = self._service_checker.check_fabric_manager() + self._fabric_manager_down = not fm.active + + # Update Prometheus metrics + metrics.fabric_manager_up.labels(self._node_name).set(1 if fm.active else 0) + if fm.active: + metrics.fabric_manager_last_healthy_seconds.labels(self._node_name).set(time.time()) + + if fm.flapping: + log.warning(f"Fabric Manager is flapping on {self._node_name}") + + if fm.journal_errors: + log.warning(f"Fabric Manager journal errors on {self._node_name}: {[e.value for e in fm.journal_errors]}") + + if not fm.active and not self._in_grace_period(): + error_codes = ["FABRIC_MANAGER_NOT_RUNNING"] + if fm.flapping: + error_codes.append("FABRIC_MANAGER_FLAPPING") + if fm.journal_errors: + error_codes.extend([f"JOURNAL_{e.value.upper()}" for e in fm.journal_errors]) + + results.append( + CheckResult( + check_name="FabricManagerServiceDown", + is_healthy=False, + is_fatal=True, + error_codes=error_codes, + message=f"Fabric Manager is {fm.sub_state} on {self._node_name}", + entities_impacted=[{"entityType": "NODE", "entityValue": self._node_name}], + metadata={ + "sub_state": fm.sub_state, + "n_restarts": str(fm.n_restarts), + "flapping": str(fm.flapping), + }, + ) + ) + elif fm.active: + results.append( + CheckResult( + check_name="FabricManagerServiceDown", + is_healthy=True, + 
is_fatal=False, + error_codes=[], + message=f"Fabric Manager is running on {self._node_name}", + entities_impacted=[{"entityType": "NODE", "entityValue": self._node_name}], + ) + ) + + # Check additional GPU services + svc_results = self._service_checker.check_all_gpu_services() + for svc_name, status in svc_results.items(): + metrics.nvidia_service_up.labels(self._node_name, svc_name).set(1 if status.active else 0) + if not status.active and not self._in_grace_period(): + results.append( + CheckResult( + check_name="GpuServiceDown", + is_healthy=False, + is_fatal=False, + error_codes=["GPU_SERVICE_NOT_RUNNING"], + message=f"Service {svc_name} is {status.sub_state} on {self._node_name}", + entities_impacted=[{"entityType": "NODE", "entityValue": self._node_name}], + metadata={"service_name": svc_name, "sub_state": status.sub_state}, + ) + ) + elif status.active: + results.append( + CheckResult( + check_name="GpuServiceDown", + is_healthy=True, + is_fatal=False, + error_codes=[], + message=f"Service {svc_name} is running on {self._node_name}", + entities_impacted=[{"entityType": "NODE", "entityValue": self._node_name}], + metadata={"service_name": svc_name}, + ) + ) + + return results + + def _run_pcie_checks(self) -> List[CheckResult]: + """Check PCIe link health for all GPUs.""" + statuses = self._pcie_checker.check() + + # Update Prometheus metrics + for pcie in statuses: + gpu = str(pcie.gpu_index) + metrics.pcie_link_width.labels(self._node_name, gpu).set(pcie.link_width_current) + metrics.pcie_link_gen.labels(self._node_name, gpu).set(pcie.link_gen_current) + metrics.pcie_link_degraded.labels(self._node_name, gpu).set(1 if pcie.degraded else 0) + if pcie.degraded: + log.warning( + f"PCIe degraded on {self._node_name} GPU {gpu}: " + f"Gen{pcie.link_gen_current} x{pcie.link_width_current} " + f"(max Gen{pcie.link_gen_max} x{pcie.link_width_max})" + ) + + return self._pcie_checker.to_check_results(statuses, self._node_name) + + def _run_clock_checks(self) -> 
List[CheckResult]: + """Check GPU clock throttling.""" + statuses = self._clock_checker.check() + + # Update Prometheus metrics + for clk in statuses: + gpu = str(clk.gpu_index) + metrics.gpu_clock_throttled.labels(self._node_name, gpu).set(1 if clk.throttled else 0) + metrics.gpu_clock_ratio.labels(self._node_name, gpu).set(clk.clock_ratio) + if clk.throttled: + log.warning( + f"GPU {gpu} throttled on {self._node_name}: " + f"{clk.graphics_clock_current}/{clk.graphics_clock_max} MHz " + f"(ratio={clk.clock_ratio:.2f}, reasons={clk.throttle_reasons})" + ) + + return self._clock_checker.to_check_results(statuses, self._node_name) + + def _run_nvlink_checks(self) -> List[CheckResult]: + """Check NVLink fabric health.""" + status = self._nvlink_checker.check() + + # False-positive mitigation: only flag unhealthy when NVLink has CRC errors + # OR bandwidth is zero AND Fabric Manager is down + fabric_nvlink_degraded = not status.healthy or (status.bandwidth_zero and self._fabric_manager_down) + metrics.nvlink_fabric_healthy.labels(self._node_name).set(0 if fabric_nvlink_degraded else 1) + + if fabric_nvlink_degraded and not self._in_grace_period(): + log.error( + f"NVLink fabric degraded on {self._node_name} " + f"(crc_errors={status.crc_error_count:.0f}, " + f"bw_zero={status.bandwidth_zero}, fm_down={self._fabric_manager_down})" + ) + + return self._nvlink_checker.to_check_results(status, self._node_name, self._fabric_manager_down) + + def _run_cuda_checks(self) -> List[CheckResult]: + """Run CUDA validation.""" + result = self._cuda_validator.check() + + metrics.cuda_validation_passed.labels(self._node_name).set(1 if result.passed else 0) + if not result.passed: + log.error(f"CUDA validation FAILED on {self._node_name}: {result.errors or result.error}") + + return self._cuda_validator.to_check_results(result, self._node_name) diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/cli.py 
b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/cli.py new file mode 100644 index 000000000..ebbe9442b --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/cli.py @@ -0,0 +1,160 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import signal +import sys +import logging as log +from importlib.metadata import version as get_package_version +from threading import Event + +import click +from prometheus_client import start_http_server + +from fabric_manager_monitor.checkers.watcher import FabricManagerWatcher +from fabric_manager_monitor.logger import set_default_structured_logger_with_level +from fabric_manager_monitor.platform_connector.event_processor import PlatformConnectorEventProcessor +from fabric_manager_monitor.protos import health_event_pb2 as platformconnector_pb2 + + +@click.command() +@click.option( + "--platform-connector-socket", + type=str, + required=True, + help="Unix socket path for gRPC connection to platform-connector", +) +@click.option("--port", type=int, default=9101, help="Prometheus metrics HTTP server port") +@click.option("--poll-interval", type=int, default=30, help="Seconds between check cycles") +@click.option( + "--node-name", + type=str, + default=None, + help="Node name (defaults to NODE_NAME or HOSTNAME env var)", +) +@click.option("--boot-grace-period", type=int, default=300, help="Seconds after startup to suppress 
unhealthy alerts") +@click.option("--flap-window", type=int, default=600, help="Seconds window for counting service restarts") +@click.option("--flap-threshold", type=int, default=3, help="Restart count within flap window to flag flapping") +@click.option("--enable-fabric-check/--disable-fabric-check", default=True, help="Enable Fabric Manager service check") +@click.option("--enable-pcie-check/--disable-pcie-check", default=True, help="Enable PCIe link health check") +@click.option("--enable-clock-check/--disable-clock-check", default=True, help="Enable GPU clock throttle check") +@click.option("--enable-nvlink-check/--disable-nvlink-check", default=True, help="Enable NVLink fabric health check") +@click.option( + "--enable-cuda-validation/--disable-cuda-validation", + default=False, + help="Enable CUDA context validation (resource intensive, disabled by default)", +) +@click.option( + "--dcgm-exporter-url", + type=str, + default="http://localhost:9400", + help="DCGM exporter Prometheus endpoint URL", +) +@click.option( + "--clock-throttle-ratio", + type=float, + default=0.85, + help="Clock ratio threshold below which GPU is considered throttled", +) +@click.option( + "--processing-strategy", + type=str, + default="EXECUTE_REMEDIATION", + help="Event processing strategy: EXECUTE_REMEDIATION or STORE_ONLY", +) +@click.option("--verbose", is_flag=True, default=False, help="Enable debug logging") +def cli( + platform_connector_socket, + port, + poll_interval, + node_name, + boot_grace_period, + flap_window, + flap_threshold, + enable_fabric_check, + enable_pcie_check, + enable_clock_check, + enable_nvlink_check, + enable_cuda_validation, + dcgm_exporter_url, + clock_throttle_ratio, + processing_strategy, + verbose, +): + exit = Event() + + # Resolve node name from CLI or environment + if node_name is None: + node_name = os.getenv("NODE_NAME", os.getenv("HOSTNAME", "")) + if not node_name: + log.fatal("Failed to determine node name from --node-name, NODE_NAME, or 
HOSTNAME") + sys.exit(1) + + # Initialize structured JSON logging + # Version is read from package metadata (set at build time via poetry version) + version = get_package_version("fabric-manager-monitor") + log_level = "debug" if verbose else os.getenv("LOG_LEVEL", "info") + set_default_structured_logger_with_level("fabric-manager-monitor", version, log_level) + + # Validate processing strategy + try: + processing_strategy_value = platformconnector_pb2.ProcessingStrategy.Value(processing_strategy) + except ValueError: + valid_strategies = list(platformconnector_pb2.ProcessingStrategy.keys()) + log.fatal(f"Invalid processing_strategy '{processing_strategy}'. Valid options are: {valid_strategies}") + sys.exit(1) + + log.info(f"Event handling strategy configured to: {processing_strategy_value}") + log.info("Initialization completed") + + # Create event processor (platform-connector gRPC client) + event_processor = PlatformConnectorEventProcessor( + socket_path=platform_connector_socket, + node_name=node_name, + processing_strategy=processing_strategy_value, + ) + + # Start Prometheus HTTP server + prom_server, t = start_http_server(port) + + def process_exit_signal(signum, frame): + exit.set() + prom_server.shutdown() + t.join() + + signal.signal(signal.SIGTERM, process_exit_signal) + signal.signal(signal.SIGINT, process_exit_signal) + + # Create watcher with all enabled checks + watcher = FabricManagerWatcher( + poll_interval=poll_interval, + callbacks=[event_processor], + node_name=node_name, + boot_grace_period=boot_grace_period, + flap_window=flap_window, + flap_threshold=flap_threshold, + enable_fabric_check=enable_fabric_check, + enable_pcie_check=enable_pcie_check, + enable_clock_check=enable_clock_check, + enable_nvlink_check=enable_nvlink_check, + enable_cuda_validation=enable_cuda_validation, + dcgm_exporter_url=dcgm_exporter_url, + clock_throttle_ratio=clock_throttle_ratio, + ) + + watcher.start(exit) + + +if __name__ == "__main__": + cli() diff --git 
a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/logger.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/logger.py new file mode 100644 index 000000000..ff0c37b65 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/logger.py @@ -0,0 +1,112 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Structured JSON logging for fabric-manager-monitor using structlog. + +Usage: + from fabric_manager_monitor.logger import set_default_structured_logger_with_level + import logging as log + + # At application startup + set_default_structured_logger_with_level("fabric-manager-monitor", "v0.1.0", "info") + + # Use standard logging + log.info("Application started") + log.info("Processing checks", extra={"check_count": 5}) +""" + +import logging +import sys +from typing import Any, Callable, Final + +import structlog + +# Log level mapping from string to logging constants +_LEVEL_MAP: Final[dict[str, int]] = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warn": logging.WARNING, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + + +def _parse_log_level(level: str) -> int: + """Convert a string log level to a logging constant.""" + return _LEVEL_MAP.get(level.lower().strip(), logging.INFO) + + +def _make_module_version_injector( + module: str, version: str +) -> Callable[[logging.Logger | None, str, dict[str, 
Any]], dict[str, Any]]: + """Create a processor that injects module and version into every log entry.""" + + def inject_module_version( + logger: logging.Logger | None, + method_name: str, + event_dict: dict[str, Any], + ) -> dict[str, Any]: + event_dict["module"] = module + event_dict["version"] = version + return event_dict + + return inject_module_version + + +def set_default_structured_logger_with_level(module: str, version: str, level: str) -> None: + """ + Initialize the structured logger with the specified log level. + + Args: + module: The name of the module/application using the logger. + version: The version of the module/application (e.g., "v1.0.0"). + level: The log level as a string (e.g., "debug", "info", "warn", "error"). + """ + log_level = _parse_log_level(level) + + # Create processor that injects module/version into every log + inject_module_version = _make_module_version_injector(module, version) + + # Configure standard library root logger + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + + # Remove all existing handlers + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # Create handler that writes to stderr + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(log_level) + + # Use structlog's processors with module/version injection + handler.setFormatter( + structlog.stdlib.ProcessorFormatter( + foreign_pre_chain=[ + structlog.stdlib.add_log_level, + structlog.stdlib.ExtraAdder(), + structlog.processors.TimeStamper(fmt="iso"), + inject_module_version, + ], + processors=[ + structlog.stdlib.ProcessorFormatter.remove_processors_meta, + structlog.processors.add_log_level, + structlog.processors.JSONRenderer(), + ], + ) + ) + + root_logger.addHandler(handler) diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/metrics.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/metrics.py new file mode 100644 index 000000000..5fd279e00 --- /dev/null +++ 
b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/metrics.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prometheus metric definitions for health checks.""" + +from prometheus_client import Counter, Gauge, Histogram + +# --- Check infrastructure --- +check_duration = Histogram( + "fabric_monitor_check_duration_seconds", + "Duration of individual health check execution", + labelnames=["check_name"], + buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0), +) + +check_errors = Counter( + "fabric_monitor_check_errors_total", + "Total errors encountered during health checks", + labelnames=["check_name"], +) + +overall_reconcile_loop_time = Histogram( + "fabric_monitor_reconcile_time", + "Amount of time spent running a single reconcile loop", +) + +callback_failures = Counter( + "fabric_monitor_callback_failures", + "Number of times a callback function has thrown an exception", + labelnames=["class_name", "func_name"], +) + +callback_success = Counter( + "fabric_monitor_callback_success", + "Number of times a callback function has successfully completed", + labelnames=["class_name", "func_name"], +) + +# --- Fabric Manager --- +fabric_manager_up = Gauge( + "fabric_manager_up", + "Fabric Manager service status (1=running, 0=down)", + labelnames=["node"], +) + +fabric_manager_last_healthy_seconds = Gauge( + "fabric_manager_last_healthy_seconds", + "Unix timestamp of last 
healthy Fabric Manager observation", + labelnames=["node"], +) + +# --- GPU systemd services --- +nvidia_service_up = Gauge( + "nvidia_service_up", + "NVIDIA systemd service status (1=running, 0=down)", + labelnames=["node", "service_name"], +) + +# --- PCIe link health --- +pcie_link_width = Gauge( + "pcie_link_width", + "Current PCIe link width", + labelnames=["node", "gpu"], +) + +pcie_link_gen = Gauge( + "pcie_link_gen", + "Current PCIe link generation", + labelnames=["node", "gpu"], +) + +pcie_link_degraded = Gauge( + "pcie_link_degraded", + "PCIe link degraded (1=degraded, 0=normal)", + labelnames=["node", "gpu"], +) + +# --- NVLink fabric --- +nvlink_fabric_healthy = Gauge( + "nvlink_fabric_healthy", + "NVLink fabric health (1=healthy, 0=degraded)", + labelnames=["node"], +) + +# --- Clock throttling --- +gpu_clock_throttled = Gauge( + "gpu_clock_throttled", + "GPU clock throttled (1=throttled, 0=normal)", + labelnames=["node", "gpu"], +) + +gpu_clock_ratio = Gauge( + "gpu_clock_ratio", + "GPU clock ratio (current/max, 1.0=full speed)", + labelnames=["node", "gpu"], +) + +# --- CUDA validation --- +cuda_validation_passed = Gauge( + "cuda_validation_passed", + "CUDA validation result (1=passed, 0=failed)", + labelnames=["node"], +) + +# --- Overall node health --- +gpu_node_health_up = Gauge( + "gpu_node_health_up", + "Overall GPU node health (1=healthy, 0=unhealthy)", + labelnames=["node"], +) + +# --- Active health events --- +active_health_events = Gauge( + "fabric_monitor_active_health_events", + "Number of active health events by type and severity", + labelnames=["event_type", "gpu_id", "severity"], +) diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/__init__.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ 
b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/event_processor.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/event_processor.py new file mode 100644 index 000000000..8c9b854cb --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/event_processor.py @@ -0,0 +1,167 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Platform-connector event processor. + +Implements the CallbackInterface to convert CheckResults into protobuf HealthEvents +and send them to the platform-connector via gRPC over a Unix domain socket. 
+""" + +import dataclasses +import logging as log +from time import sleep +from typing import List + +import grpc +from google.protobuf.timestamp_pb2 import Timestamp + +from fabric_manager_monitor.checkers.types import CallbackInterface, CheckResult +from fabric_manager_monitor.protos import ( + health_event_pb2 as platformconnector_pb2, + health_event_pb2_grpc as platformconnector_pb2_grpc, +) +from . import metrics + +MAX_RETRIES = 5 +INITIAL_DELAY = 2 +MAX_DELAY = 15 + + +@dataclasses.dataclass +class CachedEntityState: + is_fatal: bool + is_healthy: bool + + +class PlatformConnectorEventProcessor(CallbackInterface): + """Converts check results to HealthEvents and sends them via gRPC to platform-connector.""" + + def __init__( + self, + socket_path: str, + node_name: str, + processing_strategy: platformconnector_pb2.ProcessingStrategy, + ) -> None: + self._socket_path = socket_path + self._node_name = node_name + self._version = 1 + self._agent = "fabric-manager-monitor" + self._component_class = "INFRASTRUCTURE" + self._processing_strategy = processing_strategy + self.entity_cache: dict[str, CachedEntityState] = {} + + def _build_cache_key(self, check_name: str, entities_impacted: List[dict]) -> str: + """Build a cache key from check name and impacted entities.""" + entity_str = "|".join( + f"{e['entityType']}:{e['entityValue']}" + for e in sorted(entities_impacted, key=lambda e: (e["entityType"], e["entityValue"])) + ) + return f"{check_name}|{entity_str}" + + def _get_recommended_action(self, result: CheckResult) -> int: + """Map check result to a RecommendedAction enum value. + + Fatal infrastructure failures (Fabric Manager down, PCIe degraded) recommend RESTART_BM. + Non-fatal issues (clock throttle, NVLink CRC) recommend CONTACT_SUPPORT. + Healthy results use NONE. 
+ """ + if result.is_healthy: + return platformconnector_pb2.NONE + if result.is_fatal: + return platformconnector_pb2.RESTART_BM + return platformconnector_pb2.CONTACT_SUPPORT + + def health_check_completed(self, results: List[CheckResult]) -> None: + """Process check results and send state-change HealthEvents to platform-connector.""" + with metrics.health_events_publish_time_to_grpc_channel.labels( + "health_check_completed_to_grpc_channel" + ).time(): + log.debug("received callback for health check completed") + timestamp = Timestamp() + timestamp.GetCurrentTime() + + health_events = [] + pending_cache_updates: dict[str, CachedEntityState] = {} + + for result in results: + cache_key = self._build_cache_key(result.check_name, result.entities_impacted) + cached = self.entity_cache.get(cache_key) + + # Only send if state changed (or first observation) + if cached is None or cached.is_fatal != result.is_fatal or cached.is_healthy != result.is_healthy: + entities = [ + platformconnector_pb2.Entity(entityType=e["entityType"], entityValue=e["entityValue"]) + for e in result.entities_impacted + ] + + recommended_action = self._get_recommended_action(result) + + health_event = platformconnector_pb2.HealthEvent( + version=self._version, + agent=self._agent, + componentClass=self._component_class, + checkName=result.check_name, + isFatal=result.is_fatal, + isHealthy=result.is_healthy, + message=result.message, + recommendedAction=recommended_action, + errorCode=result.error_codes, + entitiesImpacted=entities, + metadata=result.metadata or {}, + generatedTimestamp=timestamp, + nodeName=self._node_name, + processingStrategy=self._processing_strategy, + ) + health_events.append(health_event) + pending_cache_updates[cache_key] = CachedEntityState( + is_fatal=result.is_fatal, is_healthy=result.is_healthy + ) + + log.debug(f"fabric manager health events to send: {len(health_events)}") + if len(health_events): + try: + if self.send_health_event_with_retries(health_events): + # 
Only update cache after successful send
+                        for key, state in pending_cache_updates.items():
+                            self.entity_cache[key] = state
+                            log.info(f"Updated cache for key {key} with value {state} after successful send")
+                except Exception as e:
+                    log.error(f"Exception while sending health events: {e}")
+
+    def send_health_event_with_retries(self, health_events: list[platformconnector_pb2.HealthEvent]) -> bool:
+        """Send health events to the platform connector with retries.
+
+        Returns:
+            True if the send was successful, False if all retries were exhausted.
+            Cache updates should only be performed by the caller when this returns True.
+        """
+        delay = INITIAL_DELAY
+        for attempt in range(MAX_RETRIES):
+            with grpc.insecure_channel(f"unix://{self._socket_path}") as chan:
+                stub = platformconnector_pb2_grpc.PlatformConnectorStub(chan)
+                try:
+                    stub.HealthEventOccurredV1(platformconnector_pb2.HealthEvents(events=health_events, version=1))
+                    metrics.events_sent_success.inc()
+                    return True
+                except grpc.RpcError as e:
+                    log.error(f"Failed to send health event to UDS: {e}")
+                    # Back off before the next attempt; don't sleep after the final one,
+                    # since the caller retries on the next health check cycle anyway.
+                    if attempt < MAX_RETRIES - 1:
+                        sleep(delay)
+                        delay = min(delay * 1.5, MAX_DELAY)
+        metrics.events_sent_error.inc()
+        log.warning(
+            f"Failed to send health event after {MAX_RETRIES} retries. "
+            "Events will be retried on next health check cycle."
+        )
+        return False
diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/metrics.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/metrics.py
new file mode 100644
index 000000000..b6ad95df2
--- /dev/null
+++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/platform_connector/metrics.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prometheus metric definitions for gRPC event publishing.""" + +from prometheus_client import Counter, Histogram + +health_events_publish_time_to_grpc_channel = Histogram( + "fabric_monitor_health_events_publish_time_to_grpc_channel", + "Amount of time spent publishing health events on the gRPC channel", + labelnames=["operation_name"], +) + +events_sent_success = Counter( + "fabric_monitor_events_sent_success", + "Total number of successful health event sends to platform-connector UDS", +) + +events_sent_error = Counter( + "fabric_monitor_events_sent_error", + "Total number of failed health event sends to platform-connector UDS", +) diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/__init__.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.py new file mode 100644 index 000000000..6ff02b4ee --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: health_event.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'health_event.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12health_event.proto\x12\ndatamodels\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1bgoogle/protobuf/empty.proto\"H\n\x0cHealthEvents\x12\x0f\n\x07version\x18\x01 \x01(\r\x12\'\n\x06\x65vents\x18\x02 \x03(\x0b\x32\x17.datamodels.HealthEvent\"1\n\x06\x45ntity\x12\x12\n\nentityType\x18\x01 \x01(\t\x12\x13\n\x0b\x65ntityValue\x18\x02 \x01(\t\"\xf9\x04\n\x0bHealthEvent\x12\x0f\n\x07version\x18\x01 \x01(\r\x12\r\n\x05\x61gent\x18\x02 \x01(\t\x12\x16\n\x0e\x63omponentClass\x18\x03 \x01(\t\x12\x11\n\tcheckName\x18\x04 \x01(\t\x12\x0f\n\x07isFatal\x18\x05 
\x01(\x08\x12\x11\n\tisHealthy\x18\x06 \x01(\x08\x12\x0f\n\x07message\x18\x07 \x01(\t\x12\x38\n\x11recommendedAction\x18\x08 \x01(\x0e\x32\x1d.datamodels.RecommendedAction\x12\x11\n\terrorCode\x18\t \x03(\t\x12,\n\x10\x65ntitiesImpacted\x18\n \x03(\x0b\x32\x12.datamodels.Entity\x12\x37\n\x08metadata\x18\x0b \x03(\x0b\x32%.datamodels.HealthEvent.MetadataEntry\x12\x36\n\x12generatedTimestamp\x18\x0c \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x10\n\x08nodeName\x18\r \x01(\t\x12;\n\x13quarantineOverrides\x18\x0e \x01(\x0b\x32\x1e.datamodels.BehaviourOverrides\x12\x36\n\x0e\x64rainOverrides\x18\x0f \x01(\x0b\x32\x1e.datamodels.BehaviourOverrides\x12:\n\x12processingStrategy\x18\x10 \x01(\x0e\x32\x1e.datamodels.ProcessingStrategy\x12\n\n\x02id\x18\x11 \x01(\t\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"1\n\x12\x42\x65haviourOverrides\x12\r\n\x05\x66orce\x18\x01 \x01(\x08\x12\x0c\n\x04skip\x18\x02 \x01(\x08*N\n\x12ProcessingStrategy\x12\x0f\n\x0bUNSPECIFIED\x10\x00\x12\x17\n\x13\x45XECUTE_REMEDIATION\x10\x01\x12\x0e\n\nSTORE_ONLY\x10\x02*\xa8\x01\n\x11RecommendedAction\x12\x08\n\x04NONE\x10\x00\x12\x13\n\x0f\x43OMPONENT_RESET\x10\x02\x12\x13\n\x0f\x43ONTACT_SUPPORT\x10\x05\x12\x11\n\rRUN_FIELDDIAG\x10\x06\x12\x0e\n\nRESTART_VM\x10\x0f\x12\x0e\n\nRESTART_BM\x10\x18\x12\x0e\n\nREPLACE_VM\x10\x19\x12\x0f\n\x0bRUN_DCGMEUD\x10\x1a\x12\x0b\n\x07UNKNOWN\x10\x63\x32`\n\x11PlatformConnector\x12K\n\x15HealthEventOccurredV1\x12\x18.datamodels.HealthEvents\x1a\x16.google.protobuf.Empty\"\x00\x42\x35Z3github.com/nvidia/nvsentinel/data-models/pkg/protosb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'health_event_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z3github.com/nvidia/nvsentinel/data-models/pkg/protos' + 
_globals['_HEALTHEVENT_METADATAENTRY']._loaded_options = None + _globals['_HEALTHEVENT_METADATAENTRY']._serialized_options = b'8\001' + _globals['_PROCESSINGSTRATEGY']._serialized_start=908 + _globals['_PROCESSINGSTRATEGY']._serialized_end=986 + _globals['_RECOMMENDEDACTION']._serialized_start=989 + _globals['_RECOMMENDEDACTION']._serialized_end=1157 + _globals['_HEALTHEVENTS']._serialized_start=96 + _globals['_HEALTHEVENTS']._serialized_end=168 + _globals['_ENTITY']._serialized_start=170 + _globals['_ENTITY']._serialized_end=219 + _globals['_HEALTHEVENT']._serialized_start=222 + _globals['_HEALTHEVENT']._serialized_end=855 + _globals['_HEALTHEVENT_METADATAENTRY']._serialized_start=808 + _globals['_HEALTHEVENT_METADATAENTRY']._serialized_end=855 + _globals['_BEHAVIOUROVERRIDES']._serialized_start=857 + _globals['_BEHAVIOUROVERRIDES']._serialized_end=906 + _globals['_PLATFORMCONNECTOR']._serialized_start=1159 + _globals['_PLATFORMCONNECTOR']._serialized_end=1255 +# @@protoc_insertion_point(module_scope) diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.pyi b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.pyi new file mode 100644 index 000000000..fa1695c5e --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2.pyi @@ -0,0 +1,111 @@ +import datetime + +from google.protobuf import timestamp_pb2 as _timestamp_pb2 +from google.protobuf import empty_pb2 as _empty_pb2 +from google.protobuf.internal import containers as _containers +from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from collections.abc import Iterable as _Iterable, Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union + +DESCRIPTOR: _descriptor.FileDescriptor + +class ProcessingStrategy(int, 
metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + UNSPECIFIED: _ClassVar[ProcessingStrategy] + EXECUTE_REMEDIATION: _ClassVar[ProcessingStrategy] + STORE_ONLY: _ClassVar[ProcessingStrategy] + +class RecommendedAction(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + NONE: _ClassVar[RecommendedAction] + COMPONENT_RESET: _ClassVar[RecommendedAction] + CONTACT_SUPPORT: _ClassVar[RecommendedAction] + RUN_FIELDDIAG: _ClassVar[RecommendedAction] + RESTART_VM: _ClassVar[RecommendedAction] + RESTART_BM: _ClassVar[RecommendedAction] + REPLACE_VM: _ClassVar[RecommendedAction] + RUN_DCGMEUD: _ClassVar[RecommendedAction] + UNKNOWN: _ClassVar[RecommendedAction] +UNSPECIFIED: ProcessingStrategy +EXECUTE_REMEDIATION: ProcessingStrategy +STORE_ONLY: ProcessingStrategy +NONE: RecommendedAction +COMPONENT_RESET: RecommendedAction +CONTACT_SUPPORT: RecommendedAction +RUN_FIELDDIAG: RecommendedAction +RESTART_VM: RecommendedAction +RESTART_BM: RecommendedAction +REPLACE_VM: RecommendedAction +RUN_DCGMEUD: RecommendedAction +UNKNOWN: RecommendedAction + +class HealthEvents(_message.Message): + __slots__ = ("version", "events") + VERSION_FIELD_NUMBER: _ClassVar[int] + EVENTS_FIELD_NUMBER: _ClassVar[int] + version: int + events: _containers.RepeatedCompositeFieldContainer[HealthEvent] + def __init__(self, version: _Optional[int] = ..., events: _Optional[_Iterable[_Union[HealthEvent, _Mapping]]] = ...) -> None: ... + +class Entity(_message.Message): + __slots__ = ("entityType", "entityValue") + ENTITYTYPE_FIELD_NUMBER: _ClassVar[int] + ENTITYVALUE_FIELD_NUMBER: _ClassVar[int] + entityType: str + entityValue: str + def __init__(self, entityType: _Optional[str] = ..., entityValue: _Optional[str] = ...) -> None: ... 
+ +class HealthEvent(_message.Message): + __slots__ = ("version", "agent", "componentClass", "checkName", "isFatal", "isHealthy", "message", "recommendedAction", "errorCode", "entitiesImpacted", "metadata", "generatedTimestamp", "nodeName", "quarantineOverrides", "drainOverrides", "processingStrategy", "id") + class MetadataEntry(_message.Message): + __slots__ = ("key", "value") + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: str + def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ... + VERSION_FIELD_NUMBER: _ClassVar[int] + AGENT_FIELD_NUMBER: _ClassVar[int] + COMPONENTCLASS_FIELD_NUMBER: _ClassVar[int] + CHECKNAME_FIELD_NUMBER: _ClassVar[int] + ISFATAL_FIELD_NUMBER: _ClassVar[int] + ISHEALTHY_FIELD_NUMBER: _ClassVar[int] + MESSAGE_FIELD_NUMBER: _ClassVar[int] + RECOMMENDEDACTION_FIELD_NUMBER: _ClassVar[int] + ERRORCODE_FIELD_NUMBER: _ClassVar[int] + ENTITIESIMPACTED_FIELD_NUMBER: _ClassVar[int] + METADATA_FIELD_NUMBER: _ClassVar[int] + GENERATEDTIMESTAMP_FIELD_NUMBER: _ClassVar[int] + NODENAME_FIELD_NUMBER: _ClassVar[int] + QUARANTINEOVERRIDES_FIELD_NUMBER: _ClassVar[int] + DRAINOVERRIDES_FIELD_NUMBER: _ClassVar[int] + PROCESSINGSTRATEGY_FIELD_NUMBER: _ClassVar[int] + ID_FIELD_NUMBER: _ClassVar[int] + version: int + agent: str + componentClass: str + checkName: str + isFatal: bool + isHealthy: bool + message: str + recommendedAction: RecommendedAction + errorCode: _containers.RepeatedScalarFieldContainer[str] + entitiesImpacted: _containers.RepeatedCompositeFieldContainer[Entity] + metadata: _containers.ScalarMap[str, str] + generatedTimestamp: _timestamp_pb2.Timestamp + nodeName: str + quarantineOverrides: BehaviourOverrides + drainOverrides: BehaviourOverrides + processingStrategy: ProcessingStrategy + id: str + def __init__(self, version: _Optional[int] = ..., agent: _Optional[str] = ..., componentClass: _Optional[str] = ..., checkName: _Optional[str] = ..., isFatal: bool = ..., 
isHealthy: bool = ..., message: _Optional[str] = ..., recommendedAction: _Optional[_Union[RecommendedAction, str]] = ..., errorCode: _Optional[_Iterable[str]] = ..., entitiesImpacted: _Optional[_Iterable[_Union[Entity, _Mapping]]] = ..., metadata: _Optional[_Mapping[str, str]] = ..., generatedTimestamp: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ..., nodeName: _Optional[str] = ..., quarantineOverrides: _Optional[_Union[BehaviourOverrides, _Mapping]] = ..., drainOverrides: _Optional[_Union[BehaviourOverrides, _Mapping]] = ..., processingStrategy: _Optional[_Union[ProcessingStrategy, str]] = ..., id: _Optional[str] = ...) -> None: ... + +class BehaviourOverrides(_message.Message): + __slots__ = ("force", "skip") + FORCE_FIELD_NUMBER: _ClassVar[int] + SKIP_FIELD_NUMBER: _ClassVar[int] + force: bool + skip: bool + def __init__(self, force: bool = ..., skip: bool = ...) -> None: ... diff --git a/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2_grpc.py b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2_grpc.py new file mode 100644 index 000000000..b28adea79 --- /dev/null +++ b/health-monitors/fabric-manager-monitor/fabric_manager_monitor/protos/health_event_pb2_grpc.py @@ -0,0 +1,98 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc +import warnings + +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 +from . 
import health_event_pb2 as health__event__pb2 + +GRPC_GENERATED_VERSION = '1.75.1' +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f'The grpc package installed is at version {GRPC_VERSION},' + + f' but the generated code in health_event_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + ) + + +class PlatformConnectorStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.HealthEventOccurredV1 = channel.unary_unary( + '/datamodels.PlatformConnector/HealthEventOccurredV1', + request_serializer=health__event__pb2.HealthEvents.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + _registered_method=True) + + +class PlatformConnectorServicer(object): + """Missing associated documentation comment in .proto file.""" + + def HealthEventOccurredV1(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_PlatformConnectorServicer_to_server(servicer, server): + rpc_method_handlers = { + 'HealthEventOccurredV1': grpc.unary_unary_rpc_method_handler( + servicer.HealthEventOccurredV1, + request_deserializer=health__event__pb2.HealthEvents.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + } + generic_handler = 
grpc.method_handlers_generic_handler( + 'datamodels.PlatformConnector', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + server.add_registered_method_handlers('datamodels.PlatformConnector', rpc_method_handlers) + + + # This class is part of an EXPERIMENTAL API. +class PlatformConnector(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def HealthEventOccurredV1(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/datamodels.PlatformConnector/HealthEventOccurredV1', + health__event__pb2.HealthEvents.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) diff --git a/health-monitors/fabric-manager-monitor/pyproject.toml b/health-monitors/fabric-manager-monitor/pyproject.toml new file mode 100644 index 000000000..0b1c50dcd --- /dev/null +++ b/health-monitors/fabric-manager-monitor/pyproject.toml @@ -0,0 +1,70 @@ +[tool.poetry] +name = "fabric-manager-monitor" +version = "0.1.0" +description = "GPU node health monitor for Fabric Manager and infrastructure failures" +authors = ["Community Contributors"] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.10" +click = "^8.3.1" +grpcio = "^1.78.0" +prometheus-client = "^0.24.1" +protobuf = ">=6.31.1,<7.0.0" +googleapis-common-protos = ">=1.56.0" +structlog = "^25.1.0" +requests = "^2.32.2" + +[tool.poetry.group.dev.dependencies] +black = "^26.1.0" +coverage = "^7.13.4" +grpcio-tools = "^1.78.0" +pytest = "^9.0.2" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +fabric_manager_monitor = "fabric_manager_monitor.cli:cli" 
+ +[tool.black] +line-length = 120 +include = '\.pyi?$' +exclude = ''' + +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + )/ + | fabric_manager_monitor/protos + +) +''' + +[tool.coverage.report] +exclude_also = [ + "def __repr__", + "if self.debug:", + "if settings.DEBUG", + "raise AssertionError", + "raise NotImplementedError", + "if 0:", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", + "class .*\\bProtocol\\):", + "@(abc\\.)?abstractmethod", +] +omit = [ + "tests/*", + "fabric_manager_monitor/protos/*" +]
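For reviewers: a minimal standalone sketch of the state-change deduplication that `PlatformConnectorEventProcessor.health_check_completed` performs above, with the gRPC transport omitted. `FakeResult` is a hypothetical stand-in for `checkers.types.CheckResult`; the key construction mirrors `_build_cache_key` (entities sorted so the key is order-independent), and an event is emitted only on the first observation or when the `(is_fatal, is_healthy)` state flips.

```python
import dataclasses
from typing import List


@dataclasses.dataclass
class CachedEntityState:
    is_fatal: bool
    is_healthy: bool


@dataclasses.dataclass
class FakeResult:  # hypothetical stand-in for checkers.types.CheckResult
    check_name: str
    entities_impacted: List[dict]
    is_fatal: bool
    is_healthy: bool


def build_cache_key(check_name: str, entities_impacted: List[dict]) -> str:
    # Mirrors _build_cache_key: entities are sorted so the key does not
    # depend on the order in which a check reports them.
    entity_str = "|".join(
        f"{e['entityType']}:{e['entityValue']}"
        for e in sorted(entities_impacted, key=lambda e: (e["entityType"], e["entityValue"]))
    )
    return f"{check_name}|{entity_str}"


def results_to_send(results: List[FakeResult], cache: dict) -> List[FakeResult]:
    """Return only results whose (is_fatal, is_healthy) state changed since the last send.

    In the real processor the cache is updated only after a successful gRPC send;
    here we update it inline for brevity.
    """
    to_send = []
    for r in results:
        key = build_cache_key(r.check_name, r.entities_impacted)
        cached = cache.get(key)
        if cached is None or cached.is_fatal != r.is_fatal or cached.is_healthy != r.is_healthy:
            to_send.append(r)
            cache[key] = CachedEntityState(r.is_fatal, r.is_healthy)
    return to_send


cache: dict = {}
gpu0 = [{"entityType": "GPU", "entityValue": "0"}]
first = results_to_send([FakeResult("fm_service", gpu0, False, True)], cache)    # first observation: sent
repeat = results_to_send([FakeResult("fm_service", gpu0, False, True)], cache)   # unchanged state: suppressed
flip = results_to_send([FakeResult("fm_service", gpu0, True, False)], cache)     # state flip: sent
print(len(first), len(repeat), len(flip))  # → 1 0 1
```

This is why a Fabric Manager outage produces a single fatal event rather than one per reconcile loop, while the retry logic in `send_health_event_with_retries` ensures a failed send does not poison the cache (the state is retried on the next cycle).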