Skip to content

Commit 0de510d

Browse files
yrobla and taskbot authored
fix(vmcp): /status uses live health monitor state (#4135)
The /status endpoint was reading backend health from the static registry snapshot (set at discovery time and never updated), while /api/backends/health correctly read from the live health monitor. This caused the two endpoints to report inconsistent state for the same backend (issue #4103). Fix buildStatusResponse() to call GetAllBackendStates() and prefer the monitor's runtime health over the registry's initial value. When health monitoring is disabled the registry value is used as before, preserving backwards compatibility. Add unit tests that assert /status reflects live monitor state for both healthy and unhealthy transitions, and an e2e It block in the circuit breaker lifecycle suite that compares both endpoints side-by-side once the unstable backend's circuit breaker opens. Closes: #4103 Co-authored-by: taskbot <taskbot@users.noreply.github.com>
1 parent a0f2504 commit 0de510d

4 files changed

Lines changed: 401 additions & 60 deletions

File tree

pkg/vmcp/server/status.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/stacklok/toolhive/pkg/versions"
1313
"github.com/stacklok/toolhive/pkg/vmcp"
1414
authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
15+
"github.com/stacklok/toolhive/pkg/vmcp/health"
1516
)
1617

1718
// StatusResponse represents the vMCP server's operational status.
@@ -63,17 +64,36 @@ func (s *Server) buildStatusResponse(ctx context.Context) StatusResponse {
6364
backends := s.backendRegistry.List(ctx)
6465
backendStatuses := make([]BackendStatus, 0, len(backends))
6566

67+
// Get live health states from the health monitor (if enabled) so that
68+
// /status reflects the same runtime health as /api/backends/health.
69+
// Skip the call — and the map allocation — entirely when monitoring is
70+
// disabled. Reading from a nil map is safe in Go and returns zero values.
71+
s.healthMonitorMu.RLock()
72+
healthMon := s.healthMonitor
73+
s.healthMonitorMu.RUnlock()
74+
75+
var liveHealthStates map[string]*health.State
76+
if healthMon != nil {
77+
liveHealthStates = healthMon.GetAllBackendStates()
78+
}
79+
6680
hasHealthyBackend := false
6781
for _, backend := range backends {
82+
// Prefer the live health monitor state over the static registry value.
83+
healthStatus := backend.HealthStatus
84+
if liveState, ok := liveHealthStates[backend.ID]; ok {
85+
healthStatus = liveState.Status
86+
}
87+
6888
status := BackendStatus{
6989
Name: backend.Name,
70-
Health: string(backend.HealthStatus),
90+
Health: string(healthStatus),
7191
Transport: backend.TransportType,
7292
AuthType: getAuthType(backend.AuthConfig),
7393
}
7494
backendStatuses = append(backendStatuses, status)
7595

76-
if backend.HealthStatus == vmcp.BackendHealthy {
96+
if healthStatus == vmcp.BackendHealthy {
7797
hasHealthyBackend = true
7898
}
7999
}

pkg/vmcp/server/status_test.go

Lines changed: 227 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package server_test
66
import (
77
"context"
88
"encoding/json"
9+
"errors"
910
"net/http"
1011
"testing"
1112
"time"
@@ -19,6 +20,7 @@ import (
1920
"github.com/stacklok/toolhive/pkg/vmcp/aggregator"
2021
authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
2122
discoveryMocks "github.com/stacklok/toolhive/pkg/vmcp/discovery/mocks"
23+
"github.com/stacklok/toolhive/pkg/vmcp/health"
2224
"github.com/stacklok/toolhive/pkg/vmcp/mocks"
2325
"github.com/stacklok/toolhive/pkg/vmcp/router"
2426
"github.com/stacklok/toolhive/pkg/vmcp/server"
@@ -40,66 +42,15 @@ type BackendStatus struct {
4042
AuthType string `json:"auth_type,omitempty"`
4143
}
4244

// createTestServerWithBackends creates a test server instance with custom backends
// and no health monitoring. It is a convenience wrapper around
// createTestServerWithHealthMonitor: passing a zero-value MonitorConfig means no
// health monitor is started, so /status reads health from the registry snapshot.
//
// backends seeds the immutable backend registry; groupRef is copied into the
// server config (empty string is fine for tests that don't need it).
func createTestServerWithBackends(t *testing.T, backends []vmcp.Backend, groupRef string) *server.Server {
	t.Helper()
	return createTestServerWithHealthMonitor(t, backends,
		health.MonitorConfig{}, // zero value → no health monitor started
		nil,                    // no mock expectations needed
		groupRef,
	)
}
10455

10556
func TestStatusEndpoint_HTTPBehavior(t *testing.T) {
@@ -242,3 +193,221 @@ func TestStatusEndpoint_BackendFieldMapping(t *testing.T) {
242193
assert.Equal(t, "streamable-http", b.Transport)
243194
assert.Equal(t, authtypes.StrategyTypeTokenExchange, b.AuthType)
244195
}
196+
197+
// createTestServerWithHealthMonitor creates a test server with health monitoring enabled.
// setupMock configures mock expectations on the backend client (e.g. ListCapabilities
// responses for health checks); pass nil when no expectations are needed.
// monitorCfg selects the health-check behavior; a zero value disables monitoring entirely.
// groupRef is set in the server config (empty string is fine for tests that don't need it).
//
// The returned server is already started and ready; shutdown is wired up via t.Cleanup.
func createTestServerWithHealthMonitor(
	t *testing.T,
	backends []vmcp.Backend,
	monitorCfg health.MonitorConfig,
	setupMock func(mockClient *mocks.MockBackendClient),
	groupRef string,
) *server.Server {
	t.Helper()

	// ctrl.Finish must run last so that all mock calls have already stopped.
	// t.Cleanup is LIFO, so register it first — it will execute third.
	ctrl := gomock.NewController(t)
	t.Cleanup(ctrl.Finish)

	mockBackendClient := mocks.NewMockBackendClient(ctrl)
	mockDiscoveryMgr := discoveryMocks.NewMockManager(ctrl)
	rt := router.NewDefaultRouter()

	// Register caller expectations before the server starts issuing health checks.
	if setupMock != nil {
		setupMock(mockBackendClient)
	}

	port := networking.FindAvailable()
	require.NotZero(t, port, "Failed to find available port")

	// Discovery returns empty capabilities; these tests only exercise /status,
	// not routing, so no tools/resources/prompts are needed.
	mockDiscoveryMgr.EXPECT().
		Discover(gomock.Any(), gomock.Any()).
		Return(&aggregator.AggregatedCapabilities{
			Tools:     []vmcp.Tool{},
			Resources: []vmcp.Resource{},
			Prompts:   []vmcp.Prompt{},
			RoutingTable: &vmcp.RoutingTable{
				Tools:     make(map[string]*vmcp.BackendTarget),
				Resources: make(map[string]*vmcp.BackendTarget),
				Prompts:   make(map[string]*vmcp.BackendTarget),
			},
			Metadata: &aggregator.AggregationMetadata{},
		}, nil).
		AnyTimes()
	mockDiscoveryMgr.EXPECT().Stop().AnyTimes()

	ctx, cancel := context.WithCancel(t.Context())

	// A zero-value config means "monitoring disabled": leave the pointer nil so
	// the server never constructs a monitor.
	var healthMonCfg *health.MonitorConfig
	if (monitorCfg != health.MonitorConfig{}) {
		healthMonCfg = &monitorCfg
	}
	srv, err := server.New(ctx, &server.Config{
		Name:                "test-vmcp",
		Version:             "1.0.0",
		Host:                "127.0.0.1",
		Port:                port,
		GroupRef:            groupRef,
		HealthMonitorConfig: healthMonCfg,
		SessionFactory:      newNoopMockFactory(t),
	}, rt, mockBackendClient, mockDiscoveryMgr, vmcp.NewImmutableRegistry(backends), nil)
	require.NoError(t, err)

	type startResult struct {
		err error
	}
	done := make(chan startResult, 1)
	go func() {
		done <- startResult{err: srv.Start(ctx)}
	}()

	// Cleanup order (LIFO):
	// 1. cancel() — stops the server and health monitor goroutines
	// 2. <-done — waits for srv.Start (and all goroutines) to return
	// 3. ctrl.Finish — validates mock expectations after all calls have stopped
	t.Cleanup(func() {
		result := <-done
		// context.Canceled is the expected shutdown path; anything else is a failure.
		if result.err != nil && !errors.Is(result.err, context.Canceled) {
			t.Errorf("server exited with unexpected error: %v", result.err)
		}
	})
	t.Cleanup(cancel)

	select {
	case <-srv.Ready():
	case result := <-done:
		t.Fatalf("server exited before becoming ready: %v", result.err)
	case <-time.After(5 * time.Second):
		t.Fatalf("Server did not become ready within 5s (address: %s)", srv.Address())
	}

	return srv
}
288+
289+
// queryStatus fetches and decodes /status from the given server.
290+
func queryStatus(t *testing.T, srv *server.Server) StatusResponse {
291+
t.Helper()
292+
resp, err := http.Get("http://" + srv.Address() + "/status")
293+
require.NoError(t, err)
294+
defer resp.Body.Close()
295+
require.Equal(t, http.StatusOK, resp.StatusCode, "unexpected HTTP status from /status")
296+
var status StatusResponse
297+
require.NoError(t, json.NewDecoder(resp.Body).Decode(&status))
298+
return status
299+
}
300+
301+
// TestStatusEndpoint_ReflectsLiveHealthMonitor_Unhealthy verifies the fix for
302+
// https://github.com/stacklok/toolhive/issues/4103: /status must report the
303+
// same health state as the live health monitor, not the stale registry value.
304+
//
305+
// Without the fix, a backend registered as "healthy" would always appear healthy
306+
// in /status even after the health monitor had marked it unhealthy.
307+
func TestStatusEndpoint_ReflectsLiveHealthMonitor_Unhealthy(t *testing.T) {
308+
t.Parallel()
309+
310+
// Backend starts as "healthy" in the registry – this is the stale value
311+
// that the old code would always return from /status.
312+
backends := []vmcp.Backend{{
313+
ID: "b1",
314+
Name: "test-backend",
315+
TransportType: "streamable-http",
316+
HealthStatus: vmcp.BackendHealthy,
317+
}}
318+
319+
monitorCfg := health.MonitorConfig{
320+
CheckInterval: 5 * time.Millisecond,
321+
UnhealthyThreshold: 1, // one failure → unhealthy immediately
322+
Timeout: time.Second,
323+
}
324+
325+
srv := createTestServerWithHealthMonitor(t, backends, monitorCfg, func(mockClient *mocks.MockBackendClient) {
326+
// All health checks fail – the monitor should mark the backend unhealthy.
327+
mockClient.EXPECT().
328+
ListCapabilities(gomock.Any(), gomock.Any()).
329+
Return(nil, errors.New("connection refused")).
330+
AnyTimes()
331+
}, "")
332+
333+
// Poll /status until the live monitor state propagates. If the fix is
334+
// absent the backend stays "healthy" forever and the assertion times out.
335+
require.Eventually(t, func() bool {
336+
status := queryStatus(t, srv)
337+
if len(status.Backends) == 0 {
338+
return false
339+
}
340+
return status.Backends[0].Health == string(vmcp.BackendUnhealthy)
341+
}, 5*time.Second, 20*time.Millisecond, "expected /status to report backend as unhealthy")
342+
343+
// The overall server health flag must also be false when no backend is healthy.
344+
status := queryStatus(t, srv)
345+
assert.False(t, status.Healthy)
346+
require.Len(t, status.Backends, 1)
347+
assert.Equal(t, string(vmcp.BackendUnhealthy), status.Backends[0].Health)
348+
}
349+
350+
// TestStatusEndpoint_ReflectsLiveHealthMonitor_Healthy confirms that /status
351+
// correctly reports a backend as healthy when the health monitor records success.
352+
func TestStatusEndpoint_ReflectsLiveHealthMonitor_Healthy(t *testing.T) {
353+
t.Parallel()
354+
355+
backends := []vmcp.Backend{{
356+
ID: "b1",
357+
Name: "test-backend",
358+
TransportType: "streamable-http",
359+
HealthStatus: vmcp.BackendUnknown, // registry starts with unknown
360+
}}
361+
362+
monitorCfg := health.MonitorConfig{
363+
CheckInterval: 5 * time.Millisecond,
364+
UnhealthyThreshold: 3,
365+
Timeout: time.Second,
366+
}
367+
368+
srv := createTestServerWithHealthMonitor(t, backends, monitorCfg, func(mockClient *mocks.MockBackendClient) {
369+
// Health checks succeed – the monitor should mark the backend healthy.
370+
mockClient.EXPECT().
371+
ListCapabilities(gomock.Any(), gomock.Any()).
372+
Return(&vmcp.CapabilityList{}, nil).
373+
AnyTimes()
374+
}, "")
375+
376+
// Poll until the healthy state from the monitor appears in /status.
377+
require.Eventually(t, func() bool {
378+
status := queryStatus(t, srv)
379+
if len(status.Backends) == 0 {
380+
return false
381+
}
382+
return status.Backends[0].Health == string(vmcp.BackendHealthy)
383+
}, 5*time.Second, 20*time.Millisecond, "expected /status to report backend as healthy")
384+
385+
status := queryStatus(t, srv)
386+
assert.True(t, status.Healthy)
387+
require.Len(t, status.Backends, 1)
388+
assert.Equal(t, string(vmcp.BackendHealthy), status.Backends[0].Health)
389+
}
390+
391+
// TestStatusEndpoint_FallsBackToRegistry_WhenMonitorDisabled confirms the
392+
// no-monitor path is unchanged: health status comes from the registry.
393+
func TestStatusEndpoint_FallsBackToRegistry_WhenMonitorDisabled(t *testing.T) {
394+
t.Parallel()
395+
396+
backends := []vmcp.Backend{
397+
{ID: "b1", Name: "healthy-backend", HealthStatus: vmcp.BackendHealthy},
398+
{ID: "b2", Name: "unhealthy-backend", HealthStatus: vmcp.BackendUnhealthy},
399+
}
400+
401+
// createTestServerWithBackends does NOT configure a health monitor.
402+
srv := createTestServerWithBackends(t, backends, "")
403+
404+
status := queryStatus(t, srv)
405+
406+
require.Len(t, status.Backends, 2)
407+
healthByName := make(map[string]string)
408+
for _, b := range status.Backends {
409+
healthByName[b.Name] = b.Health
410+
}
411+
assert.Equal(t, string(vmcp.BackendHealthy), healthByName["healthy-backend"])
412+
assert.Equal(t, string(vmcp.BackendUnhealthy), healthByName["unhealthy-backend"])
413+
}

0 commit comments

Comments
 (0)