-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhealth.py
More file actions
196 lines (159 loc) · 7.13 KB
/
health.py
File metadata and controls
196 lines (159 loc) · 7.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import requests
import os
import time
import streamlit as st
from logger_config import setup_logger
# Module-level logger shared by every health check in this file.
# setup_logger is a project helper; it is handed the path logs/health.log
# (presumably the log file destination -- confirm in logger_config).
logger = setup_logger(__name__, os.path.join("logs", "health.log"))
def check_ollama_health(host=None, override=False):
    """
    Check if Ollama service is healthy by attempting to get model list.

    Sends a GET request to ``<host>/api/tags`` with a 10 second timeout and
    treats an HTTP 200 response as healthy, even when it lists no models.

    Args:
        host: Ollama API host. When None, the OLLAMA_HOST environment
            variable is used, falling back to "http://ollama:11434" if it is
            unset or empty. A host given without a scheme (for example
            "ollama:11434") is assumed to be plain HTTP, and surrounding
            whitespace and trailing slashes are ignored.
        override: If True, bypass actual health check and return healthy

    Returns:
        tuple: (is_healthy, message). Request and response-parsing failures
        are logged and reported as (False, "Error: ...") instead of raised.
    """
    # If override is enabled, skip the actual health check
    if override:
        logger.info("Ollama health check bypassed due to manual override")
        return True, "Manually marked as healthy"

    if host is None:
        # `or` also covers OLLAMA_HOST being set to an empty string.
        host = os.environ.get("OLLAMA_HOST") or "http://ollama:11434"

    # Normalise the host so the URL below is always well formed:
    # no doubled slash, and a scheme that requests can work with.
    host = str(host).strip().rstrip("/")
    if "://" not in host:
        host = f"http://{host}"
    url = f"{host}/api/tags"

    try:
        # Increase timeout for slower networks/systems
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            logger.error(f"Ollama returned status code {response.status_code}")
            return False, f"Error: Status code {response.status_code}"

        # `or []` also covers an explicit null "models" value.
        models = response.json().get("models") or []
        if not models:
            # Handle case where response is valid but no models are returned
            logger.warning("Ollama is running but no models are available")
            return True, "Healthy, but no models detected."

        # A model entry without a name must not turn a reachable service
        # into an "unhealthy" result, so fall back to a placeholder.
        available_models = [model.get("name", "<unnamed>") for model in models]
        logger.info(f"Ollama is healthy. Available models: {available_models}")
        return True, f"Healthy. {len(available_models)} models available."
    except requests.exceptions.Timeout:
        # Must come before ConnectionError: requests' ConnectTimeout derives
        # from both classes, so the other order would report connect
        # timeouts as plain connection failures.
        logger.error("Ollama service request timed out")
        return False, "Error: Connection timeout"
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to Ollama service on any endpoint")
        return False, "Error: Connection failed"
    except Exception as e:
        # Boundary catch-all (bad JSON, unexpected payload shape, invalid URL)
        # so callers always receive a (bool, message) tuple.
        logger.error(f"Unexpected error checking Ollama health: {str(e)}")
        return False, f"Error: {str(e)}"
def check_brightdata_connectivity():
    """
    Verify the Bright Data credentials by opening a remote browser session.

    Reads BRIGHTDATA_USER and BRIGHTDATA_PASSWORD from the environment,
    connects a Selenium remote driver to brd.superproxy.io:9515 with them and
    immediately quits the session again.

    Returns:
        tuple: (is_healthy, message)
    """
    try:
        # Selenium is imported lazily, inside the guarded region, so an
        # import failure is reported as a setup error instead of raising.
        from selenium.webdriver.chromium.remote_connection import (
            ChromiumRemoteConnection,
        )
        import selenium.webdriver as webdriver

        user = os.getenv("BRIGHTDATA_USER")
        password = os.getenv("BRIGHTDATA_PASSWORD")
        if not (user and password):
            logger.error("Missing Bright Data credentials")
            return False, "Error: Missing credentials"

        endpoint = f"https://{user}:{password}@brd.superproxy.io:9515"

        try:
            # Open a session and close it straight away: being able to
            # connect is the whole test.
            connection = ChromiumRemoteConnection(endpoint, "goog", "chrome")
            session = webdriver.Remote(connection, options=webdriver.ChromeOptions())
            session.quit()
            logger.info("Bright Data connection successful")
            return True, "Healthy. Connection successful."
        except Exception as conn_err:
            logger.error(f"Bright Data connection failed: {str(conn_err)}")
            return False, f"Error: {str(conn_err)}"
    except Exception as setup_err:
        logger.error(f"Error setting up Bright Data check: {str(setup_err)}")
        return False, f"Setup error: {str(setup_err)}"
# Markdown shown in the "Troubleshooting" expander when Ollama is unreachable.
_OLLAMA_TROUBLESHOOTING_MD = """
If you're seeing a connection error but know Ollama is running:
1. Check that the OLLAMA_HOST environment variable is set correctly
2. Ensure the Ollama container is running: `docker-compose ps`
3. Try restarting Ollama: `docker-compose restart ollama`
4. Check Ollama logs: `docker-compose logs ollama`
"""


def _set_ollama_override(enabled):
    """Store the manual Ollama override flag in Streamlit session state."""
    st.session_state.ollama_override = enabled


def add_health_status_sidebar():
    """
    Add health status information to the Streamlit sidebar.

    Renders two sections:

    * "Ollama LLM Service": the result of check_ollama_health(), or a
      "manually marked as healthy" notice while the session-state flag
      ``ollama_override`` is set. Buttons let the user set or clear it.
    * "Bright Data Service": a button that runs
      check_brightdata_connectivity() on demand and shows the result.

    The override buttons use ``on_click`` callbacks. Streamlit runs a
    callback before it re-executes the script, so the flag is already
    updated when the status is rendered. Setting the flag from the button's
    return value would only take effect one interaction later, because the
    status is drawn above the button in the same run.
    """
    with st.sidebar:
        st.header("System Health")

        # Initialize the override flag if it doesn't exist
        if "ollama_override" not in st.session_state:
            st.session_state.ollama_override = False

        st.subheader("Ollama LLM Service")

        # Probe the real service even while overridden so the actual status
        # is still logged on every render.
        actual_status, actual_msg = check_ollama_health(override=False)

        if st.session_state.ollama_override:
            # Show success message when overridden
            st.success("✓ Manually marked as healthy")
            st.button(
                "Reset Override (Check Actual Status)",
                on_click=_set_ollama_override,
                args=(False,),
            )
        elif actual_status:
            st.success(actual_msg)
        else:
            # Show error and troubleshooting when there's a connection issue
            st.error(actual_msg)
            with st.expander("Troubleshooting"):
                st.markdown(_OLLAMA_TROUBLESHOOTING_MD)
                # Manual override option
                st.button(
                    "Override (Mark as Healthy)",
                    on_click=_set_ollama_override,
                    args=(True,),
                )

        # Bright Data health check section: only runs when requested,
        # since it opens a real remote browser session.
        st.subheader("Bright Data Service")
        if st.button("Check Bright Data Connection"):
            with st.spinner("Testing connection..."):
                bd_status, bd_msg = check_brightdata_connectivity()
                if bd_status:
                    st.success(bd_msg)
                else:
                    st.error(bd_msg)
def run_periodic_health_checks(interval_seconds=300):
    """
    Probe Ollama in an endless loop and log the outcome of each probe.

    This function never returns, so it is meant to be run in a background
    thread.

    Args:
        interval_seconds: Pause between consecutive checks, in seconds
    """
    while True:
        logger.info("Running periodic health checks")

        # Background probes always query the real service; the manual
        # override is never applied here.
        is_up, detail = check_ollama_health(override=False)
        if not is_up:
            logger.warning(f"Ollama health check failed: {detail}")

        # Wait out the interval before the next round.
        time.sleep(interval_seconds)
# Overall application health, reported as an exit code for the Docker healthcheck
def check_app_health():
    """
    Report application health as a process-style exit code.

    Returns:
        int: 0 if healthy, 1 if unhealthy (including when the check itself
        raises an exception).
    """
    try:
        # Docker health checks always query the real service; the manual
        # override is never applied here.
        ollama_ok, _detail = check_ollama_health(override=False)
        # Add any other critical health checks here
        return 0 if ollama_ok else 1
    except Exception as exc:
        logger.error(f"Health check failed with error: {str(exc)}")
        return 1