diff --git a/pyproject.toml b/pyproject.toml
index 07e2954..339ab58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "cachetools>=5.5.0",
     "gidgethub>=5.3.0",
     "gidgetlab>=2.0.1",
+    "prometheus-client>=0.20.0",
     "pydantic>=2.10.6",
     "pydantic-settings>=2.8.1",
     "pysmee>=0.1.1",
diff --git a/src/ci_relay/metrics.py b/src/ci_relay/metrics.py
new file mode 100644
index 0000000..a79f015
--- /dev/null
+++ b/src/ci_relay/metrics.py
@@ -0,0 +1,233 @@
+"""
+Prometheus metrics for the CI relay application.
+
+This module defines all the metrics that are collected throughout the application
+to monitor webhook reception, job triggering, error rates, and GitHub workflow triggering.
+"""
+
+from prometheus_client import Counter, Histogram, Gauge, Info
+import time
+
+
+# Webhook reception metrics
+webhooks_received_total = Counter(
+    'ci_relay_webhooks_received_total',
+    'Total number of webhooks received',
+    ['source', 'event_type']  # source = github|gitlab, event_type = push|pull_request|job_hook|etc
+)
+
+webhook_processing_duration_seconds = Histogram(
+    'ci_relay_webhook_processing_duration_seconds',
+    'Time spent processing webhooks',
+    ['source', 'event_type']
+)
+
+webhook_processing_errors_total = Counter(
+    'ci_relay_webhook_processing_errors_total',
+    'Total number of webhook processing errors',
+    ['source', 'event_type', 'error_type']
+)
+
+# GitLab job triggering metrics
+gitlab_jobs_triggered_total = Counter(
+    'ci_relay_gitlab_jobs_triggered_total',
+    'Total number of GitLab pipelines/jobs triggered',
+    ['repo_name', 'trigger_source']  # trigger_source = pull_request|push|comment|rerun
+)
+
+gitlab_job_trigger_errors_total = Counter(
+    'ci_relay_gitlab_job_trigger_errors_total',
+    'Total number of GitLab job trigger errors',
+    ['repo_name', 'error_type']
+)
+
+gitlab_job_trigger_duration_seconds = Histogram(
+    'ci_relay_gitlab_job_trigger_duration_seconds',
+    'Time spent triggering GitLab jobs',
+    ['repo_name']
+)
+
+# GitHub workflow triggering metrics (workflows triggered back to GitHub)
+github_workflows_triggered_total = Counter(
+    'ci_relay_github_workflows_triggered_total',
+    'Total number of GitHub workflows triggered',
+    ['repo_name', 'job_status']  # job_status = success|failed|canceled|etc
+)
+
+github_workflow_trigger_errors_total = Counter(
+    'ci_relay_github_workflow_trigger_errors_total',
+    'Total number of GitHub workflow trigger errors',
+    ['repo_name', 'error_type']
+)
+
+github_workflow_trigger_duration_seconds = Histogram(
+    'ci_relay_github_workflow_trigger_duration_seconds',
+    'Time spent triggering GitHub workflows',
+    ['repo_name']
+)
+
+# GitHub status update metrics
+github_status_updates_total = Counter(
+    'ci_relay_github_status_updates_total',
+    'Total number of GitHub status updates posted',
+    ['repo_name', 'status', 'conclusion']  # status = in_progress|completed, conclusion = success|failure|neutral|cancelled
+)
+
+github_status_update_errors_total = Counter(
+    'ci_relay_github_status_update_errors_total',
+    'Total number of GitHub status update errors',
+    ['repo_name', 'error_type']
+)
+
+# GitLab API interaction metrics
+gitlab_api_calls_total = Counter(
+    'ci_relay_gitlab_api_calls_total',
+    'Total number of GitLab API calls',
+    ['endpoint', 'method', 'status_code']
+)
+
+gitlab_api_call_duration_seconds = Histogram(
+    'ci_relay_gitlab_api_call_duration_seconds',
+    'Duration of GitLab API calls',
+    ['endpoint', 'method']
+)
+
+# GitHub API interaction metrics
+github_api_calls_total = Counter(
+    'ci_relay_github_api_calls_total',
+    'Total number of GitHub API calls',
+    ['endpoint', 'method', 'status_code']
+)
+
+github_api_call_duration_seconds = Histogram(
+    'ci_relay_github_api_call_duration_seconds',
+    'Duration of GitHub API calls',
+    ['endpoint', 'method']
+)
+
+# Application-level metrics
+app_errors_total = Counter(
+    'ci_relay_app_errors_total',
+    'Total number of application errors',
+    ['error_type', 'component']  # component = github_router|gitlab_router|web|etc
+)
+
+active_installations = Gauge(
+    'ci_relay_active_installations',
+    'Number of active GitHub app installations'
+)
+
+# Health check metrics
+health_check_status = Gauge(
+    'ci_relay_health_check_status',
+    'Health check status (1 = healthy, 0 = unhealthy)',
+    ['service']  # service = github|gitlab|overall
+)
+
+health_check_duration_seconds = Histogram(
+    'ci_relay_health_check_duration_seconds',
+    'Duration of health checks',
+    ['service']
+)
+
+# Application info
+app_info = Info(
+    'ci_relay_app',
+    'CI Relay application information'
+)
+
+
+class MetricsContext:
+    """Context manager that times a block and counts exceptions raised in it.
+
+    Args:
+        histogram: Prometheus histogram that receives the elapsed seconds.
+            Pass the parent metric together with ``labels``, or an
+            already-labelled child with ``labels`` left empty.
+        error_counter: Counter incremented when the block raises, or ``None``
+            to skip error counting. Its last label must be ``error_type``.
+        labels: Label values applied to ``histogram``.
+        error_labels: Label values applied to ``error_counter``; the exception
+            class name is appended as the final value.
+    """
+
+    def __init__(self, histogram, error_counter, labels=None, error_labels=None):
+        self.histogram = histogram
+        self.error_counter = error_counter
+        self.labels = labels or []
+        self.error_labels = error_labels or []
+        self.start_time = None
+
+    def __enter__(self):
+        # perf_counter() is monotonic, so wall-clock adjustments cannot skew durations.
+        self.start_time = time.perf_counter()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.start_time is not None:
+            duration = time.perf_counter() - self.start_time
+            # prometheus_client raises ValueError when .labels() is called without
+            # values or chained onto an already-labelled child.
+            timer = self.histogram.labels(*self.labels) if self.labels else self.histogram
+            timer.observe(duration)
+
+        if exc_type is not None and self.error_counter is not None:
+            self.error_counter.labels(*self.error_labels, exc_type.__name__).inc()
+
+        return False  # Don't suppress exceptions
+
+
+def track_webhook_processing(source: str, event_type: str):
+    """Context manager for tracking webhook processing metrics."""
+    return MetricsContext(
+        webhook_processing_duration_seconds,
+        webhook_processing_errors_total,
+        labels=[source, event_type],
+        error_labels=[source, event_type]
+    )
+
+
+def track_gitlab_job_trigger(repo_name: str):
+    """Context manager for tracking GitLab job trigger metrics."""
+    return MetricsContext(
+        gitlab_job_trigger_duration_seconds,
+        gitlab_job_trigger_errors_total,
+        labels=[repo_name],
+        error_labels=[repo_name]
+    )
+
+
+def track_github_workflow_trigger(repo_name: str):
+    """Context manager for tracking GitHub workflow trigger metrics."""
+    return MetricsContext(
+        github_workflow_trigger_duration_seconds,
+        github_workflow_trigger_errors_total,
+        labels=[repo_name],
+        error_labels=[repo_name]
+    )
+
+
+def track_gitlab_api_call(endpoint: str, method: str):
+    """Context manager for tracking GitLab API call duration.
+
+    Only the duration is recorded here; status codes are expected to be
+    recorded separately (e.g. via ``gitlab_api_calls_total``).
+    """
+    return MetricsContext(
+        gitlab_api_call_duration_seconds,
+        None,  # a Counter created per call would raise "Duplicated timeseries"
+        labels=[endpoint, method]
+    )
+
+
+def track_github_api_call(endpoint: str, method: str):
+    """Context manager for tracking GitHub API call duration.
+
+    Only the duration is recorded here; status codes are expected to be
+    recorded separately (e.g. via ``github_api_calls_total``).
+    """
+    return MetricsContext(
+        github_api_call_duration_seconds,
+        None,  # a Counter created per call would raise "Duplicated timeseries"
+        labels=[endpoint, method]
+    )
diff --git a/src/ci_relay/web.py b/src/ci_relay/web.py
index 2b8e6d9..1a7b6af 100644
--- a/src/ci_relay/web.py
+++ b/src/ci_relay/web.py
@@ -9,12 +9,14 @@
 import cachetools
 from aiolimiter import AsyncLimiter
 import tenacity
+from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
 
 from ci_relay.config import Config
 from ci_relay.github.router import router as github_router
 from ci_relay.gitlab.router import router as gitlab_router
 import ci_relay.github.utils as github_utils
 from ci_relay.exceptions import UnrecoverableError
+from ci_relay import metrics
 
 
 def add_task(app: Sanic, task):
@@ -27,24 +29,29 @@ def add_task(app: Sanic, task):
     retry=tenacity.retry_if_not_exception_type(UnrecoverableError),
 )
 async def handle_gitlab_webhook(request, *, app: Sanic):
-    async with aiohttp.ClientSession(loop=app.loop) as session:
-        event = GitLabEvent.from_http(
-            request.headers, request.body, secret=app.config.GITLAB_WEBHOOK_SECRET
-        )
-
-        gl = gidgetlab.aiohttp.GitLabAPI(
-            session,
-            requester="acts",
-            access_token=app.config.GITLAB_ACCESS_TOKEN,
-            url=app.config.GITLAB_API_URL,
-        )
-
-        logger.debug("Dispatching event %s", event.event)
-        try:
-            await gitlab_router.dispatch(event, session=session, app=app, gl=gl)
-        except BaseException as e:
-            logger.error("GitLab dispatch caught exception: %s", e, exc_info=e)
-            raise
+    # Record webhook reception
+    metrics.webhooks_received_total.labels('gitlab', request.headers.get('X-Gitlab-Event', 'unknown')).inc()
+
+    with metrics.track_webhook_processing('gitlab', request.headers.get('X-Gitlab-Event', 'unknown')):
+        async with aiohttp.ClientSession(loop=app.loop) as session:
+            event = GitLabEvent.from_http(
+                request.headers, request.body, secret=app.config.GITLAB_WEBHOOK_SECRET
+            )
+
+            gl = gidgetlab.aiohttp.GitLabAPI(
+                session,
+                requester="acts",
+                access_token=app.config.GITLAB_ACCESS_TOKEN,
+                url=app.config.GITLAB_API_URL,
+            )
+
+            logger.debug("Dispatching event %s", event.event)
+            try:
+                await gitlab_router.dispatch(event, session=session, app=app, gl=gl)
+            except BaseException as e:
+                logger.error("GitLab dispatch caught exception: %s", e, exc_info=e)
+                metrics.app_errors_total.labels(e.__class__.__name__, 'gitlab_router').inc()
+                raise
 
 
 @tenacity.retry(
@@ -53,32 +60,38 @@ async def handle_gitlab_webhook(request, *, app: Sanic):
     retry=tenacity.retry_if_not_exception_type(UnrecoverableError),
 )
 async def handle_github_webhook(request, *, app: Sanic):
-    async with aiohttp.ClientSession(loop=app.loop) as session:
-        event = GitHubEvent.from_http(
-            request.headers, request.body, secret=app.config.WEBHOOK_SECRET
-        )
-
-        assert "installation" in event.data
-        installation_id = event.data["installation"]["id"]
-        logger.debug("Installation id: %s", installation_id)
-
-        gh = await github_utils.client_for_installation(
-            app=app, installation_id=installation_id, session=session
-        )
-
-        gl = gidgetlab.aiohttp.GitLabAPI(
-            session,
-            requester="acts",
-            access_token=app.config.GITLAB_ACCESS_TOKEN,
-            url=app.config.GITLAB_API_URL,
-        )
-
-        logger.debug("Dispatching event %s", event.event)
-        try:
-            await github_router.dispatch(event, session=session, gh=gh, app=app, gl=gl)
-        except BaseException as e:
-            logger.error("GitHub dispatch caught exception: %s", e, exc_info=e)
-            raise
+    # Record webhook reception
+    event_type = request.headers.get('X-GitHub-Event', 'unknown')
+    metrics.webhooks_received_total.labels('github', event_type).inc()
+
+    with metrics.track_webhook_processing('github', event_type):
+        async with aiohttp.ClientSession(loop=app.loop) as session:
+            event = GitHubEvent.from_http(
+                request.headers, request.body, secret=app.config.WEBHOOK_SECRET
+            )
+
+            assert "installation" in event.data
+            installation_id = event.data["installation"]["id"]
+            logger.debug("Installation id: %s", installation_id)
+
+            gh = await github_utils.client_for_installation(
+                app=app, installation_id=installation_id, session=session
+            )
+
+            gl = gidgetlab.aiohttp.GitLabAPI(
+                session,
+                requester="acts",
+                access_token=app.config.GITLAB_ACCESS_TOKEN,
+                url=app.config.GITLAB_API_URL,
+            )
+
+            logger.debug("Dispatching event %s", event.event)
+            try:
+                await github_router.dispatch(event, session=session, gh=gh, app=app, gl=gl)
+            except BaseException as e:
+                logger.error("GitHub dispatch caught exception: %s", e, exc_info=e)
+                metrics.app_errors_total.labels(e.__class__.__name__, 'github_router').inc()
+                raise
 
 
 def create_app(*, config: Config | None = None):
@@ -111,6 +124,14 @@ async def index(request):
         logger.debug("status check")
         return response.text("ok")
 
+    @app.route("/metrics")
+    async def metrics_endpoint(request):
+        """Prometheus metrics endpoint; raw() is used because generate_latest() returns bytes."""
+        return response.raw(
+            generate_latest(),
+            content_type=CONTENT_TYPE_LATEST
+        )
+
     @app.route("/health")
     async def health(request):
         if not limiter.has_capacity():
diff --git a/uv.lock b/uv.lock
index 1675162..8ed9378 100644
--- a/uv.lock
+++ b/uv.lock
@@ -277,6 +277,7 @@ dependencies = [
     { name = "cachetools" },
     { name = "gidgethub" },
     { name = "gidgetlab" },
+    { name = "prometheus-client" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pysmee" },
@@ -305,6 +306,7 @@ requires-dist = [
     { name = "cachetools", specifier = ">=5.5.0" },
     { name = "gidgethub", specifier = ">=5.3.0" },
     { name = "gidgetlab", specifier = ">=2.0.1" },
+    { name = "prometheus-client", specifier = ">=0.20.0" },
     { name = "pydantic", specifier = ">=2.10.6" },
     { name = "pydantic-settings", specifier = ">=2.8.1" },
     { name = "pysmee", specifier = ">=0.1.1" },
@@ -743,6 +745,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 },
 ]
 
+[[package]]
+name = "prometheus-client"
+version = "0.22.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5e/cf/40dde0a2be27cc1eb41e333d1a674a74ce8b8b0457269cc640fd42b07cf7/prometheus_client-0.22.1.tar.gz", hash = "sha256:190f1331e783cf21eb60bca559354e0a4d4378facecf78f5428c39b675d20d28", size = 69746 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl", hash = "sha256:cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094", size = 58694 },
+]
+
 [[package]]
 name = "propcache"
 version = "0.3.1"