diff --git a/.env.example b/.env.example index 4820cb155..d30118b9b 100644 --- a/.env.example +++ b/.env.example @@ -63,6 +63,42 @@ DB_MAX_RETRIES=5 # Retry interval in milliseconds (default: 2000) DB_RETRY_INTERVAL_MS=2000 +# Database Session Configuration +# Autocommit mode for database sessions (default: false) +# +# RECOMMENDED: Keep DB_AUTOCOMMIT=false for ALL databases (SQLite, PostgreSQL, MySQL/MariaDB) +# +# Why autocommit=false (default): +# - Ensures multi-step operations are atomic (all succeed or all fail) +# - Prevents data corruption when errors occur mid-transaction +# - Required for ACID compliance in complex operations +# +# Database-specific behavior with autocommit=false: +# - PostgreSQL: Read-only queries log harmless "ROLLBACK" entries (cosmetic only, no performance cost) +# - MySQL/MariaDB: Similar behavior - implicit transactions for each operation +# - SQLite: No log noise, works perfectly with WAL mode for concurrency +# +# Why you should NOT set autocommit=true: +# - BREAKS transaction atomicity (e.g., team creation can leave orphaned records) +# - 127 explicit db.commit() calls in codebase rely on transaction control +# - Data corruption risk if multi-step operations fail partway through +# +# Example of what breaks with autocommit=true: +# Step 1: Create team (commits immediately) +# Step 2: Add owner (if this fails, team exists without owner - corruption!) +# +# NOTE: Health check endpoints (/health, /ready) use a dedicated connection with +# autocommit=True to eliminate PostgreSQL log noise, while all other operations +# maintain autocommit=False for data integrity. Best of both worlds! +# +# See issue #1108 for details on PostgreSQL rollback behavior. +DB_AUTOCOMMIT=false + +# Auto-flush changes to database before queries (default: false) +# - false: Manual flush control (default) +# - true: Automatically flush pending changes before queries +DB_AUTOFLUSH=false + # Cache Backend Configuration # Options: database (default), memory (in-process), redis (distributed) # - database: Uses SQLite/PostgreSQL for persistence (good for single-node) diff --git a/README.md b/README.md index f8e9ef032..f6cad3c8a 100644 --- a/README.md +++ b/README.md @@ -1544,6 +1544,10 @@ mcpgateway | `DB_POOL_RECYCLE`. | Recycle connections (secs) | `3600` | int > 0 | | `DB_MAX_RETRIES` . | Max Retry Attempts | `3` | int > 0 | | `DB_RETRY_INTERVAL_MS` | Retry Interval (ms) | `2000` | int > 0 | +| `DB_AUTOCOMMIT` | Auto-commit mode (⚠️ breaks atomicity) | `false` | `true`, `false` | +| `DB_AUTOFLUSH` | Auto-flush changes before queries | `false` | `true`, `false` | + +> ⚠️ **Important**: Keep `DB_AUTOCOMMIT=false` (default) for data integrity. Setting to `true` breaks transaction atomicity and risks data corruption in multi-step operations. Health check endpoints (`/health`, `/ready`) use a dedicated autocommit connection to eliminate PostgreSQL log noise while maintaining `autocommit=false` for all other operations. See [issue #1108](https://github.com/IBM/mcp-context-forge/issues/1108) for details. ### Cache Backend diff --git a/mcpgateway/config.py b/mcpgateway/config.py index b5546edb7..467aa53d9 100644 --- a/mcpgateway/config.py +++ b/mcpgateway/config.py @@ -812,6 +812,8 @@ def parse_issuers(cls, v): db_pool_recycle: int = 3600 db_max_retries: int = 3 db_retry_interval_ms: int = 2000 + db_autocommit: bool = False # RECOMMENDED: False for atomicity. True breaks multi-step transactions. + db_autoflush: bool = False # Auto-flush changes to database before queries # Cache cache_type: Literal["redis", "memory", "none", "database"] = "database" # memory or redis or database diff --git a/mcpgateway/db.py b/mcpgateway/db.py index 7799c48d4..ec11ee97b 100644 --- a/mcpgateway/db.py +++ b/mcpgateway/db.py @@ -29,7 +29,7 @@ # Third-Party import jsonschema -from sqlalchemy import BigInteger, Boolean, Column, create_engine, DateTime, event, Float, ForeignKey, func, Index, Integer, JSON, make_url, select, String, Table, Text, UniqueConstraint +from sqlalchemy import BigInteger, Boolean, Column, create_engine, DateTime, event, Float, ForeignKey, func, Index, Integer, JSON, make_url, select, String, Table, text, Text, UniqueConstraint from sqlalchemy.event import listen from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.hybrid import hybrid_property @@ -172,7 +172,15 @@ def set_sqlite_pragma(dbapi_conn, _connection_record): # Session factory -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +# Use configurable autocommit and autoflush settings +# +# IMPORTANT: Keep DB_AUTOCOMMIT=false (default) for data integrity +# - Ensures multi-step operations are atomic (all succeed or all fail) +# - Read-only operations (like health checks) will log "ROLLBACK" in PostgreSQL - this is harmless +# - Setting to true eliminates rollback noise but BREAKS atomicity and risks data corruption +# +# See issue #1108 for details on PostgreSQL rollback behavior +SessionLocal = sessionmaker(autocommit=settings.db_autocommit, autoflush=settings.db_autoflush, bind=engine) def refresh_slugs_on_startup(): @@ -3263,6 +3271,34 @@ def get_db() -> Generator[Session, Any, None]: db.close() +def check_db_health() -> bool: + """ + Perform a lightweight database health check without generating transaction logs. + + This function uses a raw connection with AUTOCOMMIT isolation level to avoid + generating unnecessary BEGIN/ROLLBACK transaction logs in PostgreSQL (issue #1108). + + Unlike get_db() which uses autocommit=False for data integrity, this health check + uses autocommit=True to eliminate log noise while still verifying connectivity. + + Returns: + bool: True if database is healthy, False otherwise. + + Examples: + >>> check_db_health() # doctest: +SKIP + True + """ + try: + # Use raw connection with AUTOCOMMIT to avoid transaction log noise + # This is safe for read-only health checks and eliminates PostgreSQL ROLLBACK logs + with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as connection: + connection.execute(text("SELECT 1")) + return True + except Exception as e: + logger.error(f"Database health check failed: {e}") + return False + + # Create all tables def init_db(): """ diff --git a/mcpgateway/main.py b/mcpgateway/main.py index 9b970e55e..31f51e51b 100644 --- a/mcpgateway/main.py +++ b/mcpgateway/main.py @@ -46,7 +46,6 @@ from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates from pydantic import ValidationError -from sqlalchemy import text from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session from starlette.middleware.base import BaseHTTPMiddleware @@ -61,7 +60,7 @@ from mcpgateway.bootstrap_db import main as bootstrap_db from mcpgateway.cache import ResourceCache, SessionRegistry from mcpgateway.config import jsonpath_modifier, settings -from mcpgateway.db import refresh_slugs_on_startup, SessionLocal +from mcpgateway.db import check_db_health, refresh_slugs_on_startup, SessionLocal from mcpgateway.db import Tool as DbTool from mcpgateway.handlers.sampling import SamplingHandler from mcpgateway.middleware.rbac import get_current_user_with_permissions, require_permission @@ -3775,45 +3774,41 @@ async def reset_metrics(entity: Optional[str] = None, entity_id: Optional[int] = # Healthcheck # #################### @app.get("/health") -async def healthcheck(db: Session = Depends(get_db)): +async def healthcheck(): """ Perform a basic health check to verify database connectivity. - Args: - db: SQLAlchemy session dependency. + Uses a lightweight connection with AUTOCOMMIT isolation level to avoid + generating unnecessary transaction logs in PostgreSQL (issue #1108). Returns: A dictionary with the health status and optional error message. """ - try: - # Execute the query using text() for an explicit textual SQL expression. - db.execute(text("SELECT 1")) - except Exception as e: - error_message = f"Database connection error: {str(e)}" - logger.error(error_message) - return {"status": "unhealthy", "error": error_message} - return {"status": "healthy"} + # Use dedicated health check function that doesn't generate transaction log noise + is_healthy = await asyncio.to_thread(check_db_health) + + if is_healthy: + return {"status": "healthy"} + return {"status": "unhealthy", "error": "Database connection error"} @app.get("/ready") -async def readiness_check(db: Session = Depends(get_db)): +async def readiness_check(): """ Perform a readiness check to verify if the application is ready to receive traffic. - Args: - db: SQLAlchemy session dependency. + Uses a lightweight connection with AUTOCOMMIT isolation level to avoid + generating unnecessary transaction logs in PostgreSQL (issue #1108). Returns: JSONResponse with status 200 if ready, 503 if not. """ - try: - # Run the blocking DB check in a thread to avoid blocking the event loop - await asyncio.to_thread(db.execute, text("SELECT 1")) + # Use dedicated health check function that doesn't generate transaction log noise + is_ready = await asyncio.to_thread(check_db_health) + + if is_ready: return JSONResponse(content={"status": "ready"}, status_code=200) - except Exception as e: - error_message = f"Readiness check failed: {str(e)}" - logger.error(error_message) - return JSONResponse(content={"status": "not ready", "error": error_message}, status_code=503) + return JSONResponse(content={"status": "not ready", "error": "Database connection error"}, status_code=503) @app.get("/health/security", tags=["health"]) diff --git a/mcpgateway/middleware/token_scoping.py b/mcpgateway/middleware/token_scoping.py index 63c5ed8e3..21c7148f7 100644 --- a/mcpgateway/middleware/token_scoping.py +++ b/mcpgateway/middleware/token_scoping.py @@ -383,7 +383,7 @@ def _check_team_membership(self, payload: dict) -> bool: finally: db.close() - def _check_resource_team_ownership(self, request_path: str, token_teams: list) -> bool: + def _check_resource_team_ownership(self, request_path: str, token_teams: list) -> bool: # pylint: disable=too-many-return-statements """ Check if the requested resource is accessible by the token. diff --git a/tests/e2e/test_main_apis.py b/tests/e2e/test_main_apis.py index b2264cd7f..4980d1aa7 100644 --- a/tests/e2e/test_main_apis.py +++ b/tests/e2e/test_main_apis.py @@ -1921,33 +1921,26 @@ async def test_empty_request_body(self, client: AsyncClient, mock_auth): assert any("Field required" in str(error) for error in errors) async def test_internal_server_error(self, client: AsyncClient, mock_auth): - """Simulate internal server error by patching a dependency.""" + """Simulate internal server error by patching the health check function.""" # First-Party - from mcpgateway.main import app, get_db + from mcpgateway import main as main_module - def failing_db(): - def _gen(): - raise Exception("Simulated DB failure") - yield + # Mock check_db_health to simulate database failure + original_check_db_health = main_module.check_db_health - return _gen() + def failing_health_check(): + return False - original_override = app.dependency_overrides.get(get_db) - app.dependency_overrides[get_db] = failing_db + main_module.check_db_health = failing_health_check try: response = await client.get("/health", headers=TEST_AUTH_HEADER) - # Some test setups may still return 200 with error info in body, so check both - if response.status_code == 500: - assert True - else: - # Accept 200 only if error is present in response - data = response.json() - assert "error" in data or data.get("status") != "healthy" + # Health check should return 200 with status "unhealthy" + assert response.status_code == 200 + data = response.json() + assert data.get("status") == "unhealthy" or "error" in data finally: - if original_override: - app.dependency_overrides[get_db] = original_override - else: - app.dependency_overrides.pop(get_db, None) + # Restore original function + main_module.check_db_health = original_check_db_health async def test_validation_error(self, client: AsyncClient, mock_auth): """Test validation error for endpoint expecting required fields."""