Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,42 @@ DB_MAX_RETRIES=5
# Retry interval in milliseconds (default: 2000)
DB_RETRY_INTERVAL_MS=2000

# Database Session Configuration
# Autocommit mode for database sessions (default: false)
#
# RECOMMENDED: Keep DB_AUTOCOMMIT=false for ALL databases (SQLite, PostgreSQL, MySQL/MariaDB)
#
# Why autocommit=false (default):
# - Ensures multi-step operations are atomic (all succeed or all fail)
# - Prevents data corruption when errors occur mid-transaction
# - Required for ACID compliance in complex operations
#
# Database-specific behavior with autocommit=false:
# - PostgreSQL: Read-only queries log harmless "ROLLBACK" entries (cosmetic only, no performance cost)
# - MySQL/MariaDB: Similar behavior - implicit transactions for each operation
# - SQLite: No log noise, works perfectly with WAL mode for concurrency
#
# Why you should NOT set autocommit=true:
# - BREAKS transaction atomicity (e.g., team creation can leave orphaned records)
# - 127 explicit db.commit() calls in codebase rely on transaction control
# - Data corruption risk if multi-step operations fail partway through
#
# Example of what breaks with autocommit=true:
# Step 1: Create team (commits immediately)
# Step 2: Add owner (if this fails, team exists without owner - corruption!)
#
# NOTE: Health check endpoints (/health, /ready) use a dedicated connection with
# autocommit=True to eliminate PostgreSQL log noise, while all other operations
# maintain autocommit=False for data integrity. Best of both worlds!
#
# See issue #1108 for details on PostgreSQL rollback behavior.
DB_AUTOCOMMIT=false

# Auto-flush changes to database before queries (default: false)
# - false: Manual flush control (default)
# - true: Automatically flush pending changes before queries
DB_AUTOFLUSH=false

# Cache Backend Configuration
# Options: database (default), memory (in-process), redis (distributed)
# - database: Uses SQLite/PostgreSQL for persistence (good for single-node)
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1544,6 +1544,10 @@ mcpgateway
| `DB_POOL_RECYCLE`. | Recycle connections (secs) | `3600` | int > 0 |
| `DB_MAX_RETRIES` . | Max Retry Attempts | `3` | int > 0 |
| `DB_RETRY_INTERVAL_MS` | Retry Interval (ms) | `2000` | int > 0 |
| `DB_AUTOCOMMIT` | Auto-commit mode (⚠️ breaks atomicity) | `false` | `true`, `false` |
| `DB_AUTOFLUSH` | Auto-flush changes before queries | `false` | `true`, `false` |

> ⚠️ **Important**: Keep `DB_AUTOCOMMIT=false` (default) for data integrity. Setting to `true` breaks transaction atomicity and risks data corruption in multi-step operations. Health check endpoints (`/health`, `/ready`) use a dedicated autocommit connection to eliminate PostgreSQL log noise while maintaining `autocommit=false` for all other operations. See [issue #1108](https://github.com/IBM/mcp-context-forge/issues/1108) for details.

### Cache Backend

Expand Down
2 changes: 2 additions & 0 deletions mcpgateway/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,8 @@ def parse_issuers(cls, v):
db_pool_recycle: int = 3600
db_max_retries: int = 3
db_retry_interval_ms: int = 2000
db_autocommit: bool = False # RECOMMENDED: False for atomicity. True breaks multi-step transactions.
db_autoflush: bool = False # Auto-flush changes to database before queries

# Cache
cache_type: Literal["redis", "memory", "none", "database"] = "database" # memory or redis or database
Expand Down
40 changes: 38 additions & 2 deletions mcpgateway/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

# Third-Party
import jsonschema
from sqlalchemy import BigInteger, Boolean, Column, create_engine, DateTime, event, Float, ForeignKey, func, Index, Integer, JSON, make_url, select, String, Table, Text, UniqueConstraint
from sqlalchemy import BigInteger, Boolean, Column, create_engine, DateTime, event, Float, ForeignKey, func, Index, Integer, JSON, make_url, select, String, Table, text, Text, UniqueConstraint
from sqlalchemy.event import listen
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.hybrid import hybrid_property
Expand Down Expand Up @@ -172,7 +172,15 @@ def set_sqlite_pragma(dbapi_conn, _connection_record):


# Session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Use configurable autocommit and autoflush settings
#
# IMPORTANT: Keep DB_AUTOCOMMIT=false (default) for data integrity
# - Ensures multi-step operations are atomic (all succeed or all fail)
# - Read-only operations (like health checks) will log "ROLLBACK" in PostgreSQL - this is harmless
# - Setting to true eliminates rollback noise but BREAKS atomicity and risks data corruption
#
# See issue #1108 for details on PostgreSQL rollback behavior
SessionLocal = sessionmaker(autocommit=settings.db_autocommit, autoflush=settings.db_autoflush, bind=engine)


def refresh_slugs_on_startup():
Expand Down Expand Up @@ -3263,6 +3271,34 @@ def get_db() -> Generator[Session, Any, None]:
db.close()


def check_db_health() -> bool:
"""
Perform a lightweight database health check without generating transaction logs.

This function uses a raw connection with AUTOCOMMIT isolation level to avoid
generating unnecessary BEGIN/ROLLBACK transaction logs in PostgreSQL (issue #1108).

Unlike get_db() which uses autocommit=False for data integrity, this health check
uses autocommit=True to eliminate log noise while still verifying connectivity.

Returns:
bool: True if database is healthy, False otherwise.

Examples:
>>> check_db_health() # doctest: +SKIP
True
"""
try:
# Use raw connection with AUTOCOMMIT to avoid transaction log noise
# This is safe for read-only health checks and eliminates PostgreSQL ROLLBACK logs
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as connection:
connection.execute(text("SELECT 1"))
return True
except Exception as e:
logger.error(f"Database health check failed: {e}")
return False


# Create all tables
def init_db():
"""
Expand Down
41 changes: 18 additions & 23 deletions mcpgateway/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import ValidationError
from sqlalchemy import text
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from starlette.middleware.base import BaseHTTPMiddleware
Expand All @@ -61,7 +60,7 @@
from mcpgateway.bootstrap_db import main as bootstrap_db
from mcpgateway.cache import ResourceCache, SessionRegistry
from mcpgateway.config import jsonpath_modifier, settings
from mcpgateway.db import refresh_slugs_on_startup, SessionLocal
from mcpgateway.db import check_db_health, refresh_slugs_on_startup, SessionLocal
from mcpgateway.db import Tool as DbTool
from mcpgateway.handlers.sampling import SamplingHandler
from mcpgateway.middleware.rbac import get_current_user_with_permissions, require_permission
Expand Down Expand Up @@ -3775,45 +3774,41 @@ async def reset_metrics(entity: Optional[str] = None, entity_id: Optional[int] =
# Healthcheck #
####################
@app.get("/health")
async def healthcheck(db: Session = Depends(get_db)):
async def healthcheck():
"""
Perform a basic health check to verify database connectivity.

Args:
db: SQLAlchemy session dependency.
Uses a lightweight connection with AUTOCOMMIT isolation level to avoid
generating unnecessary transaction logs in PostgreSQL (issue #1108).

Returns:
A dictionary with the health status and optional error message.
"""
try:
# Execute the query using text() for an explicit textual SQL expression.
db.execute(text("SELECT 1"))
except Exception as e:
error_message = f"Database connection error: {str(e)}"
logger.error(error_message)
return {"status": "unhealthy", "error": error_message}
return {"status": "healthy"}
# Use dedicated health check function that doesn't generate transaction log noise
is_healthy = await asyncio.to_thread(check_db_health)

if is_healthy:
return {"status": "healthy"}
return {"status": "unhealthy", "error": "Database connection error"}


@app.get("/ready")
async def readiness_check(db: Session = Depends(get_db)):
async def readiness_check():
"""
Perform a readiness check to verify if the application is ready to receive traffic.

Args:
db: SQLAlchemy session dependency.
Uses a lightweight connection with AUTOCOMMIT isolation level to avoid
generating unnecessary transaction logs in PostgreSQL (issue #1108).

Returns:
JSONResponse with status 200 if ready, 503 if not.
"""
try:
# Run the blocking DB check in a thread to avoid blocking the event loop
await asyncio.to_thread(db.execute, text("SELECT 1"))
# Use dedicated health check function that doesn't generate transaction log noise
is_ready = await asyncio.to_thread(check_db_health)

if is_ready:
return JSONResponse(content={"status": "ready"}, status_code=200)
except Exception as e:
error_message = f"Readiness check failed: {str(e)}"
logger.error(error_message)
return JSONResponse(content={"status": "not ready", "error": error_message}, status_code=503)
return JSONResponse(content={"status": "not ready", "error": "Database connection error"}, status_code=503)


@app.get("/health/security", tags=["health"])
Expand Down
2 changes: 1 addition & 1 deletion mcpgateway/middleware/token_scoping.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def _check_team_membership(self, payload: dict) -> bool:
finally:
db.close()

def _check_resource_team_ownership(self, request_path: str, token_teams: list) -> bool:
def _check_resource_team_ownership(self, request_path: str, token_teams: list) -> bool: # pylint: disable=too-many-return-statements
"""
Check if the requested resource is accessible by the token.

Expand Down
33 changes: 13 additions & 20 deletions tests/e2e/test_main_apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,33 +1921,26 @@ async def test_empty_request_body(self, client: AsyncClient, mock_auth):
assert any("Field required" in str(error) for error in errors)

async def test_internal_server_error(self, client: AsyncClient, mock_auth):
"""Simulate internal server error by patching a dependency."""
"""Simulate internal server error by patching the health check function."""
# First-Party
from mcpgateway.main import app, get_db
from mcpgateway import main as main_module

def failing_db():
def _gen():
raise Exception("Simulated DB failure")
yield
# Mock check_db_health to simulate database failure
original_check_db_health = main_module.check_db_health

return _gen()
def failing_health_check():
return False

original_override = app.dependency_overrides.get(get_db)
app.dependency_overrides[get_db] = failing_db
main_module.check_db_health = failing_health_check
try:
response = await client.get("/health", headers=TEST_AUTH_HEADER)
# Some test setups may still return 200 with error info in body, so check both
if response.status_code == 500:
assert True
else:
# Accept 200 only if error is present in response
data = response.json()
assert "error" in data or data.get("status") != "healthy"
# Health check should return 200 with status "unhealthy"
assert response.status_code == 200
data = response.json()
assert data.get("status") == "unhealthy" or "error" in data
finally:
if original_override:
app.dependency_overrides[get_db] = original_override
else:
app.dependency_overrides.pop(get_db, None)
# Restore original function
main_module.check_db_health = original_check_db_health

async def test_validation_error(self, client: AsyncClient, mock_auth):
"""Test validation error for endpoint expecting required fields."""
Expand Down
Loading