94b5caa7e5
Defense-in-depth for the deploy pipeline. Today a backend image shipped ahead of an un-applied migration; the Tree model selected columns the DB didn't have yet, so every trees query 500'd with an opaque UndefinedColumnError and the UI showed no trees. The root cause (deploys not running migrations) is fixed separately; this makes the *symptom* impossible to miss. - app/core/schema_version.py: compare the DB's stamped alembic head to the head(s) baked into the image's migration scripts. A DB with no alembic_version table (e.g. a create_all test DB) is treated as current, so this stays quiet outside real deployments. Uses to_regclass so a missing table never poisons the caller's transaction. - /health/ready: returns 503 with an explicit "drift: db=… expected=…" message when the schema is behind, instead of reporting ready and serving 500s. - Startup lifespan: logs CRITICAL on drift (advisory — never blocks startup). Liveness (/health) is untouched, so a drifted container isn't killed into a crash-loop — it's loudly degraded and self-heals once migrations apply. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Signed-off-by: Justin Paul <justin@jpaul.me>
54 lines
1.9 KiB
Python
54 lines
1.9 KiB
Python
"""Liveness and readiness endpoints.
|
|
|
|
- ``/health`` — liveness: the process is up. No dependencies touched.
|
|
- ``/health/ready`` — readiness: Postgres is reachable and its schema is not behind the migrations this build expects.
|
|
|
|
Orchestrators and Caddy probe these; they are intentionally outside the
|
|
versioned ``/api`` surface.
|
|
"""
|
|
|
|
from fastapi import APIRouter, Response, status
|
|
from sqlalchemy import text
|
|
|
|
from app.core.config import get_settings
|
|
from app.core.db import get_engine
|
|
from app.core.schema_version import schema_is_current
|
|
|
|
router = APIRouter(tags=["health"])
|
|
|
|
|
|
@router.get("/health")
async def health() -> dict:
    """Liveness probe: report that the process is up.

    Only the application settings are read here; the response echoes the
    service name, version and environment alongside a fixed ``"ok"`` status.
    """
    cfg = get_settings()
    payload = dict(
        status="ok",
        service=cfg.app_name,
        version=cfg.version,
        env=cfg.app_env,
    )
    return payload
|
|
|
|
|
|
@router.get("/health/ready")
async def ready(response: Response) -> dict:
    """Readiness probe: check database connectivity and schema currency.

    Opens a connection from the shared engine, runs ``SELECT 1``, then asks
    ``schema_is_current`` whether the database schema matches what this
    build expects.

    Args:
        response: The outgoing response; its status code is set to 503
            whenever the service is not ready.

    Returns:
        A dict with ``"status"`` (``"ready"`` or ``"not ready"``) and
        ``"checks"`` (a per-check result map). When an exception was
        caught, ``"detail"`` carries its string form.
    """
    checks: dict[str, str] = {}
    try:
        async with get_engine().connect() as conn:
            await conn.execute(text("SELECT 1"))
            checks["database"] = "ok"

            # Schema drift = code ahead of the DB; queries would 500. Fail
            # readiness loudly rather than serve a broken surface.
            ok, db, expected = await schema_is_current(conn)
            if not ok:
                checks["schema"] = (
                    f"drift: db={sorted(db) or ['none']} expected={sorted(expected)} "
                    "— run 'alembic upgrade head'"
                )
                response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
                return {"status": "not ready", "checks": checks}

            checks["schema"] = "ok"
            return {"status": "ready", "checks": checks}
    except Exception as exc:  # noqa: BLE001 — surface any failure as "not ready"
        if "database" in checks:
            # The connectivity probe already passed, so the failure came from
            # the schema check (or from closing the connection). Mark it so
            # the report does not read "all listed checks ok, yet not ready".
            checks.setdefault("schema", "error")
        else:
            checks["database"] = "error"
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        # NOTE(review): "detail" echoes the raw exception text to whoever can
        # reach this probe — confirm the endpoint is not publicly exposed.
        return {"status": "not ready", "checks": checks, "detail": str(exc)}
|