Guard against schema drift: fail readiness + log loudly when DB is behind code

Defense-in-depth for the deploy pipeline. Today a backend image shipped ahead
of an un-applied migration; the Tree model selected columns the DB didn't have
yet, so every trees query 500'd with an opaque UndefinedColumnError and the UI
showed no trees. The root cause (deploys not running migrations) is fixed
separately; this makes the *symptom* impossible to miss.

- app/core/schema_version.py: compare the DB's stamped alembic head to the
  head(s) baked into the image's migration scripts. A DB with no alembic_version
  table (e.g. a create_all test DB) is treated as current, so this stays quiet
  outside real deployments. Uses to_regclass so a missing table never poisons
  the caller's transaction.
- /health/ready: returns 503 with an explicit "drift: db=… expected=…" message
  when the schema is behind, instead of reporting ready and serving 500s.
- Startup lifespan: logs CRITICAL on drift (advisory — never blocks startup).

Liveness (/health) is untouched, so a drifted container isn't killed into a
crash-loop — it's loudly degraded and self-heals once migrations apply.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Justin Paul <justin@jpaul.me>
This commit is contained in:
2026-06-09 21:55:21 -04:00
parent f8fa23c1f6
commit 94b5caa7e5
4 changed files with 145 additions and 2 deletions
+14 -2
View File
@@ -12,6 +12,7 @@ from sqlalchemy import text
from app.core.config import get_settings
from app.core.db import get_engine
from app.core.schema_version import schema_is_current
router = APIRouter(tags=["health"])
@@ -33,9 +34,20 @@ async def ready(response: Response) -> dict:
try:
async with get_engine().connect() as conn:
await conn.execute(text("SELECT 1"))
checks["database"] = "ok"
checks["database"] = "ok"
# Schema drift = code ahead of the DB; queries would 500. Fail
# readiness loudly rather than serve a broken surface.
ok, db, expected = await schema_is_current(conn)
if not ok:
checks["schema"] = (
f"drift: db={sorted(db) or ['none']} expected={sorted(expected)} "
"— run 'alembic upgrade head'"
)
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
return {"status": "not ready", "checks": checks}
checks["schema"] = "ok"
return {"status": "ready", "checks": checks}
except Exception as exc: # noqa: BLE001 — surface any failure as "not ready"
checks["database"] = "error"
checks.setdefault("database", "error")
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
return {"status": "not ready", "checks": checks, "detail": str(exc)}
+59
View File
@@ -0,0 +1,59 @@
"""Schema-drift detection — a safety net for the deploy pipeline.
If a deploy ships code whose models reference a column a migration hasn't added
yet (the code is ahead of the DB), every query against that table 500s with an
opaque ``UndefinedColumnError``. That is exactly the failure that took the tree
list down once: the backend image advanced but ``alembic upgrade head`` hadn't
run on the server.
The real prevention is auto-migrate on deploy (the entrypoint runs
``alembic upgrade head`` when ``RUN_MIGRATIONS=1``). This module is defense in
depth: it makes the drift *loud and explicit* — a readiness failure and a
CRITICAL startup log — instead of a silent storm of 500s, so a half-applied
deploy is obvious within seconds.
"""
from functools import lru_cache
from pathlib import Path
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncConnection
# app/core/schema_version.py -> backend/ (parents: core, app, backend)
_MIGRATIONS_DIR = Path(__file__).resolve().parents[2] / "migrations"
@lru_cache
def expected_heads() -> frozenset[str]:
"""Revision head(s) baked into this image's migration scripts. Static for a
given build, so cache it."""
from alembic.config import Config
from alembic.script import ScriptDirectory
cfg = Config()
cfg.set_main_option("script_location", str(_MIGRATIONS_DIR))
return frozenset(ScriptDirectory.from_config(cfg).get_heads())
async def db_heads(conn: AsyncConnection) -> frozenset[str] | None:
"""Revision(s) the database is stamped at, or ``None`` when the DB is not
Alembic-managed (no ``alembic_version`` table — e.g. a test DB built straight
from ``create_all``). ``to_regclass`` returns NULL rather than erroring when
the table is absent, so this never poisons the caller's transaction."""
if await conn.scalar(text("SELECT to_regclass('public.alembic_version')")) is None:
return None
result = await conn.execute(text("SELECT version_num FROM alembic_version"))
return frozenset(row[0] for row in result)
async def schema_is_current(
conn: AsyncConnection,
) -> tuple[bool, frozenset[str], frozenset[str]]:
"""``(ok, db, expected)``. ``ok`` is True when the DB is stamped at the
code's head(s). A DB with no ``alembic_version`` table is treated as current
(not Alembic-managed → nothing to compare), so this stays quiet in tests."""
expected = expected_heads()
current = await db_heads(conn)
if current is None:
return True, frozenset(), expected
return current == expected, current, expected
+30
View File
@@ -7,6 +7,7 @@ engine is the single enforcement point for reads.
import logging
import sys
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
@@ -14,6 +15,8 @@ from fastapi.responses import JSONResponse
from app.api.health import router as health_router
from app.api.v1 import api_router
from app.core.config import get_settings
from app.core.db import get_engine
from app.core.schema_version import schema_is_current
from app.services.exceptions import Conflict, Forbidden, NotFound
@@ -30,6 +33,32 @@ def _configure_logging() -> None:
app_logger.propagate = False
async def _check_schema_drift() -> None:
"""On startup, shout if the DB schema is behind the code. The entrypoint
runs migrations when RUN_MIGRATIONS=1; this catches the case where that
didn't happen, so a half-applied deploy is obvious in the logs instead of a
silent storm of 500s. Never blocks startup — purely advisory."""
logger = logging.getLogger("provenance")
try:
async with get_engine().connect() as conn:
ok, db, expected = await schema_is_current(conn)
if not ok:
logger.critical(
"SCHEMA DRIFT: database is at %s but this build expects %s. "
"Run 'alembic upgrade head' — queries will fail until migrated.",
sorted(db) or ["none"],
sorted(expected),
)
except Exception as exc: # noqa: BLE001 — advisory only; never block startup
logger.warning("schema drift check skipped: %s", exc)
@asynccontextmanager
async def _lifespan(app: FastAPI):
await _check_schema_drift()
yield
def _register_error_handlers(app: FastAPI) -> None:
@app.exception_handler(NotFound)
async def _not_found(request: Request, exc: NotFound) -> JSONResponse:
@@ -51,6 +80,7 @@ def create_app() -> FastAPI:
title=settings.app_name,
version=settings.version,
description="Provenance API — family and land provenance.",
lifespan=_lifespan,
)
app.include_router(health_router)
app.include_router(api_router)
+42
View File
@@ -0,0 +1,42 @@
"""Schema-drift guard: the DB-vs-code head check behind /health/ready and the
startup log. Regression cover for the outage where the backend image shipped
ahead of an un-applied migration and every trees query 500'd."""
from sqlalchemy import text
from app.core.schema_version import db_heads, expected_heads, schema_is_current
def test_expected_heads_is_a_single_known_head():
heads = expected_heads()
# Linear migration history → exactly one head, and it's a real revision id.
assert len(heads) == 1
assert all(h and isinstance(h, str) for h in heads)
async def test_schema_is_current_detects_drift(db_session):
conn = await db_session.connection()
# The test DB is built from create_all (no alembic_version table), so it is
# not Alembic-managed and the check stays quiet — treated as current.
await conn.execute(text("DROP TABLE IF EXISTS alembic_version"))
assert await db_heads(conn) is None
ok, _, _ = await schema_is_current(conn)
assert ok is True
# Stamp an old/wrong revision → drift detected.
await conn.execute(text("CREATE TABLE alembic_version (version_num varchar(32) NOT NULL)"))
await conn.execute(text("INSERT INTO alembic_version (version_num) VALUES ('0000deadbeef')"))
ok, db, expected = await schema_is_current(conn)
assert ok is False
assert db == frozenset({"0000deadbeef"})
# Stamp the code's real head → current again.
head = next(iter(expected))
await conn.execute(text("DELETE FROM alembic_version"))
await conn.execute(text("INSERT INTO alembic_version (version_num) VALUES (:h)"), {"h": head})
ok, _, _ = await schema_is_current(conn)
assert ok is True
# Leave no alembic_version behind for other tests.
await conn.execute(text("DROP TABLE IF EXISTS alembic_version"))