Tree Cleanup tool: bulk fixes with preview → approve

A new per-tree Cleanup page (and cleanup_service + endpoints), each fix
preview-first per the propose-then-approve rule:

- Mark deceased by birth year: lists people born ≤ a cutoff (default 1930) not
  already deceased; apply sets is_living=false for the ones you keep checked.
- Set sex from a source GEDCOM: upload the source .ged (it carries SEX); matches
  by name and proposes sex only where it's missing — far more accurate than
  guessing from first names. Review, then apply.
- Names that look broken: flags date-in-surname / date-in-given / no-surname /
  packed given names, with inline editable given+surname; fix the checked ones.

No migration (uses existing columns). 55 backend tests pass (preview+apply for
all three); frontend builds.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 10:17:01 -04:00
parent 97f7a9e0ff
commit aa62ca490e
9 changed files with 1737 additions and 0 deletions
+2
View File
@@ -5,6 +5,7 @@ from fastapi import APIRouter
from app.api.v1 import (
auth,
citations,
cleanup,
events,
gedcom,
media,
@@ -28,3 +29,4 @@ api_router.include_router(sources.router)
api_router.include_router(citations.router)
api_router.include_router(media.router)
api_router.include_router(gedcom.router)
api_router.include_router(cleanup.router)
+91
View File
@@ -0,0 +1,91 @@
import uuid
from fastapi import APIRouter, File, UploadFile
from app.api.deps import CurrentUser, SessionDep
from app.schemas.cleanup import (
CleanupResult,
DeceasedApply,
DeceasedCandidate,
GenderApply,
GenderProposal,
NameApply,
NameIssue,
)
from app.services import cleanup_service, tree_service
router = APIRouter(prefix="/trees", tags=["cleanup"])
@router.get("/{tree_id}/cleanup/deceased", response_model=list[DeceasedCandidate])
async def preview_deceased(
tree_id: uuid.UUID,
session: SessionDep,
current: CurrentUser,
born_on_or_before: int = 1930,
) -> list[DeceasedCandidate]:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
rows = await cleanup_service.preview_deceased(
session, actor=current, tree=tree, year=born_on_or_before
)
return [DeceasedCandidate(**r) for r in rows]
@router.post("/{tree_id}/cleanup/deceased", response_model=CleanupResult)
async def apply_deceased(
tree_id: uuid.UUID, data: DeceasedApply, session: SessionDep, current: CurrentUser
) -> CleanupResult:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
n = await cleanup_service.apply_deceased(
session, actor=current, tree=tree, person_ids=data.person_ids
)
return CleanupResult(updated=n)
@router.post("/{tree_id}/cleanup/gender/preview", response_model=list[GenderProposal])
async def preview_gender(
tree_id: uuid.UUID,
session: SessionDep,
current: CurrentUser,
file: UploadFile = File(...),
) -> list[GenderProposal]:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
text = (await file.read()).decode("utf-8", errors="replace")
rows = await cleanup_service.preview_gender(
session, actor=current, tree=tree, gedcom_text=text
)
return [GenderProposal(**r) for r in rows]
@router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
async def apply_gender(
tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
) -> CleanupResult:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
n = await cleanup_service.apply_gender(
session,
actor=current,
tree=tree,
updates=[u.model_dump() for u in data.updates],
)
return CleanupResult(updated=n)
@router.get("/{tree_id}/cleanup/names", response_model=list[NameIssue])
async def preview_names(
tree_id: uuid.UUID, session: SessionDep, current: CurrentUser
) -> list[NameIssue]:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
rows = await cleanup_service.preview_names(session, actor=current, tree=tree)
return [NameIssue(**r) for r in rows]
@router.post("/{tree_id}/cleanup/names", response_model=CleanupResult)
async def apply_names(
tree_id: uuid.UUID, data: NameApply, session: SessionDep, current: CurrentUser
) -> CleanupResult:
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
n = await cleanup_service.apply_names(
session, actor=current, tree=tree, edits=[e.model_dump() for e in data.edits]
)
return CleanupResult(updated=n)
+50
View File
@@ -0,0 +1,50 @@
import uuid
from pydantic import BaseModel
class DeceasedCandidate(BaseModel):
person_id: uuid.UUID
name: str
birth_year: int
class DeceasedApply(BaseModel):
person_ids: list[uuid.UUID]
class GenderProposal(BaseModel):
person_id: uuid.UUID
name: str
proposed_gender: str
class GenderUpdate(BaseModel):
person_id: uuid.UUID
gender: str
class GenderApply(BaseModel):
updates: list[GenderUpdate]
class NameIssue(BaseModel):
name_id: uuid.UUID
person_id: uuid.UUID
given: str | None = None
surname: str | None = None
issue: str
class NameEdit(BaseModel):
name_id: uuid.UUID
given: str | None = None
surname: str | None = None
class NameApply(BaseModel):
edits: list[NameEdit]
class CleanupResult(BaseModel):
updated: int
+267
View File
@@ -0,0 +1,267 @@
"""Bulk tree cleanup — preview/apply pairs for common import messes.
Per the project's #1 rule (the assistant proposes, humans approve), each fix has
a *preview* that returns the proposed changes and an *apply* that commits only
the ids/edits the user confirmed. Nothing here mutates without an explicit apply
call carrying the user's selections.
"""
import re
import uuid
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.event import Event
from app.models.person import Name, Person
from app.models.tree import Tree
from app.models.user import User
from app.services import gedcom, privacy
from app.services.audit import record_audit
from app.services.exceptions import Forbidden, NotFound
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree):
raise Forbidden("not an editor of this tree")
async def _persons(session: AsyncSession, tree_id: uuid.UUID) -> list[Person]:
return list(
(
await session.execute(
select(Person).where(Person.tree_id == tree_id, Person.deleted_at.is_(None))
)
).scalars().all()
)
async def _primary_name_by_person(
session: AsyncSession, tree_id: uuid.UUID
) -> dict[uuid.UUID, Name]:
names = (
await session.execute(
select(Name)
.where(Name.tree_id == tree_id, Name.deleted_at.is_(None))
.order_by(Name.is_primary.desc(), Name.sort_order)
)
).scalars().all()
out: dict[uuid.UUID, Name] = {}
for n in names:
out.setdefault(n.person_id, n)
return out
async def _birth_year_by_person(session: AsyncSession, tree_id: uuid.UUID) -> dict[uuid.UUID, int]:
evs = (
await session.execute(
select(Event).where(
Event.tree_id == tree_id,
Event.deleted_at.is_(None),
Event.event_type == "birth",
)
)
).scalars().all()
out: dict[uuid.UUID, int] = {}
for e in evs:
if not e.person_id or e.person_id in out:
continue
y = e.date_start.year if e.date_start else None
if y is None:
ys = gedcom._year(e.date_value)
y = int(ys) if ys else None
if y is not None:
out[e.person_id] = y
return out
def _display(n: Name | None) -> str:
if n is None:
return "Unnamed"
return " ".join(x for x in (n.given, n.surname) if x) or (n.display_name or "Unnamed")
# ---- 1. Mark deceased by birth year -------------------------------------------------
async def preview_deceased(
session: AsyncSession, *, actor: User, tree: Tree, year: int
) -> list[dict]:
await _require_editor(session, actor=actor, tree=tree)
names = await _primary_name_by_person(session, tree.id)
years = await _birth_year_by_person(session, tree.id)
out: list[dict] = []
for p in await _persons(session, tree.id):
if p.is_living is False: # already deceased
continue
by = years.get(p.id)
if by is not None and by <= year:
out.append(
{"person_id": str(p.id), "name": _display(names.get(p.id)), "birth_year": by}
)
out.sort(key=lambda r: r["birth_year"])
return out
async def apply_deceased(
session: AsyncSession, *, actor: User, tree: Tree, person_ids: list[uuid.UUID]
) -> int:
await _require_editor(session, actor=actor, tree=tree)
persons = (
await session.execute(
select(Person).where(
Person.tree_id == tree.id,
Person.deleted_at.is_(None),
Person.id.in_(person_ids),
)
)
).scalars().all()
for p in persons:
p.is_living = False
record_audit(
session,
action="cleanup_deceased",
entity_type="Tree",
entity_id=tree.id,
tree_id=tree.id,
actor_user_id=actor.id,
after={"count": len(persons)},
)
await session.commit()
return len(persons)
# ---- 2. Re-derive gender from a source GEDCOM (matches by name) ----------------------
async def preview_gender(
session: AsyncSession, *, actor: User, tree: Tree, gedcom_text: str
) -> list[dict]:
await _require_editor(session, actor=actor, tree=tree)
name2sex: dict[str, str] = {}
for rec in gedcom.parse_records(gedcom_text):
if rec.tag != "INDI":
continue
summ = gedcom._person_summary(rec)
sex = gedcom._sex(rec.text("SEX"))
if sex and summ["norm"]:
name2sex.setdefault(summ["norm"], sex)
names = await _primary_name_by_person(session, tree.id)
out: list[dict] = []
for p in await _persons(session, tree.id):
if p.gender: # only fill in what's missing
continue
nm = names.get(p.id)
if nm is None:
continue
proposed = name2sex.get(gedcom._norm(nm.given, nm.surname))
if proposed:
out.append({"person_id": str(p.id), "name": _display(nm), "proposed_gender": proposed})
out.sort(key=lambda r: r["name"])
return out
async def apply_gender(
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
) -> int:
"""updates: [{person_id, gender}]."""
await _require_editor(session, actor=actor, tree=tree)
wanted = {uuid.UUID(str(u["person_id"])): u["gender"] for u in updates if u.get("gender")}
persons = (
await session.execute(
select(Person).where(
Person.tree_id == tree.id,
Person.deleted_at.is_(None),
Person.id.in_(wanted.keys()),
)
)
).scalars().all()
for p in persons:
p.gender = wanted[p.id]
record_audit(
session,
action="cleanup_gender",
entity_type="Tree",
entity_id=tree.id,
tree_id=tree.id,
actor_user_id=actor.id,
after={"count": len(persons)},
)
await session.commit()
return len(persons)
# ---- 3. Flag malformed names for review --------------------------------------------
_YEAR_RE = re.compile(r"\b\d{3,4}\b")
def _name_issue(n: Name) -> str | None:
given = (n.given or "").strip()
surname = (n.surname or "").strip()
if _YEAR_RE.search(surname) or re.search(r"\d", surname):
return "date_in_surname"
if re.search(r"\d", given):
return "date_in_given"
# A given name with many tokens often means a maiden+married name was packed
# in (e.g. "Mary Smith Jones") — surface it for a human to split.
if surname == "" and len(given.split()) >= 2:
return "no_surname"
if len(given.split()) >= 3:
return "packed_given"
return None
async def preview_names(session: AsyncSession, *, actor: User, tree: Tree) -> list[dict]:
await _require_editor(session, actor=actor, tree=tree)
names = (
await session.execute(
select(Name).where(Name.tree_id == tree.id, Name.deleted_at.is_(None))
)
).scalars().all()
out: list[dict] = []
for n in names:
issue = _name_issue(n)
if issue:
out.append({
"name_id": str(n.id),
"person_id": str(n.person_id),
"given": n.given,
"surname": n.surname,
"issue": issue,
})
return out
async def apply_names(
session: AsyncSession, *, actor: User, tree: Tree, edits: list[dict]
) -> int:
"""edits: [{name_id, given, surname}] — the user's corrected values."""
await _require_editor(session, actor=actor, tree=tree)
by_id = {uuid.UUID(str(e["name_id"])): e for e in edits}
rows = (
await session.execute(
select(Name).where(
Name.tree_id == tree.id,
Name.deleted_at.is_(None),
Name.id.in_(by_id.keys()),
)
)
).scalars().all()
if len(rows) != len(by_id):
raise NotFound("one or more names not found in this tree")
for n in rows:
e = by_id[n.id]
n.given = (e.get("given") or "").strip() or None
n.surname = (e.get("surname") or "").strip() or None
n.display_name = None # rebuild from parts
record_audit(
session,
action="cleanup_names",
entity_type="Tree",
entity_id=tree.id,
tree_id=tree.id,
actor_user_id=actor.id,
after={"count": len(rows)},
)
await session.commit()
return len(rows)
+107
View File
@@ -0,0 +1,107 @@
"""Tree cleanup: preview/apply for deceased-by-year, gender-from-source, names."""
from tests.conftest import auth, register
async def _tree(client, email):
h = auth(await register(client, email))
tid = (await client.post("/api/v1/trees", json={"name": "T"}, headers=h)).json()["id"]
return h, tid
async def _person(client, h, tid, given, surname=None):
return (
await client.post(
f"/api/v1/trees/{tid}/persons", json={"given": given, "surname": surname}, headers=h
)
).json()["id"]
async def _birth(client, h, tid, pid, year):
await client.post(
f"/api/v1/trees/{tid}/events",
json={"event_type": "birth", "person_id": pid, "date_value": str(year)},
headers=h,
)
async def test_deceased_preview_and_apply(client):
h, tid = await _tree(client, "cl-dec@example.com")
old = await _person(client, h, tid, "Josias", "Moody")
young = await _person(client, h, tid, "Kid", "Moody")
await _birth(client, h, tid, old, 1900)
await _birth(client, h, tid, young, 1990)
prev = (
await client.get(f"/api/v1/trees/{tid}/cleanup/deceased?born_on_or_before=1930", headers=h)
).json()
assert [r["person_id"] for r in prev] == [old]
r = await client.post(
f"/api/v1/trees/{tid}/cleanup/deceased", json={"person_ids": [old]}, headers=h
)
assert r.status_code == 200 and r.json()["updated"] == 1
assert (
await client.get(f"/api/v1/trees/{tid}/persons/{old}", headers=h)
).json()["is_living"] is False
# Re-preview no longer lists the now-deceased person.
prev2 = (
await client.get(f"/api/v1/trees/{tid}/cleanup/deceased?born_on_or_before=1930", headers=h)
).json()
assert old not in [r["person_id"] for r in prev2]
GED = b"""0 HEAD
0 @I1@ INDI
1 NAME Josias /Moody/
1 SEX M
0 @I2@ INDI
1 NAME Flora /Paul/
1 SEX F
0 TRLR
"""
async def test_gender_from_source(client):
h, tid = await _tree(client, "cl-gen@example.com")
await _person(client, h, tid, "Josias", "Moody")
await _person(client, h, tid, "Flora", "Paul")
await _person(client, h, tid, "Nobody", "Else") # not in source
prev = await client.post(
f"/api/v1/trees/{tid}/cleanup/gender/preview",
files={"file": ("src.ged", GED, "text/plain")},
headers=h,
)
props = prev.json()
by_name = {p["name"]: p["proposed_gender"] for p in props}
assert by_name == {"Josias Moody": "male", "Flora Paul": "female"}
updates = [{"person_id": p["person_id"], "gender": p["proposed_gender"]} for p in props]
r = await client.post(
f"/api/v1/trees/{tid}/cleanup/gender", json={"updates": updates}, headers=h
)
assert r.status_code == 200 and r.json()["updated"] == 2
people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()
genders = {p["primary_name"]: p["gender"] for p in people}
assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
async def test_name_issues_preview_and_fix(client):
h, tid = await _tree(client, "cl-name@example.com")
# surname got a date; real surname landed in the given name.
bad = await _person(client, h, tid, "Henry Paul", "1859")
await _person(client, h, tid, "Normal", "Person") # should not be flagged
issues = (await client.get(f"/api/v1/trees/{tid}/cleanup/names", headers=h)).json()
assert len(issues) == 1 and issues[0]["issue"] == "date_in_surname"
name_id = issues[0]["name_id"]
r = await client.post(
f"/api/v1/trees/{tid}/cleanup/names",
json={"edits": [{"name_id": name_id, "given": "Henry", "surname": "Paul"}]},
headers=h,
)
assert r.status_code == 200 and r.json()["updated"] == 1
person = (await client.get(f"/api/v1/trees/{tid}/persons/{bad}", headers=h)).json()
assert person["primary_name"] == "Henry Paul"