Tree Cleanup tool: bulk fixes with preview → approve
A new per-tree Cleanup page (and cleanup_service + endpoints), each fix preview-first per the propose-then-approve rule: - Mark deceased by birth year: lists people born ≤ a cutoff (default 1930) not already deceased; apply sets is_living=false for the ones you keep checked. - Set sex from a source GEDCOM: upload the source .ged (it carries SEX); matches by name and proposes sex only where it's missing — far more accurate than guessing from first names. Review, then apply. - Names that look broken: flags date-in-surname / date-in-given / no-surname / packed given names, with inline editable given+surname; fix the checked ones. No migration (uses existing columns). 55 backend tests pass (preview+apply for all three); frontend builds. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ from fastapi import APIRouter
|
||||
from app.api.v1 import (
|
||||
auth,
|
||||
citations,
|
||||
cleanup,
|
||||
events,
|
||||
gedcom,
|
||||
media,
|
||||
@@ -28,3 +29,4 @@ api_router.include_router(sources.router)
|
||||
api_router.include_router(citations.router)
|
||||
api_router.include_router(media.router)
|
||||
api_router.include_router(gedcom.router)
|
||||
api_router.include_router(cleanup.router)
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile
|
||||
|
||||
from app.api.deps import CurrentUser, SessionDep
|
||||
from app.schemas.cleanup import (
|
||||
CleanupResult,
|
||||
DeceasedApply,
|
||||
DeceasedCandidate,
|
||||
GenderApply,
|
||||
GenderProposal,
|
||||
NameApply,
|
||||
NameIssue,
|
||||
)
|
||||
from app.services import cleanup_service, tree_service
|
||||
|
||||
router = APIRouter(prefix="/trees", tags=["cleanup"])
|
||||
|
||||
|
||||
@router.get("/{tree_id}/cleanup/deceased", response_model=list[DeceasedCandidate])
|
||||
async def preview_deceased(
|
||||
tree_id: uuid.UUID,
|
||||
session: SessionDep,
|
||||
current: CurrentUser,
|
||||
born_on_or_before: int = 1930,
|
||||
) -> list[DeceasedCandidate]:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
rows = await cleanup_service.preview_deceased(
|
||||
session, actor=current, tree=tree, year=born_on_or_before
|
||||
)
|
||||
return [DeceasedCandidate(**r) for r in rows]
|
||||
|
||||
|
||||
@router.post("/{tree_id}/cleanup/deceased", response_model=CleanupResult)
|
||||
async def apply_deceased(
|
||||
tree_id: uuid.UUID, data: DeceasedApply, session: SessionDep, current: CurrentUser
|
||||
) -> CleanupResult:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
n = await cleanup_service.apply_deceased(
|
||||
session, actor=current, tree=tree, person_ids=data.person_ids
|
||||
)
|
||||
return CleanupResult(updated=n)
|
||||
|
||||
|
||||
@router.post("/{tree_id}/cleanup/gender/preview", response_model=list[GenderProposal])
|
||||
async def preview_gender(
|
||||
tree_id: uuid.UUID,
|
||||
session: SessionDep,
|
||||
current: CurrentUser,
|
||||
file: UploadFile = File(...),
|
||||
) -> list[GenderProposal]:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
text = (await file.read()).decode("utf-8", errors="replace")
|
||||
rows = await cleanup_service.preview_gender(
|
||||
session, actor=current, tree=tree, gedcom_text=text
|
||||
)
|
||||
return [GenderProposal(**r) for r in rows]
|
||||
|
||||
|
||||
@router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
|
||||
async def apply_gender(
|
||||
tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
|
||||
) -> CleanupResult:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
n = await cleanup_service.apply_gender(
|
||||
session,
|
||||
actor=current,
|
||||
tree=tree,
|
||||
updates=[u.model_dump() for u in data.updates],
|
||||
)
|
||||
return CleanupResult(updated=n)
|
||||
|
||||
|
||||
@router.get("/{tree_id}/cleanup/names", response_model=list[NameIssue])
|
||||
async def preview_names(
|
||||
tree_id: uuid.UUID, session: SessionDep, current: CurrentUser
|
||||
) -> list[NameIssue]:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
rows = await cleanup_service.preview_names(session, actor=current, tree=tree)
|
||||
return [NameIssue(**r) for r in rows]
|
||||
|
||||
|
||||
@router.post("/{tree_id}/cleanup/names", response_model=CleanupResult)
|
||||
async def apply_names(
|
||||
tree_id: uuid.UUID, data: NameApply, session: SessionDep, current: CurrentUser
|
||||
) -> CleanupResult:
|
||||
tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
|
||||
n = await cleanup_service.apply_names(
|
||||
session, actor=current, tree=tree, edits=[e.model_dump() for e in data.edits]
|
||||
)
|
||||
return CleanupResult(updated=n)
|
||||
@@ -0,0 +1,50 @@
|
||||
import uuid
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class DeceasedCandidate(BaseModel):
|
||||
person_id: uuid.UUID
|
||||
name: str
|
||||
birth_year: int
|
||||
|
||||
|
||||
class DeceasedApply(BaseModel):
|
||||
person_ids: list[uuid.UUID]
|
||||
|
||||
|
||||
class GenderProposal(BaseModel):
|
||||
person_id: uuid.UUID
|
||||
name: str
|
||||
proposed_gender: str
|
||||
|
||||
|
||||
class GenderUpdate(BaseModel):
|
||||
person_id: uuid.UUID
|
||||
gender: str
|
||||
|
||||
|
||||
class GenderApply(BaseModel):
|
||||
updates: list[GenderUpdate]
|
||||
|
||||
|
||||
class NameIssue(BaseModel):
|
||||
name_id: uuid.UUID
|
||||
person_id: uuid.UUID
|
||||
given: str | None = None
|
||||
surname: str | None = None
|
||||
issue: str
|
||||
|
||||
|
||||
class NameEdit(BaseModel):
|
||||
name_id: uuid.UUID
|
||||
given: str | None = None
|
||||
surname: str | None = None
|
||||
|
||||
|
||||
class NameApply(BaseModel):
|
||||
edits: list[NameEdit]
|
||||
|
||||
|
||||
class CleanupResult(BaseModel):
|
||||
updated: int
|
||||
@@ -0,0 +1,267 @@
|
||||
"""Bulk tree cleanup — preview/apply pairs for common import messes.
|
||||
|
||||
Per the project's #1 rule (the assistant proposes, humans approve), each fix has
|
||||
a *preview* that returns the proposed changes and an *apply* that commits only
|
||||
the ids/edits the user confirmed. Nothing here mutates without an explicit apply
|
||||
call carrying the user's selections.
|
||||
"""
|
||||
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.models.event import Event
|
||||
from app.models.person import Name, Person
|
||||
from app.models.tree import Tree
|
||||
from app.models.user import User
|
||||
from app.services import gedcom, privacy
|
||||
from app.services.audit import record_audit
|
||||
from app.services.exceptions import Forbidden, NotFound
|
||||
|
||||
|
||||
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
|
||||
if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree):
|
||||
raise Forbidden("not an editor of this tree")
|
||||
|
||||
|
||||
async def _persons(session: AsyncSession, tree_id: uuid.UUID) -> list[Person]:
|
||||
return list(
|
||||
(
|
||||
await session.execute(
|
||||
select(Person).where(Person.tree_id == tree_id, Person.deleted_at.is_(None))
|
||||
)
|
||||
).scalars().all()
|
||||
)
|
||||
|
||||
|
||||
async def _primary_name_by_person(
|
||||
session: AsyncSession, tree_id: uuid.UUID
|
||||
) -> dict[uuid.UUID, Name]:
|
||||
names = (
|
||||
await session.execute(
|
||||
select(Name)
|
||||
.where(Name.tree_id == tree_id, Name.deleted_at.is_(None))
|
||||
.order_by(Name.is_primary.desc(), Name.sort_order)
|
||||
)
|
||||
).scalars().all()
|
||||
out: dict[uuid.UUID, Name] = {}
|
||||
for n in names:
|
||||
out.setdefault(n.person_id, n)
|
||||
return out
|
||||
|
||||
|
||||
async def _birth_year_by_person(session: AsyncSession, tree_id: uuid.UUID) -> dict[uuid.UUID, int]:
|
||||
evs = (
|
||||
await session.execute(
|
||||
select(Event).where(
|
||||
Event.tree_id == tree_id,
|
||||
Event.deleted_at.is_(None),
|
||||
Event.event_type == "birth",
|
||||
)
|
||||
)
|
||||
).scalars().all()
|
||||
out: dict[uuid.UUID, int] = {}
|
||||
for e in evs:
|
||||
if not e.person_id or e.person_id in out:
|
||||
continue
|
||||
y = e.date_start.year if e.date_start else None
|
||||
if y is None:
|
||||
ys = gedcom._year(e.date_value)
|
||||
y = int(ys) if ys else None
|
||||
if y is not None:
|
||||
out[e.person_id] = y
|
||||
return out
|
||||
|
||||
|
||||
def _display(n: Name | None) -> str:
|
||||
if n is None:
|
||||
return "Unnamed"
|
||||
return " ".join(x for x in (n.given, n.surname) if x) or (n.display_name or "Unnamed")
|
||||
|
||||
|
||||
# ---- 1. Mark deceased by birth year -------------------------------------------------
|
||||
|
||||
async def preview_deceased(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, year: int
|
||||
) -> list[dict]:
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
names = await _primary_name_by_person(session, tree.id)
|
||||
years = await _birth_year_by_person(session, tree.id)
|
||||
out: list[dict] = []
|
||||
for p in await _persons(session, tree.id):
|
||||
if p.is_living is False: # already deceased
|
||||
continue
|
||||
by = years.get(p.id)
|
||||
if by is not None and by <= year:
|
||||
out.append(
|
||||
{"person_id": str(p.id), "name": _display(names.get(p.id)), "birth_year": by}
|
||||
)
|
||||
out.sort(key=lambda r: r["birth_year"])
|
||||
return out
|
||||
|
||||
|
||||
async def apply_deceased(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, person_ids: list[uuid.UUID]
|
||||
) -> int:
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
persons = (
|
||||
await session.execute(
|
||||
select(Person).where(
|
||||
Person.tree_id == tree.id,
|
||||
Person.deleted_at.is_(None),
|
||||
Person.id.in_(person_ids),
|
||||
)
|
||||
)
|
||||
).scalars().all()
|
||||
for p in persons:
|
||||
p.is_living = False
|
||||
record_audit(
|
||||
session,
|
||||
action="cleanup_deceased",
|
||||
entity_type="Tree",
|
||||
entity_id=tree.id,
|
||||
tree_id=tree.id,
|
||||
actor_user_id=actor.id,
|
||||
after={"count": len(persons)},
|
||||
)
|
||||
await session.commit()
|
||||
return len(persons)
|
||||
|
||||
|
||||
# ---- 2. Re-derive gender from a source GEDCOM (matches by name) ----------------------
|
||||
|
||||
async def preview_gender(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, gedcom_text: str
|
||||
) -> list[dict]:
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
name2sex: dict[str, str] = {}
|
||||
for rec in gedcom.parse_records(gedcom_text):
|
||||
if rec.tag != "INDI":
|
||||
continue
|
||||
summ = gedcom._person_summary(rec)
|
||||
sex = gedcom._sex(rec.text("SEX"))
|
||||
if sex and summ["norm"]:
|
||||
name2sex.setdefault(summ["norm"], sex)
|
||||
|
||||
names = await _primary_name_by_person(session, tree.id)
|
||||
out: list[dict] = []
|
||||
for p in await _persons(session, tree.id):
|
||||
if p.gender: # only fill in what's missing
|
||||
continue
|
||||
nm = names.get(p.id)
|
||||
if nm is None:
|
||||
continue
|
||||
proposed = name2sex.get(gedcom._norm(nm.given, nm.surname))
|
||||
if proposed:
|
||||
out.append({"person_id": str(p.id), "name": _display(nm), "proposed_gender": proposed})
|
||||
out.sort(key=lambda r: r["name"])
|
||||
return out
|
||||
|
||||
|
||||
async def apply_gender(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
|
||||
) -> int:
|
||||
"""updates: [{person_id, gender}]."""
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
wanted = {uuid.UUID(str(u["person_id"])): u["gender"] for u in updates if u.get("gender")}
|
||||
persons = (
|
||||
await session.execute(
|
||||
select(Person).where(
|
||||
Person.tree_id == tree.id,
|
||||
Person.deleted_at.is_(None),
|
||||
Person.id.in_(wanted.keys()),
|
||||
)
|
||||
)
|
||||
).scalars().all()
|
||||
for p in persons:
|
||||
p.gender = wanted[p.id]
|
||||
record_audit(
|
||||
session,
|
||||
action="cleanup_gender",
|
||||
entity_type="Tree",
|
||||
entity_id=tree.id,
|
||||
tree_id=tree.id,
|
||||
actor_user_id=actor.id,
|
||||
after={"count": len(persons)},
|
||||
)
|
||||
await session.commit()
|
||||
return len(persons)
|
||||
|
||||
|
||||
# ---- 3. Flag malformed names for review --------------------------------------------
|
||||
|
||||
_YEAR_RE = re.compile(r"\b\d{3,4}\b")
|
||||
|
||||
|
||||
def _name_issue(n: Name) -> str | None:
|
||||
given = (n.given or "").strip()
|
||||
surname = (n.surname or "").strip()
|
||||
if _YEAR_RE.search(surname) or re.search(r"\d", surname):
|
||||
return "date_in_surname"
|
||||
if re.search(r"\d", given):
|
||||
return "date_in_given"
|
||||
# A given name with many tokens often means a maiden+married name was packed
|
||||
# in (e.g. "Mary Smith Jones") — surface it for a human to split.
|
||||
if surname == "" and len(given.split()) >= 2:
|
||||
return "no_surname"
|
||||
if len(given.split()) >= 3:
|
||||
return "packed_given"
|
||||
return None
|
||||
|
||||
|
||||
async def preview_names(session: AsyncSession, *, actor: User, tree: Tree) -> list[dict]:
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
names = (
|
||||
await session.execute(
|
||||
select(Name).where(Name.tree_id == tree.id, Name.deleted_at.is_(None))
|
||||
)
|
||||
).scalars().all()
|
||||
out: list[dict] = []
|
||||
for n in names:
|
||||
issue = _name_issue(n)
|
||||
if issue:
|
||||
out.append({
|
||||
"name_id": str(n.id),
|
||||
"person_id": str(n.person_id),
|
||||
"given": n.given,
|
||||
"surname": n.surname,
|
||||
"issue": issue,
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
async def apply_names(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, edits: list[dict]
|
||||
) -> int:
|
||||
"""edits: [{name_id, given, surname}] — the user's corrected values."""
|
||||
await _require_editor(session, actor=actor, tree=tree)
|
||||
by_id = {uuid.UUID(str(e["name_id"])): e for e in edits}
|
||||
rows = (
|
||||
await session.execute(
|
||||
select(Name).where(
|
||||
Name.tree_id == tree.id,
|
||||
Name.deleted_at.is_(None),
|
||||
Name.id.in_(by_id.keys()),
|
||||
)
|
||||
)
|
||||
).scalars().all()
|
||||
if len(rows) != len(by_id):
|
||||
raise NotFound("one or more names not found in this tree")
|
||||
for n in rows:
|
||||
e = by_id[n.id]
|
||||
n.given = (e.get("given") or "").strip() or None
|
||||
n.surname = (e.get("surname") or "").strip() or None
|
||||
n.display_name = None # rebuild from parts
|
||||
record_audit(
|
||||
session,
|
||||
action="cleanup_names",
|
||||
entity_type="Tree",
|
||||
entity_id=tree.id,
|
||||
tree_id=tree.id,
|
||||
actor_user_id=actor.id,
|
||||
after={"count": len(rows)},
|
||||
)
|
||||
await session.commit()
|
||||
return len(rows)
|
||||
@@ -0,0 +1,107 @@
|
||||
"""Tree cleanup: preview/apply for deceased-by-year, gender-from-source, names."""
|
||||
|
||||
from tests.conftest import auth, register
|
||||
|
||||
|
||||
async def _tree(client, email):
|
||||
h = auth(await register(client, email))
|
||||
tid = (await client.post("/api/v1/trees", json={"name": "T"}, headers=h)).json()["id"]
|
||||
return h, tid
|
||||
|
||||
|
||||
async def _person(client, h, tid, given, surname=None):
|
||||
return (
|
||||
await client.post(
|
||||
f"/api/v1/trees/{tid}/persons", json={"given": given, "surname": surname}, headers=h
|
||||
)
|
||||
).json()["id"]
|
||||
|
||||
|
||||
async def _birth(client, h, tid, pid, year):
|
||||
await client.post(
|
||||
f"/api/v1/trees/{tid}/events",
|
||||
json={"event_type": "birth", "person_id": pid, "date_value": str(year)},
|
||||
headers=h,
|
||||
)
|
||||
|
||||
|
||||
async def test_deceased_preview_and_apply(client):
|
||||
h, tid = await _tree(client, "cl-dec@example.com")
|
||||
old = await _person(client, h, tid, "Josias", "Moody")
|
||||
young = await _person(client, h, tid, "Kid", "Moody")
|
||||
await _birth(client, h, tid, old, 1900)
|
||||
await _birth(client, h, tid, young, 1990)
|
||||
|
||||
prev = (
|
||||
await client.get(f"/api/v1/trees/{tid}/cleanup/deceased?born_on_or_before=1930", headers=h)
|
||||
).json()
|
||||
assert [r["person_id"] for r in prev] == [old]
|
||||
|
||||
r = await client.post(
|
||||
f"/api/v1/trees/{tid}/cleanup/deceased", json={"person_ids": [old]}, headers=h
|
||||
)
|
||||
assert r.status_code == 200 and r.json()["updated"] == 1
|
||||
assert (
|
||||
await client.get(f"/api/v1/trees/{tid}/persons/{old}", headers=h)
|
||||
).json()["is_living"] is False
|
||||
# Re-preview no longer lists the now-deceased person.
|
||||
prev2 = (
|
||||
await client.get(f"/api/v1/trees/{tid}/cleanup/deceased?born_on_or_before=1930", headers=h)
|
||||
).json()
|
||||
assert old not in [r["person_id"] for r in prev2]
|
||||
|
||||
|
||||
GED = b"""0 HEAD
|
||||
0 @I1@ INDI
|
||||
1 NAME Josias /Moody/
|
||||
1 SEX M
|
||||
0 @I2@ INDI
|
||||
1 NAME Flora /Paul/
|
||||
1 SEX F
|
||||
0 TRLR
|
||||
"""
|
||||
|
||||
|
||||
async def test_gender_from_source(client):
|
||||
h, tid = await _tree(client, "cl-gen@example.com")
|
||||
await _person(client, h, tid, "Josias", "Moody")
|
||||
await _person(client, h, tid, "Flora", "Paul")
|
||||
await _person(client, h, tid, "Nobody", "Else") # not in source
|
||||
|
||||
prev = await client.post(
|
||||
f"/api/v1/trees/{tid}/cleanup/gender/preview",
|
||||
files={"file": ("src.ged", GED, "text/plain")},
|
||||
headers=h,
|
||||
)
|
||||
props = prev.json()
|
||||
by_name = {p["name"]: p["proposed_gender"] for p in props}
|
||||
assert by_name == {"Josias Moody": "male", "Flora Paul": "female"}
|
||||
|
||||
updates = [{"person_id": p["person_id"], "gender": p["proposed_gender"]} for p in props]
|
||||
r = await client.post(
|
||||
f"/api/v1/trees/{tid}/cleanup/gender", json={"updates": updates}, headers=h
|
||||
)
|
||||
assert r.status_code == 200 and r.json()["updated"] == 2
|
||||
people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()
|
||||
genders = {p["primary_name"]: p["gender"] for p in people}
|
||||
assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
|
||||
|
||||
|
||||
async def test_name_issues_preview_and_fix(client):
|
||||
h, tid = await _tree(client, "cl-name@example.com")
|
||||
# surname got a date; real surname landed in the given name.
|
||||
bad = await _person(client, h, tid, "Henry Paul", "1859")
|
||||
await _person(client, h, tid, "Normal", "Person") # should not be flagged
|
||||
|
||||
issues = (await client.get(f"/api/v1/trees/{tid}/cleanup/names", headers=h)).json()
|
||||
assert len(issues) == 1 and issues[0]["issue"] == "date_in_surname"
|
||||
name_id = issues[0]["name_id"]
|
||||
|
||||
r = await client.post(
|
||||
f"/api/v1/trees/{tid}/cleanup/names",
|
||||
json={"edits": [{"name_id": name_id, "given": "Henry", "surname": "Paul"}]},
|
||||
headers=h,
|
||||
)
|
||||
assert r.status_code == 200 and r.json()["updated"] == 1
|
||||
person = (await client.get(f"/api/v1/trees/{tid}/persons/{bad}", headers=h)).json()
|
||||
assert person["primary_name"] == "Henry Paul"
|
||||
Reference in New Issue
Block a user