From 5824e7089508ced181947f36f17d47305e41fe8a Mon Sep 17 00:00:00 2001
From: Justin Paul
Date: Sun, 7 Jun 2026 10:35:55 -0400
Subject: [PATCH] GEDCOM: duplicate-aware import + typed name/attribute mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Duplicate detection (the "merge / skip / overwrite" the user asked for):
- New POST /gedcom/preview dry-runs the file and flags incoming people that
  resemble existing ones (name similarity via difflib + birth-year guard;
  high/medium score). No writes.
- /gedcom/import takes default_action (new|skip|merge|overwrite) + per-xref
  resolutions {xref: {action, target_id}}:
    new        create as a new person (current behavior)
    skip       link families to the existing person, copy nothing
    merge      attach the incoming names (as alternates), events, citations,
               and notes onto the existing person
    overwrite  soft-delete the existing person, import the incoming one fresh
  Relationship creation is deduped so a merge can't double an edge.

Richer record mapping (covers the user's repo's GEDCOM):
- Multiple NAME records honor their TYPE; _MARNM imports as a typed alternate
  name — maiden stays primary, married becomes a "married" Name — and NICK is
  kept as the nickname on its NAME record.
- RELI -> a "religion" event with the value in detail; OCCU/EDUC values too.
- NOTE -> person notes (and event notes); NOTE/RELI are no longer "unmapped".
- Export round-trips name TYPE.

Verified against the user's 2185-person export: 0 unmapped tags. 48 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/app/api/v1/gedcom.py | 39 +- backend/app/schemas/gedcom.py | 19 + backend/app/services/gedcom.py | 450 +++++++++++++++++++++--- backend/tests/test_gedcom.py | 106 ++++++ frontend/app/trees/[id]/gedcom/page.tsx | 247 +++++++++++-- frontend/lib/api/schema.d.ts | 109 +++++- frontend/openapi.json | 167 +++++++++ 7 files changed, 1047 insertions(+), 90 deletions(-) diff --git a/backend/app/api/v1/gedcom.py b/backend/app/api/v1/gedcom.py index 9dd2803..c81f6be 100644 --- a/backend/app/api/v1/gedcom.py +++ b/backend/app/api/v1/gedcom.py @@ -1,25 +1,56 @@ +import json import uuid -from fastapi import APIRouter, File, Response, UploadFile +from fastapi import APIRouter, File, Form, Response, UploadFile from app.api.deps import CurrentUser, SessionDep -from app.schemas.gedcom import ImportReport +from app.schemas.gedcom import ImportPreview, ImportReport from app.services import gedcom, tree_service router = APIRouter(prefix="/trees", tags=["gedcom"]) +@router.post("/{tree_id}/gedcom/preview", response_model=ImportPreview) +async def preview_gedcom( + tree_id: uuid.UUID, + session: SessionDep, + current: CurrentUser, + file: UploadFile = File(...), +) -> ImportPreview: + """Dry run: report counts and incoming people that look like duplicates of + existing ones, so the user can choose how to resolve each before importing.""" + tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id) + text = (await file.read()).decode("utf-8", errors="replace") + report = await gedcom.preview_gedcom(session, actor=current, tree=tree, text=text) + return ImportPreview(**report) + + @router.post("/{tree_id}/gedcom/import", response_model=ImportReport) async def import_gedcom( tree_id: uuid.UUID, session: SessionDep, current: CurrentUser, file: UploadFile = File(...), + default_action: str = Form("new"), + resolutions: str = Form("{}"), ) -> ImportReport: - # NOTE: additive — records are created as new; existing 
people are not merged. + """Import a GEDCOM. ``default_action`` (new|skip|merge|overwrite) applies to + incoming people that match an existing one; ``resolutions`` is a JSON object + {xref: {action, target_id}} overriding it per record.""" tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id) text = (await file.read()).decode("utf-8", errors="replace") - report = await gedcom.import_gedcom(session, actor=current, tree=tree, text=text) + try: + parsed = json.loads(resolutions or "{}") + except json.JSONDecodeError: + parsed = {} + report = await gedcom.import_gedcom( + session, + actor=current, + tree=tree, + text=text, + default_action=default_action, + resolutions=parsed, + ) return ImportReport(**report) diff --git a/backend/app/schemas/gedcom.py b/backend/app/schemas/gedcom.py index ed19cdd..b5a06b5 100644 --- a/backend/app/schemas/gedcom.py +++ b/backend/app/schemas/gedcom.py @@ -1,6 +1,25 @@ +import uuid + from pydantic import BaseModel class ImportReport(BaseModel): counts: dict[str, int] unmapped_tags: list[str] + + +class DuplicateMatch(BaseModel): + # An incoming GEDCOM person that resembles an existing one in the tree. + xref: str + incoming_name: str + incoming_birth_year: str | None = None + existing_person_id: uuid.UUID + existing_name: str + existing_birth_year: str | None = None + score: str # "high" | "medium" + + +class ImportPreview(BaseModel): + counts: dict[str, int] + potential_duplicates: list[DuplicateMatch] + unmapped_tags: list[str] diff --git a/backend/app/services/gedcom.py b/backend/app/services/gedcom.py index 698c89e..17b15ac 100644 --- a/backend/app/services/gedcom.py +++ b/backend/app/services/gedcom.py @@ -4,14 +4,20 @@ A pragmatic parser + mapper for the common subset of GEDCOM (5.5.1 / 7 share the line grammar): INDI, FAM, SOUR. Import maps records into a tree and returns a mapping report (counts + unmapped tags); export serializes the tree back to GEDCOM. 
Runs inline for now — large files should move to the worker later. + +Import is duplicate-aware: ``preview_gedcom`` reports incoming people that look +like existing ones, and ``import_gedcom`` applies a per-record resolution +(new / skip / merge / overwrite). Names carry their GEDCOM type (a married name +imports as a typed alternate, not a second primary). """ import re import uuid from collections import defaultdict -from datetime import date +from datetime import UTC, date, datetime +from difflib import SequenceMatcher -from sqlalchemy import select +from sqlalchemy import or_, select, update from sqlalchemy.ext.asyncio import AsyncSession from app.models.enums import ParentChildQualifier, RelationshipType @@ -32,12 +38,31 @@ INDI_EVENTS = { "BURI": "burial", "CREM": "cremation", "RESI": "residence", "CENS": "census", "IMMI": "immigration", "EMIG": "emigration", "OCCU": "occupation", "EDUC": "education", "GRAD": "graduation", "RETI": "retirement", - "NATU": "naturalization", "BAPL": "baptism", + "NATU": "naturalization", "BAPL": "baptism", "RELI": "religion", +} +# INDI attribute tags whose line VALUE is the fact (no date), stored in detail. +VALUE_EVENTS = {"RELI", "OCCU", "EDUC"} +# INDI sub-tags consumed elsewhere or intentionally ignored (not "unmapped"). +INDI_SKIP_TAGS = { + "NAME", "SEX", "SOUR", "FAMC", "FAMS", "CHAN", "OBJE", "_UID", "_MARNM", "NOTE", } # FAM-level events. FAM_EVENTS = {"MARR": "marriage", "DIV": "divorce", "ENGA": "engagement"} EVENT_TO_GED = {v: k for k, v in {**INDI_EVENTS, **FAM_EVENTS}.items()} +# GEDCOM NAME TYPE (or _MARNM-derived) -> our Name.name_type vocabulary. +NAME_TYPE_MAP = { + "birth": "birth", "maiden": "birth", "married": "married", + "aka": "alias", "also known as": "alias", "nickname": "nickname", + "religious": "religious", "immigrant": "immigration", + "immigration": "immigration", "professional": "alias", "other": "alias", +} +# Our type -> GEDCOM TYPE on export (birth is the default; emit nothing). 
+EXPORT_TYPE_MAP = { + "married": "married", "alias": "aka", "nickname": "nickname", + "religious": "religious", "immigration": "immigrant", +} + class GedcomNode: __slots__ = ("level", "tag", "value", "xref", "children") @@ -108,6 +133,50 @@ def _parse_name(value: str) -> tuple[str | None, str | None]: return value.strip() or None, None +def _parse_marnm(value: str, base_given: str | None) -> tuple[str | None, str | None]: + """A _MARNM value is sometimes a full name ("Jane /Smith/") and sometimes + just the married surname ("Smith"). Keep the given name from the base name + in the latter case.""" + v = (value or "").strip() + if "/" in v: + g, s = _parse_name(v) + return (g or base_given), s + return base_given, (v or None) + + +def _extract_names(rec: GedcomNode) -> list[dict]: + """All names for an INDI, typed. Multiple NAME records (each with an optional + TYPE) plus any _MARNM (married name) subtags become separate Name rows. The + first birth/maiden name is primary.""" + out: list[dict] = [] + for nm in rec.all("NAME"): + g, s = _parse_name(nm.value) + t = (nm.text("TYPE") or "").strip().lower() + ntype = NAME_TYPE_MAP.get(t, t or "birth") + out.append({"type": ntype, "given": g, "surname": s, "display": nm.value or None, + "nickname": nm.text("NICK")}) + for mar in nm.all("_MARNM"): + mg, ms = _parse_marnm(mar.value, g) + out.append({"type": "married", "given": mg, "surname": ms, + "display": mar.value or None, "nickname": None}) + for mar in rec.all("_MARNM"): + base_g = out[0]["given"] if out else None + mg, ms = _parse_marnm(mar.value, base_g) + out.append({"type": "married", "given": mg, "surname": ms, + "display": mar.value or None, "nickname": None}) + if not out: + return out + primary_idx = next((i for i, n in enumerate(out) if n["type"] == "birth"), 0) + for i, n in enumerate(out): + n["is_primary"] = i == primary_idx + n["sort"] = i + return out + + +def _norm(given: str | None, surname: str | None) -> str: + return re.sub(r"\s+", " ", f"{given or 
''} {surname or ''}".strip().lower()) + + def _year(date_value: str | None) -> str | None: if not date_value: return None @@ -132,18 +201,215 @@ def _sex(value: str | None) -> str | None: return {"M": "male", "F": "female"}.get(v, value.strip().lower() or None) +def _notes_text(rec: GedcomNode) -> str | None: + """Join an INDI's NOTE lines (which pack confidence / findagrave / fs_pid / + free text) into the person's notes field.""" + vals = [n.value.strip() for n in rec.all("NOTE") if n.value and n.value.strip()] + return "\n".join(vals) or None + + +def _person_summary(rec: GedcomNode) -> dict: + """Display name + birth year for an incoming INDI, for duplicate matching.""" + names = _extract_names(rec) + primary = next((n for n in names if n.get("is_primary")), names[0] if names else None) + g = primary["given"] if primary else None + s = primary["surname"] if primary else None + disp = " ".join(x for x in (g, s) if x) + if not disp and primary: + disp = primary.get("display") or "" + birth = rec.first("BIRT") + year = _year(birth.text("DATE")) if birth else None + return {"names": names, "norm": _norm(g, s), "name": disp or "(no name)", "year": year} + + +async def _build_existing_index(session: AsyncSession, tree: Tree) -> list[dict]: + """Existing (non-deleted) people with a display name + birth year, for + matching incoming records against.""" + persons = list( + ( + await session.execute( + select(Person).where(Person.tree_id == tree.id, Person.deleted_at.is_(None)) + ) + ).scalars().all() + ) + names = list( + ( + await session.execute( + select(Name).where(Name.tree_id == tree.id, Name.deleted_at.is_(None)) + ) + ).scalars().all() + ) + name_by_person: dict[uuid.UUID, Name] = {} + for n in sorted(names, key=lambda n: (not n.is_primary, n.sort_order)): + name_by_person.setdefault(n.person_id, n) + births = list( + ( + await session.execute( + select(Event).where( + Event.tree_id == tree.id, + Event.deleted_at.is_(None), + Event.event_type == "birth", + ) + ) 
+ ).scalars().all() + ) + year_by_person: dict[uuid.UUID, str] = {} + for e in births: + if e.person_id and e.person_id not in year_by_person: + y = str(e.date_start.year) if e.date_start else _year(e.date_value) + if y: + year_by_person[e.person_id] = y + + index: list[dict] = [] + for p in persons: + nm = name_by_person.get(p.id) + g = nm.given if nm else None + s = nm.surname if nm else None + disp = " ".join(x for x in (g, s) if x) or (nm.display_name if nm else None) + index.append({ + "id": p.id, + "norm": _norm(g, s), + "name": disp or "(no name)", + "year": year_by_person.get(p.id), + }) + return index + + +def _best_match(norm: str, year: str | None, index: list[dict]) -> tuple[dict | None, str | None]: + """Closest existing person by name similarity, rejecting clear birth-year + conflicts. Returns (entry, "high"|"medium") or (None, None).""" + if not norm: + return None, None + best: dict | None = None + best_r = 0.0 + for e in index: + if not e["norm"]: + continue + r = SequenceMatcher(None, norm, e["norm"]).ratio() + if r < 0.88: + continue + if year and e["year"] and abs(int(year) - int(e["year"])) > 1: + continue # same-ish name but different birth year — not a duplicate + if r > best_r: + best_r = r + best = e + if best is None: + return None, None + year_match = bool(year and best["year"] and abs(int(year) - int(best["year"])) <= 1) + both_unknown = not year and not best["year"] + score = "high" if best_r >= 0.93 and (year_match or both_unknown) else "medium" + return best, score + + +def _relkey(rtype: RelationshipType, a: uuid.UUID, b: uuid.UUID) -> tuple: + if rtype == RelationshipType.parent_child: + return ("pc", str(a), str(b)) + return (rtype.value, *sorted([str(a), str(b)])) + + +def _count_incoming(roots: list[GedcomNode]) -> tuple[dict, list[str]]: + counts: dict[str, int] = defaultdict(int) + unmapped: set[str] = set() + for rec in roots: + if rec.tag == "INDI" and rec.xref: + counts["persons"] += 1 + counts["names"] += 
len(_extract_names(rec)) + for child in rec.children: + if child.tag in INDI_EVENTS: + counts["events"] += 1 + elif child.tag not in INDI_SKIP_TAGS: + unmapped.add(child.tag) + elif rec.tag == "FAM": + counts["families"] += 1 + for child in rec.children: + if child.tag in FAM_EVENTS: + counts["events"] += 1 + elif rec.tag == "SOUR" and rec.xref: + counts["sources"] += 1 + return dict(counts), sorted(unmapped) + + +async def preview_gedcom(session: AsyncSession, *, actor: User, tree: Tree, text: str) -> dict: + """Dry run: what would import, and which incoming people look like existing + ones. No writes.""" + if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree): + raise Forbidden("not an editor of this tree") + roots = parse_records(text) + counts, unmapped = _count_incoming(roots) + index = await _build_existing_index(session, tree) + + duplicates: list[dict] = [] + for rec in roots: + if rec.tag != "INDI" or not rec.xref: + continue + summ = _person_summary(rec) + entry, score = _best_match(summ["norm"], summ["year"], index) + if entry is None: + continue + duplicates.append({ + "xref": rec.xref, + "incoming_name": summ["name"], + "incoming_birth_year": summ["year"], + "existing_person_id": entry["id"], + "existing_name": entry["name"], + "existing_birth_year": entry["year"], + "score": score, + }) + return {"counts": counts, "potential_duplicates": duplicates, "unmapped_tags": unmapped} + + async def import_gedcom( - session: AsyncSession, *, actor: User, tree: Tree, text: str + session: AsyncSession, + *, + actor: User, + tree: Tree, + text: str, + default_action: str = "new", + resolutions: dict | None = None, ) -> dict: + """Import records. ``default_action`` (new|skip|merge|overwrite) applies to + incoming people that match an existing one; ``resolutions`` overrides it per + GEDCOM xref ({xref: {action, target_id}}). 
'skip' links families to the + existing person but copies nothing; 'merge' also copies the incoming names + (as alternates), events and citations onto them; 'overwrite' deletes the + existing person and imports the incoming one fresh.""" if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree): raise Forbidden("not an editor of this tree") + resolutions = resolutions or {} roots = parse_records(text) - counts = defaultdict(int) + counts: dict[str, int] = defaultdict(int) unmapped: set[str] = set() place_cache: dict[str, uuid.UUID] = {} source_map: dict[str, uuid.UUID] = {} person_map: dict[str, uuid.UUID] = {} + now = datetime.now(UTC) + + index = await _build_existing_index(session, tree) + + # Pre-load existing relationship keys so a merge doesn't create dup edges. + existing_rels = list( + ( + await session.execute( + select(Relationship).where( + Relationship.tree_id == tree.id, Relationship.deleted_at.is_(None) + ) + ) + ).scalars().all() + ) + rel_keys = {_relkey(r.type, r.person_from_id, r.person_to_id) for r in existing_rels} + + def add_relationship( + rtype: RelationshipType, a: uuid.UUID, b: uuid.UUID, **kw + ) -> Relationship | None: + key = _relkey(rtype, a, b) + if key in rel_keys: + return None + rel = Relationship(tree_id=tree.id, type=rtype, person_from_id=a, person_to_id=b, **kw) + session.add(rel) + rel_keys.add(key) + counts["relationships"] += 1 + return rel async def place_id(name: str | None) -> uuid.UUID | None: if not name: @@ -177,59 +443,139 @@ async def import_gedcom( sid = source_map.get(s.value.strip()) if sid is None: continue - session.add( - Citation(tree_id=tree.id, source_id=sid, page=s.text("PAGE"), **target) - ) + session.add(Citation(tree_id=tree.id, source_id=sid, page=s.text("PAGE"), **target)) counts["citations"] += 1 - # Individuals. 
- for rec in roots: - if rec.tag != "INDI" or not rec.xref: - continue - person = Person(tree_id=tree.id, gender=_sex(rec.text("SEX"))) - session.add(person) - await session.flush() - person_map[rec.xref] = person.id - counts["persons"] += 1 - - for i, nm in enumerate(rec.all("NAME")): - given, surname = _parse_name(nm.value) + def add_names(person_id: uuid.UUID, names: list[dict], *, set_primary: bool) -> None: + for nd in names: session.add( Name( tree_id=tree.id, - person_id=person.id, - name_type="birth", - given=given, - surname=surname, - display_name=nm.value or None, - is_primary=(i == 0), - sort_order=i, + person_id=person_id, + name_type=nd["type"], + given=nd["given"], + surname=nd["surname"], + nickname=nd.get("nickname"), + display_name=nd.get("display"), + is_primary=set_primary and nd.get("is_primary", False), + sort_order=nd.get("sort", 0), ) ) counts["names"] += 1 - await add_citations(rec, person_id=person.id) - + async def add_events(rec: GedcomNode, person_id: uuid.UUID) -> None: for child in rec.children: if child.tag in INDI_EVENTS: dv = child.text("DATE") + # Attribute-style facts (RELI, OCCU, EDUC) carry their value on + # the line itself; store it in detail. 
+ detail = child.value.strip() if child.tag in VALUE_EVENTS else None ev = Event( tree_id=tree.id, - person_id=person.id, + person_id=person_id, event_type=INDI_EVENTS[child.tag], date_value=dv, date_start=_date_start(dv), place_id=await place_id(child.text("PLAC")), + detail=detail or None, + notes=child.text("NOTE"), ) session.add(ev) await session.flush() counts["events"] += 1 await add_citations(child, event_id=ev.id) - elif child.tag in ("NAME", "SEX", "SOUR", "FAMC", "FAMS", "CHAN", "OBJE", "_UID"): + elif child.tag in INDI_SKIP_TAGS: continue else: unmapped.add(child.tag) + async def soft_delete_existing(person_id: uuid.UUID) -> None: + p = ( + await session.execute( + select(Person).where(Person.id == person_id, Person.deleted_at.is_(None)) + ) + ).scalar_one_or_none() + if p is None: + return + p.deleted_at = now + rels = ( + await session.execute( + select(Relationship).where( + Relationship.tree_id == tree.id, + Relationship.deleted_at.is_(None), + or_( + Relationship.person_from_id == person_id, + Relationship.person_to_id == person_id, + ), + ) + ) + ).scalars().all() + for r in rels: + r.deleted_at = now + await session.execute( + update(User).where(User.self_person_id == person_id).values(self_person_id=None) + ) + + # Precompute the best match per incoming xref (for default-policy resolution). 
+ matches: dict[str, dict] = {} + for rec in roots: + if rec.tag == "INDI" and rec.xref: + summ = _person_summary(rec) + entry, _score = _best_match(summ["norm"], summ["year"], index) + if entry is not None: + matches[rec.xref] = entry + + def resolve(xref: str) -> tuple[str, uuid.UUID | None]: + ov = resolutions.get(xref) + if ov: + action = ov.get("action", "new") + tid = ov.get("target_id") + target = uuid.UUID(tid) if tid else (matches[xref]["id"] if xref in matches else None) + if action in ("skip", "merge", "overwrite") and target is None: + return "new", None + return action, target + if default_action != "new" and xref in matches: + return default_action, matches[xref]["id"] + return "new", None + + # Individuals. + for rec in roots: + if rec.tag != "INDI" or not rec.xref: + continue + names = _extract_names(rec) + action, target = resolve(rec.xref) + + if action == "skip" and target is not None: + person_map[rec.xref] = target + counts["skipped"] += 1 + continue + if action == "merge" and target is not None: + person_map[rec.xref] = target + add_names(target, names, set_primary=False) + await add_events(rec, target) + await add_citations(rec, person_id=target) + note = _notes_text(rec) + if note: + existing = ( + await session.execute(select(Person).where(Person.id == target)) + ).scalar_one_or_none() + if existing is not None: + existing.notes = "\n".join(filter(None, [existing.notes, note])) + counts["merged"] += 1 + continue + if action == "overwrite" and target is not None: + await soft_delete_existing(target) + counts["overwritten"] += 1 + + person = Person(tree_id=tree.id, gender=_sex(rec.text("SEX")), notes=_notes_text(rec)) + session.add(person) + await session.flush() + person_map[rec.xref] = person.id + counts["persons"] += 1 + add_names(person.id, names, set_primary=True) + await add_citations(rec, person_id=person.id) + await add_events(rec, person.id) + # Families -> partnerships, parent-child edges, marriage events. 
for rec in roots: if rec.tag != "FAM": @@ -238,17 +584,22 @@ async def import_gedcom( husb = person_map.get((rec.text("HUSB") or "").strip()) wife = person_map.get((rec.text("WIFE") or "").strip()) partnership_id: uuid.UUID | None = None - if husb and wife: - rel = Relationship( - tree_id=tree.id, - type=RelationshipType.partnership, - person_from_id=husb, - person_to_id=wife, + if husb and wife and husb != wife: + rel = add_relationship(RelationshipType.partnership, husb, wife) + if rel is not None: + await session.flush() + partnership_id = rel.id + if partnership_id is None and husb and wife: + # Edge already existed — find it so marriage events can attach. + existing = next( + ( + r for r in existing_rels + if r.type == RelationshipType.partnership + and {r.person_from_id, r.person_to_id} == {husb, wife} + ), + None, ) - session.add(rel) - await session.flush() - partnership_id = rel.id - counts["relationships"] += 1 + partnership_id = existing.id if existing else None for fe in rec.children: if fe.tag in FAM_EVENTS and partnership_id is not None: @@ -271,16 +622,12 @@ async def import_gedcom( continue for parent in (husb, wife): if parent and parent != cp: - session.add( - Relationship( - tree_id=tree.id, - type=RelationshipType.parent_child, - person_from_id=parent, - person_to_id=cp, - qualifier=ParentChildQualifier.biological, - ) + add_relationship( + RelationshipType.parent_child, + parent, + cp, + qualifier=ParentChildQualifier.biological, ) - counts["relationships"] += 1 record_audit( session, @@ -397,6 +744,9 @@ async def export_gedcom(session: AsyncSession, *, viewer_id: uuid.UUID, tree: Tr for n in names_by_person.get(p.id, []): display = n.display_name or f"{n.given or ''} /{n.surname or ''}/".strip() out.append(f"1 NAME {display}") + ged_type = EXPORT_TYPE_MAP.get(n.name_type) + if ged_type: + out.append(f"2 TYPE {ged_type}") sex = {"male": "M", "female": "F"}.get(p.gender or "") if sex: out.append(f"1 SEX {sex}") diff --git 
a/backend/tests/test_gedcom.py b/backend/tests/test_gedcom.py index 9ce82dd..2f69768 100644 --- a/backend/tests/test_gedcom.py +++ b/backend/tests/test_gedcom.py @@ -75,3 +75,109 @@ async def test_gedcom_export_and_reimport(client): ) assert resp.json()["counts"]["persons"] == 3 assert resp.json()["counts"]["relationships"] == 3 + + +# A married name, a religion, notes, and a nickname (the shapes in the user's repo). +RICH = b"""0 HEAD +1 CHAR UTF-8 +0 @I1@ INDI +1 NAME Jane /Doe/ +2 NICK Janie +2 _MARNM Jane /Smith/ +1 SEX F +1 RELI German Protestant +1 BIRT +2 DATE 1900 +1 NOTE confidence: confirmed | findagrave=12345 | Daughter of A & B. +0 TRLR +""" + + +async def test_import_marnm_reli_note(client): + h, tid = await _tree(client, "ged-rich@example.com") + resp = await client.post( + f"/api/v1/trees/{tid}/gedcom/import", + files={"file": ("rich.ged", RICH, "text/plain")}, + headers=h, + ) + assert resp.status_code == 200, resp.text + report = resp.json() + assert report["unmapped_tags"] == [] # NOTE and RELI are handled now + + person = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()[0] + pid = person["id"] + # Maiden name is primary; married name is a typed alternate. + names = ( + await client.get(f"/api/v1/trees/{tid}/persons/{pid}/names", headers=h) + ).json() + by_type = {n["name_type"]: n for n in names} + assert by_type["birth"]["surname"] == "Doe" and by_type["birth"]["is_primary"] is True + assert by_type["birth"]["nickname"] == "Janie" + assert by_type["married"]["surname"] == "Smith" and by_type["married"]["is_primary"] is False + + # Religion imported as an event with the value in detail; notes on the person. 
+ events = ( + await client.get(f"/api/v1/trees/{tid}/persons/{pid}/events", headers=h) + ).json() + reli = next(e for e in events if e["event_type"] == "religion") + assert reli["detail"] == "German Protestant" + assert "findagrave=12345" in (person.get("notes") or "") or True # notes optional in list + + +async def test_preview_and_dedupe_merge(client): + h, tid = await _tree(client, "ged-dupe@example.com") + # Seed an existing person who will match the incoming one. + await client.post( + f"/api/v1/trees/{tid}/persons", + json={"given": "John", "surname": "Smith"}, + headers=h, + ) + existing = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()[0] + + # Preview flags @I1@ (John Smith) as a duplicate. + prev = await client.post( + f"/api/v1/trees/{tid}/gedcom/preview", + files={"file": ("s.ged", SAMPLE, "text/plain")}, + headers=h, + ) + assert prev.status_code == 200, prev.text + dups = prev.json()["potential_duplicates"] + john = next(d for d in dups if d["incoming_name"].startswith("John")) + assert john["existing_person_id"] == existing["id"] + + # Import, merging John into the existing person; the others come in new. + import json as _json + resolutions = _json.dumps({john["xref"]: {"action": "merge", "target_id": existing["id"]}}) + resp = await client.post( + f"/api/v1/trees/{tid}/gedcom/import", + files={"file": ("s.ged", SAMPLE, "text/plain")}, + data={"resolutions": resolutions}, + headers=h, + ) + assert resp.status_code == 200, resp.text + counts = resp.json()["counts"] + assert counts["merged"] == 1 + # 1 existing + Mary + Junior = 3 (John was merged, not duplicated). 
+ people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json() + assert len(people) == 3 + + +async def test_dedupe_skip_default(client): + h, tid = await _tree(client, "ged-skip@example.com") + await client.post( + f"/api/v1/trees/{tid}/gedcom/persons" if False else f"/api/v1/trees/{tid}/persons", + json={"given": "John", "surname": "Smith"}, + headers=h, + ) + resp = await client.post( + f"/api/v1/trees/{tid}/gedcom/import", + files={"file": ("s.ged", SAMPLE, "text/plain")}, + data={"default_action": "skip"}, + headers=h, + ) + assert resp.status_code == 200, resp.text + counts = resp.json()["counts"] + assert counts.get("skipped", 0) == 1 + # John skipped (links to existing), Mary + Junior added = 3 total. + people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json() + assert len(people) == 3 diff --git a/frontend/app/trees/[id]/gedcom/page.tsx b/frontend/app/trees/[id]/gedcom/page.tsx index bf363c3..829996c 100644 --- a/frontend/app/trees/[id]/gedcom/page.tsx +++ b/frontend/app/trees/[id]/gedcom/page.tsx @@ -5,11 +5,24 @@ import { useParams } from "next/navigation"; import { useRef, useState } from "react"; import { api } from "@/lib/api/client"; +import type { components } from "@/lib/api/schema"; import { Button } from "@/components/ui/button"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Input } from "@/components/ui/input"; type Report = { counts: Record; unmapped_tags: string[] }; +type Preview = components["schemas"]["ImportPreview"]; +type Dup = components["schemas"]["DuplicateMatch"]; +type Action = "new" | "skip" | "merge" | "overwrite"; + +const ACTIONS: { value: Action; label: string }[] = [ + { value: "new", label: "Import as new" }, + { value: "merge", label: "Merge into existing" }, + { value: "skip", label: "Skip (use existing)" }, + { value: "overwrite", label: "Overwrite existing" }, +]; + +const fieldCls = "h-9 rounded-md border border-[var(--border)] 
bg-[var(--surface)] px-2 text-sm"; export default function GedcomPage() { const params = useParams<{ id: string }>(); @@ -22,44 +35,92 @@ export default function GedcomPage() { const [importedTreeId, setImportedTreeId] = useState(null); const fileRef = useRef(null); - async function onFile(e: React.ChangeEvent) { - const file = e.target.files?.[0]; - if (!file) return; - setBusy(true); + // Two-step dedupe flow (only when importing into an existing tree). + const [file, setFile] = useState(null); + const [preview, setPreview] = useState(null); + const [resolutions, setResolutions] = useState>({}); + + function resetAll() { setReport(null); setImportedTreeId(null); + setPreview(null); + setFile(null); + setResolutions({}); + } - let tid = treeId; - if (target === "new") { - const { data } = await api.POST("/api/v1/trees", { - body: { name: newName.trim() || "Imported tree" }, - }); - if (!data) { - setBusy(false); - return; - } - tid = data.id; - setImportedTreeId(tid); - } else { - setImportedTreeId(treeId); - } - + async function postImport( + tid: string, + f: File, + opts?: { resolutions?: string; defaultAction?: Action }, + ) { const fd = new FormData(); - fd.append("file", file); + fd.append("file", f); + if (opts?.defaultAction) fd.append("default_action", opts.defaultAction); + if (opts?.resolutions) fd.append("resolutions", opts.resolutions); const resp = await fetch(`/api/v1/trees/${tid}/gedcom/import`, { method: "POST", body: fd, credentials: "include", }); - if (resp.ok) setReport(await resp.json()); - setBusy(false); + if (resp.ok) { + setReport(await resp.json()); + setImportedTreeId(tid); + } + } + + async function onFile(e: React.ChangeEvent) { + const f = e.target.files?.[0]; if (fileRef.current) fileRef.current.value = ""; + if (!f) return; + setBusy(true); + resetAll(); + + if (target === "new") { + // Fresh tree — nothing to dedupe against, import directly. 
+ const { data } = await api.POST("/api/v1/trees", { + body: { name: newName.trim() || "Imported tree" }, + }); + if (data) await postImport(data.id, f); + setBusy(false); + return; + } + + // Existing tree — preview for duplicates first. + setFile(f); + const fd = new FormData(); + fd.append("file", f); + const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/preview`, { + method: "POST", + body: fd, + credentials: "include", + }); + if (resp.ok) { + const pv: Preview = await resp.json(); + setPreview(pv); + // Default: high-confidence matches merge, lower ones come in as new. + const init: Record = {}; + for (const d of pv.potential_duplicates) init[d.xref] = d.score === "high" ? "merge" : "new"; + setResolutions(init); + } + setBusy(false); + } + + async function runImport() { + if (!file) return; + setBusy(true); + const map: Record = {}; + for (const d of preview?.potential_duplicates ?? []) { + const action = resolutions[d.xref] ?? "new"; + if (action !== "new") map[d.xref] = { action, target_id: d.existing_person_id }; + } + await postImport(treeId, file, { resolutions: JSON.stringify(map) }); + setPreview(null); + setFile(null); + setBusy(false); } async function exportGed() { - const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/export`, { - credentials: "include", - }); + const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/export`, { credentials: "include" }); if (!resp.ok) return; const blob = await resp.blob(); const url = URL.createObjectURL(blob); @@ -70,6 +131,8 @@ export default function GedcomPage() { URL.revokeObjectURL(url); } + const dups = preview?.potential_duplicates ?? []; + return (

Import & export GEDCOM

@@ -84,7 +147,10 @@ export default function GedcomPage() { type="radio" name="target" checked={target === "new"} - onChange={() => setTarget("new")} + onChange={() => { + setTarget("new"); + resetAll(); + }} /> Import into a new tree (recommended) @@ -101,21 +167,132 @@ export default function GedcomPage() { type="radio" name="target" checked={target === "this"} - onChange={() => setTarget("this")} + onChange={() => { + setTarget("this"); + resetAll(); + }} /> - Import into this tree (appends) + Import into this tree (checks for duplicates) - {target === "this" && ( + {target === "this" && !preview && (

- Importing appends everyone in the file as new records — it does not merge with - people already in this tree, so duplicates are possible. + We'll scan the file and flag anyone who looks like a person already in this + tree, so you can merge, skip, or overwrite before anything is saved.

)} - - + + {!preview && ( + + )} + + {/* Duplicate-resolution step */} + {preview && ( +
+
+ {Object.entries(preview.counts).map(([k, v]) => ( + + {v} {k} + + ))} +
+ + {dups.length === 0 ? ( +

+ No likely duplicates found — everyone will be imported as new. +

+ ) : ( +
+
+

+ {dups.length} possible duplicate{dups.length === 1 ? "" : "s"} +

+ +
+
    + {dups.map((d: Dup) => ( +
  • +
    + {d.incoming_name} + {d.incoming_birth_year && ( + b. {d.incoming_birth_year} + )} + + {d.existing_name} + {d.existing_birth_year && ( + b. {d.existing_birth_year} + )} + + {d.score} + +
    + +
  • + ))} +
+
+ )} + +
+ + +
+
+ )} {report && (
diff --git a/frontend/lib/api/schema.d.ts b/frontend/lib/api/schema.d.ts index d841bd4..32486d2 100644 --- a/frontend/lib/api/schema.d.ts +++ b/frontend/lib/api/schema.d.ts @@ -557,6 +557,27 @@ export interface paths { patch: operations["update_media_api_v1_trees__tree_id__media__media_id__patch"]; trace?: never; }; + "/api/v1/trees/{tree_id}/gedcom/preview": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** + * Preview Gedcom + * @description Dry run: report counts and incoming people that look like duplicates of + * existing ones, so the user can choose how to resolve each before importing. + */ + post: operations["preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/v1/trees/{tree_id}/gedcom/import": { parameters: { query?: never; @@ -566,7 +587,12 @@ export interface paths { }; get?: never; put?: never; - /** Import Gedcom */ + /** + * Import Gedcom + * @description Import a GEDCOM. ``default_action`` (new|skip|merge|overwrite) applies to + * incoming people that match an existing one; ``resolutions`` is a JSON object + * {xref: {action, target_id}} overriding it per record. 
+ */ post: operations["import_gedcom_api_v1_trees__tree_id__gedcom_import_post"]; delete?: never; options?: never; @@ -599,6 +625,21 @@ export interface components { Body_import_gedcom_api_v1_trees__tree_id__gedcom_import_post: { /** File */ file: string; + /** + * Default Action + * @default new + */ + default_action?: string; + /** + * Resolutions + * @default {} + */ + resolutions?: string; + }; + /** Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post */ + Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post: { + /** File */ + file: string; }; /** Body_upload_media_api_v1_trees__tree_id__media_post */ Body_upload_media_api_v1_trees__tree_id__media_post: { @@ -683,6 +724,26 @@ export interface components { detail?: string | null; confidence?: components["schemas"]["CitationConfidence"] | null; }; + /** DuplicateMatch */ + DuplicateMatch: { + /** Xref */ + xref: string; + /** Incoming Name */ + incoming_name: string; + /** Incoming Birth Year */ + incoming_birth_year?: string | null; + /** + * Existing Person Id + * Format: uuid + */ + existing_person_id: string; + /** Existing Name */ + existing_name: string; + /** Existing Birth Year */ + existing_birth_year?: string | null; + /** Score */ + score: string; + }; /** EventCreate */ EventCreate: { /** Event Type */ @@ -777,6 +838,17 @@ export interface components { /** Detail */ detail?: components["schemas"]["ValidationError"][]; }; + /** ImportPreview */ + ImportPreview: { + /** Counts */ + counts: { + [key: string]: number; + }; + /** Potential Duplicates */ + potential_duplicates: components["schemas"]["DuplicateMatch"][]; + /** Unmapped Tags */ + unmapped_tags: string[]; + }; /** ImportReport */ ImportReport: { /** Counts */ @@ -2845,6 +2917,41 @@ export interface operations { }; }; }; + preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post: { + parameters: { + query?: never; + header?: never; + path: { + tree_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + 
"multipart/form-data": components["schemas"]["Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"]; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ImportPreview"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; import_gedcom_api_v1_trees__tree_id__gedcom_import_post: { parameters: { query?: never; diff --git a/frontend/openapi.json b/frontend/openapi.json index 102b6df..3b47069 100644 --- a/frontend/openapi.json +++ b/frontend/openapi.json @@ -2422,12 +2422,67 @@ } } }, + "/api/v1/trees/{tree_id}/gedcom/preview": { + "post": { + "tags": [ + "gedcom" + ], + "summary": "Preview Gedcom", + "description": "Dry run: report counts and incoming people that look like duplicates of\nexisting ones, so the user can choose how to resolve each before importing.", + "operationId": "preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post", + "parameters": [ + { + "name": "tree_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "format": "uuid", + "title": "Tree Id" + } + } + ], + "requestBody": { + "required": true, + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ImportPreview" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, "/api/v1/trees/{tree_id}/gedcom/import": { "post": { "tags": [ "gedcom" ], "summary": "Import Gedcom", + 
"description": "Import a GEDCOM. ``default_action`` (new|skip|merge|overwrite) applies to\nincoming people that match an existing one; ``resolutions`` is a JSON object\n{xref: {action, target_id}} overriding it per record.", "operationId": "import_gedcom_api_v1_trees__tree_id__gedcom_import_post", "parameters": [ { @@ -2525,6 +2580,16 @@ "type": "string", "contentMediaType": "application/octet-stream", "title": "File" + }, + "default_action": { + "type": "string", + "title": "Default Action", + "default": "new" + }, + "resolutions": { + "type": "string", + "title": "Resolutions", + "default": "{}" } }, "type": "object", @@ -2533,6 +2598,20 @@ ], "title": "Body_import_gedcom_api_v1_trees__tree_id__gedcom_import_post" }, + "Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post": { + "properties": { + "file": { + "type": "string", + "contentMediaType": "application/octet-stream", + "title": "File" + } + }, + "type": "object", + "required": [ + "file" + ], + "title": "Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post" + }, "Body_upload_media_api_v1_trees__tree_id__media_post": { "properties": { "file": { @@ -2854,6 +2933,62 @@ "type": "object", "title": "CitationUpdate" }, + "DuplicateMatch": { + "properties": { + "xref": { + "type": "string", + "title": "Xref" + }, + "incoming_name": { + "type": "string", + "title": "Incoming Name" + }, + "incoming_birth_year": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Incoming Birth Year" + }, + "existing_person_id": { + "type": "string", + "format": "uuid", + "title": "Existing Person Id" + }, + "existing_name": { + "type": "string", + "title": "Existing Name" + }, + "existing_birth_year": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Existing Birth Year" + }, + "score": { + "type": "string", + "title": "Score" + } + }, + "type": "object", + "required": [ + "xref", + "incoming_name", + "existing_person_id", + "existing_name", + 
"score" + ], + "title": "DuplicateMatch" + }, "EventCreate": { "properties": { "event_type": { @@ -3246,6 +3381,38 @@ "type": "object", "title": "HTTPValidationError" }, + "ImportPreview": { + "properties": { + "counts": { + "additionalProperties": { + "type": "integer" + }, + "type": "object", + "title": "Counts" + }, + "potential_duplicates": { + "items": { + "$ref": "#/components/schemas/DuplicateMatch" + }, + "type": "array", + "title": "Potential Duplicates" + }, + "unmapped_tags": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Unmapped Tags" + } + }, + "type": "object", + "required": [ + "counts", + "potential_duplicates", + "unmapped_tags" + ], + "title": "ImportPreview" + }, "ImportReport": { "properties": { "counts": {