GEDCOM: duplicate-aware import + maiden/married + RELI/NOTE mapping #21

Merged
justin merged 1 commits from gedcom-import-dedupe into main 2026-06-07 10:41:09 -04:00
7 changed files with 1047 additions and 90 deletions
+35 -4
View File
@@ -1,25 +1,56 @@
import json
import uuid
from fastapi import APIRouter, File, Response, UploadFile
from fastapi import APIRouter, File, Form, Response, UploadFile
from app.api.deps import CurrentUser, SessionDep
from app.schemas.gedcom import ImportReport
from app.schemas.gedcom import ImportPreview, ImportReport
from app.services import gedcom, tree_service
router = APIRouter(prefix="/trees", tags=["gedcom"])
@router.post("/{tree_id}/gedcom/preview", response_model=ImportPreview)
async def preview_gedcom(
    tree_id: uuid.UUID,
    session: SessionDep,
    current: CurrentUser,
    file: UploadFile = File(...),
) -> ImportPreview:
    """Dry run of a GEDCOM upload.

    Reports counts and the incoming people that look like duplicates of
    existing ones, so the user can choose how to resolve each before importing.
    """
    tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
    raw = await file.read()
    # Bytes that are not valid UTF-8 are replaced rather than failing the request.
    gedcom_text = raw.decode("utf-8", errors="replace")
    preview = await gedcom.preview_gedcom(
        session,
        actor=current,
        tree=tree,
        text=gedcom_text,
    )
    return ImportPreview(**preview)
@router.post("/{tree_id}/gedcom/import", response_model=ImportReport)
async def import_gedcom(
    tree_id: uuid.UUID,
    session: SessionDep,
    current: CurrentUser,
    file: UploadFile = File(...),
    default_action: str = Form("new"),
    resolutions: str = Form("{}"),
) -> ImportReport:
    """Import a GEDCOM file into the tree.

    ``default_action`` (new|skip|merge|overwrite) applies to incoming people
    that match an existing one; ``resolutions`` is a JSON object
    ``{xref: {action, target_id}}`` overriding it per record.

    A ``resolutions`` value that is not valid JSON, or that is valid JSON but
    not an object, is ignored, so ``default_action`` applies to every record.
    """
    tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
    text = (await file.read()).decode("utf-8", errors="replace")
    try:
        parsed = json.loads(resolutions or "{}")
    except json.JSONDecodeError:
        parsed = {}
    if not isinstance(parsed, dict):
        # "[]", "3", "null", ... are valid JSON but not a mapping; the service
        # looks overrides up by xref, so anything else would fail there.
        parsed = {}
    # Single import call: running the importer twice would add every record twice.
    report = await gedcom.import_gedcom(
        session,
        actor=current,
        tree=tree,
        text=text,
        default_action=default_action,
        resolutions=parsed,
    )
    return ImportReport(**report)
+19
View File
@@ -1,6 +1,25 @@
import uuid
from pydantic import BaseModel
class ImportReport(BaseModel):
    # Result of a GEDCOM import, returned by the import endpoint.
    # Tallies keyed by kind (e.g. "persons", "names", "events", "merged").
    counts: dict[str, int]
    # Sorted GEDCOM tags that were seen but neither mapped nor deliberately skipped.
    unmapped_tags: list[str]
class DuplicateMatch(BaseModel):
    # An incoming GEDCOM person that resembles an existing one in the tree.
    # GEDCOM xref of the incoming INDI record; also the key used for per-record resolutions.
    xref: str
    # Display name of the incoming person ("(no name)" when the record has none).
    incoming_name: str
    # Year taken from the incoming BIRT date, when one is present.
    incoming_birth_year: str | None = None
    # Id, display name and birth year of the closest existing person.
    existing_person_id: uuid.UUID
    existing_name: str
    existing_birth_year: str | None = None
    score: str  # "high" | "medium"
class ImportPreview(BaseModel):
    # Dry-run result returned by the preview endpoint.
    # Tallies of what the file contains, keyed by kind (e.g. "persons", "families").
    counts: dict[str, int]
    # One entry per incoming person that matched an existing person.
    potential_duplicates: list[DuplicateMatch]
    # Sorted GEDCOM tags that were seen but neither mapped nor deliberately skipped.
    unmapped_tags: list[str]
+400 -50
View File
@@ -4,14 +4,20 @@ A pragmatic parser + mapper for the common subset of GEDCOM (5.5.1 / 7 share
the line grammar): INDI, FAM, SOUR. Import maps records into a tree and returns
a mapping report (counts + unmapped tags); export serializes the tree back to
GEDCOM. Runs inline for now — large files should move to the worker later.
Import is duplicate-aware: ``preview_gedcom`` reports incoming people that look
like existing ones, and ``import_gedcom`` applies a per-record resolution
(new / skip / merge / overwrite). Names carry their GEDCOM type (a married name
imports as a typed alternate, not a second primary).
"""
import re
import uuid
from collections import defaultdict
from datetime import date
from datetime import UTC, date, datetime
from difflib import SequenceMatcher
from sqlalchemy import select
from sqlalchemy import or_, select, update
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.enums import ParentChildQualifier, RelationshipType
@@ -32,12 +38,31 @@ INDI_EVENTS = {
"BURI": "burial", "CREM": "cremation", "RESI": "residence", "CENS": "census",
"IMMI": "immigration", "EMIG": "emigration", "OCCU": "occupation",
"EDUC": "education", "GRAD": "graduation", "RETI": "retirement",
"NATU": "naturalization", "BAPL": "baptism",
"NATU": "naturalization", "BAPL": "baptism", "RELI": "religion",
}
# INDI attribute tags whose line VALUE is the fact itself; that value is stored in detail.
VALUE_EVENTS = {"RELI", "OCCU", "EDUC"}
# INDI sub-tags consumed elsewhere or intentionally ignored (not "unmapped").
INDI_SKIP_TAGS = {
"NAME", "SEX", "SOUR", "FAMC", "FAMS", "CHAN", "OBJE", "_UID", "_MARNM", "NOTE",
}
# FAM-level events.
FAM_EVENTS = {"MARR": "marriage", "DIV": "divorce", "ENGA": "engagement"}
EVENT_TO_GED = {v: k for k, v in {**INDI_EVENTS, **FAM_EVENTS}.items()}
# GEDCOM NAME TYPE (or _MARNM-derived) -> our Name.name_type vocabulary.
NAME_TYPE_MAP = {
"birth": "birth", "maiden": "birth", "married": "married",
"aka": "alias", "also known as": "alias", "nickname": "nickname",
"religious": "religious", "immigrant": "immigration",
"immigration": "immigration", "professional": "alias", "other": "alias",
}
# Our type -> GEDCOM TYPE on export (birth is the default; emit nothing).
EXPORT_TYPE_MAP = {
"married": "married", "alias": "aka", "nickname": "nickname",
"religious": "religious", "immigration": "immigrant",
}
class GedcomNode:
__slots__ = ("level", "tag", "value", "xref", "children")
@@ -108,6 +133,50 @@ def _parse_name(value: str) -> tuple[str | None, str | None]:
return value.strip() or None, None
def _parse_marnm(value: str, base_given: str | None) -> tuple[str | None, str | None]:
"""A _MARNM value is sometimes a full name ("Jane /Smith/") and sometimes
just the married surname ("Smith"). Keep the given name from the base name
in the latter case."""
v = (value or "").strip()
if "/" in v:
g, s = _parse_name(v)
return (g or base_given), s
return base_given, (v or None)
def _extract_names(rec: GedcomNode) -> list[dict]:
    """Collect every name of an INDI as typed dicts.

    Each NAME record (with its optional TYPE) and each ``_MARNM`` married-name
    subtag, whether nested under a NAME or directly under the INDI, becomes one
    entry. The first entry typed "birth" is flagged primary; when there is
    none, the first entry is. ``sort`` is the entry's position in the result.
    """

    def married_entry(raw: str, base_given: str | None) -> dict:
        m_given, m_surname = _parse_marnm(raw, base_given)
        return {"type": "married", "given": m_given, "surname": m_surname,
                "display": raw or None, "nickname": None}

    collected: list[dict] = []
    for name_node in rec.all("NAME"):
        given, surname = _parse_name(name_node.value)
        raw_type = (name_node.text("TYPE") or "").strip().lower()
        # Unknown TYPE values pass through unchanged; a missing TYPE means "birth".
        collected.append({
            "type": NAME_TYPE_MAP.get(raw_type, raw_type or "birth"),
            "given": given,
            "surname": surname,
            "display": name_node.value or None,
            "nickname": name_node.text("NICK"),
        })
        for nested in name_node.all("_MARNM"):
            collected.append(married_entry(nested.value, given))

    for top_level in rec.all("_MARNM"):
        # Record-level married names borrow the given name of the first entry so far.
        base = collected[0]["given"] if collected else None
        collected.append(married_entry(top_level.value, base))

    if not collected:
        return collected

    primary_at = 0
    for position, entry in enumerate(collected):
        if entry["type"] == "birth":
            primary_at = position
            break
    for position, entry in enumerate(collected):
        entry["is_primary"] = position == primary_at
        entry["sort"] = position
    return collected
def _norm(given: str | None, surname: str | None) -> str:
return re.sub(r"\s+", " ", f"{given or ''} {surname or ''}".strip().lower())
def _year(date_value: str | None) -> str | None:
if not date_value:
return None
@@ -132,18 +201,215 @@ def _sex(value: str | None) -> str | None:
return {"M": "male", "F": "female"}.get(v, value.strip().lower() or None)
def _notes_text(rec: GedcomNode) -> str | None:
"""Join an INDI's NOTE lines (which pack confidence / findagrave / fs_pid /
free text) into the person's notes field."""
vals = [n.value.strip() for n in rec.all("NOTE") if n.value and n.value.strip()]
return "\n".join(vals) or None
def _person_summary(rec: GedcomNode) -> dict:
    """Summarise an incoming INDI for duplicate matching.

    Returns the extracted names, the normalised matching key, a display name
    ("(no name)" when nothing usable exists) and the year of the first BIRT.
    """
    names = _extract_names(rec)
    primary = next((n for n in names if n.get("is_primary")), None)
    if primary is None and names:
        primary = names[0]

    given = surname = None
    if primary:
        given, surname = primary["given"], primary["surname"]

    shown = " ".join(part for part in (given, surname) if part)
    if not shown and primary:
        # No parsed parts: fall back to the raw NAME value.
        shown = primary.get("display") or ""

    birth = rec.first("BIRT")
    birth_year = _year(birth.text("DATE")) if birth else None
    return {
        "names": names,
        "norm": _norm(given, surname),
        "name": shown or "(no name)",
        "year": birth_year,
    }
async def _build_existing_index(session: AsyncSession, tree: Tree) -> list[dict]:
    """Index the tree's existing (non-deleted) people for duplicate matching.

    Each entry holds the person's id, normalised name key, display name and
    birth year (``None`` when unknown).
    """

    async def fetch(stmt) -> list:
        result = await session.execute(stmt)
        return list(result.scalars().all())

    persons = await fetch(
        select(Person).where(Person.tree_id == tree.id, Person.deleted_at.is_(None))
    )

    names = await fetch(
        select(Name).where(Name.tree_id == tree.id, Name.deleted_at.is_(None))
    )
    # One name per person: primary names first, then lowest sort_order.
    name_by_person: dict[uuid.UUID, Name] = {}
    for candidate in sorted(names, key=lambda n: (not n.is_primary, n.sort_order)):
        if candidate.person_id not in name_by_person:
            name_by_person[candidate.person_id] = candidate

    births = await fetch(
        select(Event).where(
            Event.tree_id == tree.id,
            Event.deleted_at.is_(None),
            Event.event_type == "birth",
        )
    )
    # The first birth event that yields a year wins for each person.
    year_by_person: dict[uuid.UUID, str] = {}
    for event in births:
        if not event.person_id or event.person_id in year_by_person:
            continue
        if event.date_start:
            found = str(event.date_start.year)
        else:
            found = _year(event.date_value)
        if found:
            year_by_person[event.person_id] = found

    index: list[dict] = []
    for person in persons:
        name = name_by_person.get(person.id)
        given = name.given if name else None
        surname = name.surname if name else None
        shown = " ".join(part for part in (given, surname) if part)
        if not shown:
            shown = name.display_name if name else None
        index.append(
            {
                "id": person.id,
                "norm": _norm(given, surname),
                "name": shown or "(no name)",
                "year": year_by_person.get(person.id),
            }
        )
    return index
def _best_match(norm: str, year: str | None, index: list[dict]) -> tuple[dict | None, str | None]:
"""Closest existing person by name similarity, rejecting clear birth-year
conflicts. Returns (entry, "high"|"medium") or (None, None)."""
if not norm:
return None, None
best: dict | None = None
best_r = 0.0
for e in index:
if not e["norm"]:
continue
r = SequenceMatcher(None, norm, e["norm"]).ratio()
if r < 0.88:
continue
if year and e["year"] and abs(int(year) - int(e["year"])) > 1:
continue # same-ish name but different birth year — not a duplicate
if r > best_r:
best_r = r
best = e
if best is None:
return None, None
year_match = bool(year and best["year"] and abs(int(year) - int(best["year"])) <= 1)
both_unknown = not year and not best["year"]
score = "high" if best_r >= 0.93 and (year_match or both_unknown) else "medium"
return best, score
def _relkey(rtype: RelationshipType, a: uuid.UUID, b: uuid.UUID) -> tuple:
    """Build the de-duplication key for a relationship edge.

    Parent-child edges are directional, so the order of the two ids is kept;
    every other type sorts the two ids so (a, b) and (b, a) share a key.
    """
    first, second = str(a), str(b)
    if rtype != RelationshipType.parent_child:
        return (rtype.value, *sorted((first, second)))
    return ("pc", first, second)
def _count_incoming(roots: list[GedcomNode]) -> tuple[dict, list[str]]:
counts: dict[str, int] = defaultdict(int)
unmapped: set[str] = set()
for rec in roots:
if rec.tag == "INDI" and rec.xref:
counts["persons"] += 1
counts["names"] += len(_extract_names(rec))
for child in rec.children:
if child.tag in INDI_EVENTS:
counts["events"] += 1
elif child.tag not in INDI_SKIP_TAGS:
unmapped.add(child.tag)
elif rec.tag == "FAM":
counts["families"] += 1
for child in rec.children:
if child.tag in FAM_EVENTS:
counts["events"] += 1
elif rec.tag == "SOUR" and rec.xref:
counts["sources"] += 1
return dict(counts), sorted(unmapped)
async def preview_gedcom(session: AsyncSession, *, actor: User, tree: Tree, text: str) -> dict:
    """Dry run: what would import, and which incoming people look like existing
    ones. No writes.

    Raises:
        Forbidden: if ``actor`` may not edit ``tree``.
    """
    may_edit = await privacy.can_edit_tree(session, user_id=actor.id, tree=tree)
    if not may_edit:
        raise Forbidden("not an editor of this tree")

    records = parse_records(text)
    counts, unmapped = _count_incoming(records)
    existing_index = await _build_existing_index(session, tree)

    flagged: list[dict] = []
    for record in records:
        if not (record.tag == "INDI" and record.xref):
            continue
        summary = _person_summary(record)
        match_entry, score = _best_match(summary["norm"], summary["year"], existing_index)
        if match_entry is not None:
            flagged.append(
                {
                    "xref": record.xref,
                    "incoming_name": summary["name"],
                    "incoming_birth_year": summary["year"],
                    "existing_person_id": match_entry["id"],
                    "existing_name": match_entry["name"],
                    "existing_birth_year": match_entry["year"],
                    "score": score,
                }
            )
    return {"counts": counts, "potential_duplicates": flagged, "unmapped_tags": unmapped}
async def import_gedcom(
session: AsyncSession, *, actor: User, tree: Tree, text: str
session: AsyncSession,
*,
actor: User,
tree: Tree,
text: str,
default_action: str = "new",
resolutions: dict | None = None,
) -> dict:
"""Import records. ``default_action`` (new|skip|merge|overwrite) applies to
incoming people that match an existing one; ``resolutions`` overrides it per
GEDCOM xref ({xref: {action, target_id}}). 'skip' links families to the
existing person but copies nothing; 'merge' also copies the incoming names
(as alternates), events and citations onto them; 'overwrite' deletes the
existing person and imports the incoming one fresh."""
if not await privacy.can_edit_tree(session, user_id=actor.id, tree=tree):
raise Forbidden("not an editor of this tree")
resolutions = resolutions or {}
roots = parse_records(text)
counts = defaultdict(int)
counts: dict[str, int] = defaultdict(int)
unmapped: set[str] = set()
place_cache: dict[str, uuid.UUID] = {}
source_map: dict[str, uuid.UUID] = {}
person_map: dict[str, uuid.UUID] = {}
now = datetime.now(UTC)
index = await _build_existing_index(session, tree)
# Pre-load existing relationship keys so a merge doesn't create dup edges.
existing_rels = list(
(
await session.execute(
select(Relationship).where(
Relationship.tree_id == tree.id, Relationship.deleted_at.is_(None)
)
)
).scalars().all()
)
rel_keys = {_relkey(r.type, r.person_from_id, r.person_to_id) for r in existing_rels}
def add_relationship(
    rtype: RelationshipType, a: uuid.UUID, b: uuid.UUID, **kw
) -> Relationship | None:
    """Add an edge unless an equivalent one is already known.

    Returns the new ``Relationship`` (added to the session, not yet flushed),
    or ``None`` when its key is already in ``rel_keys``.
    """
    edge_key = _relkey(rtype, a, b)
    if edge_key not in rel_keys:
        edge = Relationship(tree_id=tree.id, type=rtype, person_from_id=a, person_to_id=b, **kw)
        session.add(edge)
        # Remember the key so a repeat within this same import is also skipped.
        rel_keys.add(edge_key)
        counts["relationships"] += 1
        return edge
    return None
async def place_id(name: str | None) -> uuid.UUID | None:
if not name:
@@ -177,59 +443,139 @@ async def import_gedcom(
sid = source_map.get(s.value.strip())
if sid is None:
continue
session.add(
Citation(tree_id=tree.id, source_id=sid, page=s.text("PAGE"), **target)
)
session.add(Citation(tree_id=tree.id, source_id=sid, page=s.text("PAGE"), **target))
counts["citations"] += 1
# Individuals.
for rec in roots:
if rec.tag != "INDI" or not rec.xref:
continue
person = Person(tree_id=tree.id, gender=_sex(rec.text("SEX")))
session.add(person)
await session.flush()
person_map[rec.xref] = person.id
counts["persons"] += 1
for i, nm in enumerate(rec.all("NAME")):
given, surname = _parse_name(nm.value)
def add_names(person_id: uuid.UUID, names: list[dict], *, set_primary: bool) -> None:
for nd in names:
session.add(
Name(
tree_id=tree.id,
person_id=person.id,
name_type="birth",
given=given,
surname=surname,
display_name=nm.value or None,
is_primary=(i == 0),
sort_order=i,
person_id=person_id,
name_type=nd["type"],
given=nd["given"],
surname=nd["surname"],
nickname=nd.get("nickname"),
display_name=nd.get("display"),
is_primary=set_primary and nd.get("is_primary", False),
sort_order=nd.get("sort", 0),
)
)
counts["names"] += 1
await add_citations(rec, person_id=person.id)
async def add_events(rec: GedcomNode, person_id: uuid.UUID) -> None:
for child in rec.children:
if child.tag in INDI_EVENTS:
dv = child.text("DATE")
# Attribute-style facts (RELI, OCCU, EDUC) carry their value on
# the line itself; store it in detail.
detail = child.value.strip() if child.tag in VALUE_EVENTS else None
ev = Event(
tree_id=tree.id,
person_id=person.id,
person_id=person_id,
event_type=INDI_EVENTS[child.tag],
date_value=dv,
date_start=_date_start(dv),
place_id=await place_id(child.text("PLAC")),
detail=detail or None,
notes=child.text("NOTE"),
)
session.add(ev)
await session.flush()
counts["events"] += 1
await add_citations(child, event_id=ev.id)
elif child.tag in ("NAME", "SEX", "SOUR", "FAMC", "FAMS", "CHAN", "OBJE", "_UID"):
elif child.tag in INDI_SKIP_TAGS:
continue
else:
unmapped.add(child.tag)
async def soft_delete_existing(person_id: uuid.UUID) -> None:
    """Soft-delete an existing person of this tree (used by 'overwrite').

    Marks the person and every live relationship touching them as deleted and
    clears any user's ``self_person_id`` that points at them. Does nothing when
    the id is unknown, already deleted, or belongs to another tree.
    """
    p = (
        await session.execute(
            select(Person).where(
                Person.id == person_id,
                # person_id can come from client-supplied resolutions, so it
                # must never reach a person outside the tree being imported into.
                Person.tree_id == tree.id,
                Person.deleted_at.is_(None),
            )
        )
    ).scalar_one_or_none()
    if p is None:
        return
    p.deleted_at = now
    rels = (
        await session.execute(
            select(Relationship).where(
                Relationship.tree_id == tree.id,
                Relationship.deleted_at.is_(None),
                or_(
                    Relationship.person_from_id == person_id,
                    Relationship.person_to_id == person_id,
                ),
            )
        )
    ).scalars().all()
    for r in rels:
        r.deleted_at = now
    await session.execute(
        update(User).where(User.self_person_id == person_id).values(self_person_id=None)
    )
# Precompute the best match per incoming xref (for default-policy resolution).
matches: dict[str, dict] = {}
for rec in roots:
if rec.tag == "INDI" and rec.xref:
summ = _person_summary(rec)
entry, _score = _best_match(summ["norm"], summ["year"], index)
if entry is not None:
matches[rec.xref] = entry
def resolve(xref: str) -> tuple[str, uuid.UUID | None]:
    """Decide what to do with one incoming person.

    Returns ``(action, target)``, where ``action`` is one of new / skip /
    merge / overwrite and ``target`` is the existing person it applies to
    (``None`` for "new"). A per-record entry in ``resolutions`` wins over
    ``default_action``. Anything that cannot be honoured safely falls back to
    ("new", None): an unknown action, a linking action without a target, or a
    ``target_id`` that is malformed or not a live person of this tree.
    """
    linking_actions = ("skip", "merge", "overwrite")
    matched = matches[xref]["id"] if xref in matches else None

    ov = resolutions.get(xref)
    if ov and isinstance(ov, dict):
        action = ov.get("action", "new")
        if action not in linking_actions:
            return "new", None
        tid = ov.get("target_id")
        if not tid:
            target = matched
        else:
            try:
                explicit = uuid.UUID(str(tid))
            except ValueError:
                explicit = None
            # The id is client-supplied: only accept people in this tree's index.
            known = explicit is not None and any(e["id"] == explicit for e in index)
            target = explicit if known else None
        if target is None:
            return "new", None
        return action, target

    if default_action in linking_actions and matched is not None:
        return default_action, matched
    return "new", None
# Individuals.
for rec in roots:
if rec.tag != "INDI" or not rec.xref:
continue
names = _extract_names(rec)
action, target = resolve(rec.xref)
if action == "skip" and target is not None:
person_map[rec.xref] = target
counts["skipped"] += 1
continue
if action == "merge" and target is not None:
person_map[rec.xref] = target
add_names(target, names, set_primary=False)
await add_events(rec, target)
await add_citations(rec, person_id=target)
note = _notes_text(rec)
if note:
existing = (
await session.execute(select(Person).where(Person.id == target))
).scalar_one_or_none()
if existing is not None:
existing.notes = "\n".join(filter(None, [existing.notes, note]))
counts["merged"] += 1
continue
if action == "overwrite" and target is not None:
await soft_delete_existing(target)
counts["overwritten"] += 1
person = Person(tree_id=tree.id, gender=_sex(rec.text("SEX")), notes=_notes_text(rec))
session.add(person)
await session.flush()
person_map[rec.xref] = person.id
counts["persons"] += 1
add_names(person.id, names, set_primary=True)
await add_citations(rec, person_id=person.id)
await add_events(rec, person.id)
# Families -> partnerships, parent-child edges, marriage events.
for rec in roots:
if rec.tag != "FAM":
@@ -238,17 +584,22 @@ async def import_gedcom(
husb = person_map.get((rec.text("HUSB") or "").strip())
wife = person_map.get((rec.text("WIFE") or "").strip())
partnership_id: uuid.UUID | None = None
if husb and wife:
rel = Relationship(
tree_id=tree.id,
type=RelationshipType.partnership,
person_from_id=husb,
person_to_id=wife,
if husb and wife and husb != wife:
rel = add_relationship(RelationshipType.partnership, husb, wife)
if rel is not None:
await session.flush()
partnership_id = rel.id
if partnership_id is None and husb and wife:
# Edge already existed — find it so marriage events can attach.
existing = next(
(
r for r in existing_rels
if r.type == RelationshipType.partnership
and {r.person_from_id, r.person_to_id} == {husb, wife}
),
None,
)
session.add(rel)
await session.flush()
partnership_id = rel.id
counts["relationships"] += 1
partnership_id = existing.id if existing else None
for fe in rec.children:
if fe.tag in FAM_EVENTS and partnership_id is not None:
@@ -271,16 +622,12 @@ async def import_gedcom(
continue
for parent in (husb, wife):
if parent and parent != cp:
session.add(
Relationship(
tree_id=tree.id,
type=RelationshipType.parent_child,
person_from_id=parent,
person_to_id=cp,
qualifier=ParentChildQualifier.biological,
)
add_relationship(
RelationshipType.parent_child,
parent,
cp,
qualifier=ParentChildQualifier.biological,
)
counts["relationships"] += 1
record_audit(
session,
@@ -397,6 +744,9 @@ async def export_gedcom(session: AsyncSession, *, viewer_id: uuid.UUID, tree: Tr
for n in names_by_person.get(p.id, []):
display = n.display_name or f"{n.given or ''} /{n.surname or ''}/".strip()
out.append(f"1 NAME {display}")
ged_type = EXPORT_TYPE_MAP.get(n.name_type)
if ged_type:
out.append(f"2 TYPE {ged_type}")
sex = {"male": "M", "female": "F"}.get(p.gender or "")
if sex:
out.append(f"1 SEX {sex}")
+106
View File
@@ -75,3 +75,109 @@ async def test_gedcom_export_and_reimport(client):
)
assert resp.json()["counts"]["persons"] == 3
assert resp.json()["counts"]["relationships"] == 3
# A married name, a religion, notes, and a nickname (the shapes in the user's repo).
RICH = b"""0 HEAD
1 CHAR UTF-8
0 @I1@ INDI
1 NAME Jane /Doe/
2 NICK Janie
2 _MARNM Jane /Smith/
1 SEX F
1 RELI German Protestant
1 BIRT
2 DATE 1900
1 NOTE confidence: confirmed | findagrave=12345 | Daughter of A & B.
0 TRLR
"""
async def test_import_marnm_reli_note(client):
    """Import RICH and check the married name, nickname, religion and notes."""
    h, tid = await _tree(client, "ged-rich@example.com")
    resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("rich.ged", RICH, "text/plain")},
        headers=h,
    )
    assert resp.status_code == 200, resp.text
    report = resp.json()
    assert report["unmapped_tags"] == []  # NOTE and RELI are handled now

    person = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()[0]
    pid = person["id"]

    # Maiden name is primary; married name is a typed alternate.
    names = (
        await client.get(f"/api/v1/trees/{tid}/persons/{pid}/names", headers=h)
    ).json()
    by_type = {n["name_type"]: n for n in names}
    assert by_type["birth"]["surname"] == "Doe" and by_type["birth"]["is_primary"] is True
    assert by_type["birth"]["nickname"] == "Janie"
    assert by_type["married"]["surname"] == "Smith" and by_type["married"]["is_primary"] is False

    # Religion imported as an event with the value in detail; notes on the person.
    events = (
        await client.get(f"/api/v1/trees/{tid}/persons/{pid}/events", headers=h)
    ).json()
    reli = next(e for e in events if e["event_type"] == "religion")
    assert reli["detail"] == "German Protestant"
    # The list payload may omit notes; when it exposes them, the NOTE text must
    # be there. (The previous `... or True` form could never fail.)
    if "notes" in person:
        assert "findagrave=12345" in (person["notes"] or "")
async def test_preview_and_dedupe_merge(client):
    """Preview flags a seeded duplicate; importing with a merge resolution
    folds it into the existing person instead of adding a new one."""
    import json as _json

    h, tid = await _tree(client, "ged-dupe@example.com")
    persons_url = f"/api/v1/trees/{tid}/persons"

    # Seed an existing person who will match the incoming one.
    await client.post(
        persons_url,
        json={"given": "John", "surname": "Smith"},
        headers=h,
    )
    seeded = (await client.get(persons_url, headers=h)).json()[0]

    # Preview flags @I1@ (John Smith) as a duplicate.
    preview_resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/preview",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        headers=h,
    )
    assert preview_resp.status_code == 200, preview_resp.text
    candidates = preview_resp.json()["potential_duplicates"]
    john = next(d for d in candidates if d["incoming_name"].startswith("John"))
    assert john["existing_person_id"] == seeded["id"]

    # Import, merging John into the existing person; the others come in new.
    payload = {john["xref"]: {"action": "merge", "target_id": seeded["id"]}}
    import_resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        data={"resolutions": _json.dumps(payload)},
        headers=h,
    )
    assert import_resp.status_code == 200, import_resp.text
    assert import_resp.json()["counts"]["merged"] == 1

    # 1 existing + Mary + Junior = 3 (John was merged, not duplicated).
    everyone = (await client.get(persons_url, headers=h)).json()
    assert len(everyone) == 3
async def test_dedupe_skip_default(client):
    """With ``default_action=skip`` a matching incoming person is not re-created."""
    h, tid = await _tree(client, "ged-skip@example.com")
    # Seed the existing John Smith through the regular persons endpoint.
    await client.post(
        f"/api/v1/trees/{tid}/persons",
        json={"given": "John", "surname": "Smith"},
        headers=h,
    )
    resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        data={"default_action": "skip"},
        headers=h,
    )
    assert resp.status_code == 200, resp.text
    counts = resp.json()["counts"]
    assert counts.get("skipped", 0) == 1
    # John skipped (links to existing), Mary + Junior added = 3 total.
    people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()
    assert len(people) == 3
+212 -35
View File
@@ -5,11 +5,24 @@ import { useParams } from "next/navigation";
import { useRef, useState } from "react";
import { api } from "@/lib/api/client";
import type { components } from "@/lib/api/schema";
import { Button } from "@/components/ui/button";
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { Input } from "@/components/ui/input";
type Report = { counts: Record<string, number>; unmapped_tags: string[] };
type Preview = components["schemas"]["ImportPreview"];
type Dup = components["schemas"]["DuplicateMatch"];
type Action = "new" | "skip" | "merge" | "overwrite";
const ACTIONS: { value: Action; label: string }[] = [
{ value: "new", label: "Import as new" },
{ value: "merge", label: "Merge into existing" },
{ value: "skip", label: "Skip (use existing)" },
{ value: "overwrite", label: "Overwrite existing" },
];
const fieldCls = "h-9 rounded-md border border-[var(--border)] bg-[var(--surface)] px-2 text-sm";
export default function GedcomPage() {
const params = useParams<{ id: string }>();
@@ -22,44 +35,92 @@ export default function GedcomPage() {
const [importedTreeId, setImportedTreeId] = useState<string | null>(null);
const fileRef = useRef<HTMLInputElement>(null);
async function onFile(e: React.ChangeEvent<HTMLInputElement>) {
const file = e.target.files?.[0];
if (!file) return;
setBusy(true);
// Two-step dedupe flow (only when importing into an existing tree).
const [file, setFile] = useState<File | null>(null);
const [preview, setPreview] = useState<Preview | null>(null);
const [resolutions, setResolutions] = useState<Record<string, Action>>({});
function resetAll() {
setReport(null);
setImportedTreeId(null);
setPreview(null);
setFile(null);
setResolutions({});
}
let tid = treeId;
if (target === "new") {
const { data } = await api.POST("/api/v1/trees", {
body: { name: newName.trim() || "Imported tree" },
});
if (!data) {
setBusy(false);
return;
}
tid = data.id;
setImportedTreeId(tid);
} else {
setImportedTreeId(treeId);
}
async function postImport(
tid: string,
f: File,
opts?: { resolutions?: string; defaultAction?: Action },
) {
const fd = new FormData();
fd.append("file", file);
fd.append("file", f);
if (opts?.defaultAction) fd.append("default_action", opts.defaultAction);
if (opts?.resolutions) fd.append("resolutions", opts.resolutions);
const resp = await fetch(`/api/v1/trees/${tid}/gedcom/import`, {
method: "POST",
body: fd,
credentials: "include",
});
if (resp.ok) setReport(await resp.json());
setBusy(false);
if (resp.ok) {
setReport(await resp.json());
setImportedTreeId(tid);
}
}
async function onFile(e: React.ChangeEvent<HTMLInputElement>) {
const f = e.target.files?.[0];
if (fileRef.current) fileRef.current.value = "";
if (!f) return;
setBusy(true);
resetAll();
if (target === "new") {
// Fresh tree — nothing to dedupe against, import directly.
const { data } = await api.POST("/api/v1/trees", {
body: { name: newName.trim() || "Imported tree" },
});
if (data) await postImport(data.id, f);
setBusy(false);
return;
}
// Existing tree — preview for duplicates first.
setFile(f);
const fd = new FormData();
fd.append("file", f);
const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/preview`, {
method: "POST",
body: fd,
credentials: "include",
});
if (resp.ok) {
const pv: Preview = await resp.json();
setPreview(pv);
// Default: high-confidence matches merge, lower ones come in as new.
const init: Record<string, Action> = {};
for (const d of pv.potential_duplicates) init[d.xref] = d.score === "high" ? "merge" : "new";
setResolutions(init);
}
setBusy(false);
}
async function runImport() {
if (!file) return;
setBusy(true);
const map: Record<string, { action: Action; target_id: string }> = {};
for (const d of preview?.potential_duplicates ?? []) {
const action = resolutions[d.xref] ?? "new";
if (action !== "new") map[d.xref] = { action, target_id: d.existing_person_id };
}
await postImport(treeId, file, { resolutions: JSON.stringify(map) });
setPreview(null);
setFile(null);
setBusy(false);
}
async function exportGed() {
const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/export`, {
credentials: "include",
});
const resp = await fetch(`/api/v1/trees/${treeId}/gedcom/export`, { credentials: "include" });
if (!resp.ok) return;
const blob = await resp.blob();
const url = URL.createObjectURL(blob);
@@ -70,6 +131,8 @@ export default function GedcomPage() {
URL.revokeObjectURL(url);
}
const dups = preview?.potential_duplicates ?? [];
return (
<div className="space-y-6">
<h1 className="text-2xl font-semibold">Import &amp; export GEDCOM</h1>
@@ -84,7 +147,10 @@ export default function GedcomPage() {
type="radio"
name="target"
checked={target === "new"}
onChange={() => setTarget("new")}
onChange={() => {
setTarget("new");
resetAll();
}}
/>
Import into a <strong>new tree</strong> (recommended)
</label>
@@ -101,21 +167,132 @@ export default function GedcomPage() {
type="radio"
name="target"
checked={target === "this"}
onChange={() => setTarget("this")}
onChange={() => {
setTarget("this");
resetAll();
}}
/>
Import into <strong>this tree</strong> (appends)
Import into <strong>this tree</strong> (checks for duplicates)
</label>
{target === "this" && (
{target === "this" && !preview && (
<p className="rounded-md bg-bronze/[0.08] px-3 py-2 text-sm text-[var(--muted)]">
Importing appends everyone in the file as new records it does not merge with
people already in this tree, so duplicates are possible.
We&apos;ll scan the file and flag anyone who looks like a person already in this
tree, so you can merge, skip, or overwrite before anything is saved.
</p>
)}
<input ref={fileRef} type="file" accept=".ged,.gedcom,text/plain" onChange={onFile} className="hidden" />
<Button onClick={() => fileRef.current?.click()} disabled={busy}>
{busy ? "Importing…" : "Choose GEDCOM file"}
</Button>
<input
ref={fileRef}
type="file"
accept=".ged,.gedcom,text/plain"
onChange={onFile}
className="hidden"
/>
{!preview && (
<Button onClick={() => fileRef.current?.click()} disabled={busy}>
{busy ? "Working…" : "Choose GEDCOM file"}
</Button>
)}
{/* Duplicate-resolution step */}
{preview && (
<div className="space-y-4">
<div className="flex flex-wrap gap-x-6 gap-y-1 text-sm text-[var(--muted)]">
{Object.entries(preview.counts).map(([k, v]) => (
<span key={k}>
<span className="font-medium text-[var(--foreground)]">{v}</span> {k}
</span>
))}
</div>
{dups.length === 0 ? (
<p className="rounded-md bg-bronze/[0.08] px-3 py-2 text-sm">
No likely duplicates found everyone will be imported as new.
</p>
) : (
<div className="space-y-2">
<div className="flex items-center justify-between">
<h3 className="text-sm font-semibold">
{dups.length} possible duplicate{dups.length === 1 ? "" : "s"}
</h3>
<label className="flex items-center gap-2 text-xs text-[var(--muted)]">
Set all to
<select
className={fieldCls}
onChange={(e) => {
const a = e.target.value as Action;
const all: Record<string, Action> = {};
for (const d of dups) all[d.xref] = a;
setResolutions(all);
}}
defaultValue=""
>
<option value="" disabled>
choose
</option>
{ACTIONS.map((a) => (
<option key={a.value} value={a.value}>
{a.label}
</option>
))}
</select>
</label>
</div>
<ul className="divide-y divide-[var(--border)] rounded-lg border border-[var(--border)]">
{dups.map((d: Dup) => (
<li
key={d.xref}
className="flex flex-wrap items-center justify-between gap-3 px-3 py-2 text-sm"
>
<div className="min-w-0">
<span className="font-medium">{d.incoming_name}</span>
{d.incoming_birth_year && (
<span className="text-[var(--muted)]"> b. {d.incoming_birth_year}</span>
)}
<span className="text-[var(--muted)]"> </span>
<span>{d.existing_name}</span>
{d.existing_birth_year && (
<span className="text-[var(--muted)]"> b. {d.existing_birth_year}</span>
)}
<span
className={`ml-2 rounded px-1.5 py-0.5 text-xs ${
d.score === "high"
? "bg-bronze/15 text-bronze"
: "bg-[var(--border)]/50 text-[var(--muted)]"
}`}
>
{d.score}
</span>
</div>
<select
className={fieldCls}
value={resolutions[d.xref] ?? "new"}
onChange={(e) =>
setResolutions((r) => ({ ...r, [d.xref]: e.target.value as Action }))
}
>
{ACTIONS.map((a) => (
<option key={a.value} value={a.value}>
{a.label}
</option>
))}
</select>
</li>
))}
</ul>
</div>
)}
<div className="flex gap-2">
<Button onClick={runImport} disabled={busy}>
{busy ? "Importing…" : "Run import"}
</Button>
<Button variant="ghost" onClick={resetAll} disabled={busy}>
Cancel
</Button>
</div>
</div>
)}
{report && (
<div className="space-y-3 rounded-lg border border-[var(--border)] p-4">
+108 -1
View File
@@ -557,6 +557,27 @@ export interface paths {
patch: operations["update_media_api_v1_trees__tree_id__media__media_id__patch"];
trace?: never;
};
// GEDCOM preview (dry-run) path. Only POST is defined; every other HTTP method
// is typed `never`.
"/api/v1/trees/{tree_id}/gedcom/preview": {
// No path-level parameters are declared here; `tree_id` is declared on the
// POST operation referenced below.
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
get?: never;
put?: never;
/**
* Preview Gedcom
* @description Dry run: report counts and incoming people that look like duplicates of
* existing ones, so the user can choose how to resolve each before importing.
*/
post: operations["preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"];
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/v1/trees/{tree_id}/gedcom/import": {
parameters: {
query?: never;
@@ -566,7 +587,12 @@ export interface paths {
};
get?: never;
put?: never;
/** Import Gedcom */
/**
* Import Gedcom
* @description Import a GEDCOM. ``default_action`` (new|skip|merge|overwrite) applies to
* incoming people that match an existing one; ``resolutions`` is a JSON object
* {xref: {action, target_id}} overriding it per record.
*/
post: operations["import_gedcom_api_v1_trees__tree_id__gedcom_import_post"];
delete?: never;
options?: never;
@@ -599,6 +625,21 @@ export interface components {
Body_import_gedcom_api_v1_trees__tree_id__gedcom_import_post: {
/**
* File
* @description The uploaded file part of the multipart form.
*/
file: string;
/**
* Default Action
* @description Action applied to incoming people that match an existing one.
* The import operation's description lists new|skip|merge|overwrite, but the
* schema types this as a plain string (no enum).
* @default new
*/
default_action?: string;
/**
* Resolutions
* @description A JSON object serialized into a string form field. Per the
* import operation's description its shape is {xref: {action, target_id}} and
* it overrides the default action per record.
* @default {}
*/
resolutions?: string;
};
/**
* Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post
* @description Multipart form body of the GEDCOM preview operation. It carries
* only the uploaded file.
*/
Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post: {
/** File */
file: string;
};
/** Body_upload_media_api_v1_trees__tree_id__media_post */
Body_upload_media_api_v1_trees__tree_id__media_post: {
@@ -683,6 +724,26 @@ export interface components {
detail?: string | null;
confidence?: components["schemas"]["CitationConfidence"] | null;
};
/**
* DuplicateMatch
* @description Element type of ImportPreview.potential_duplicates: describes an
* incoming record alongside the existing person it looks like a duplicate of.
*/
DuplicateMatch: {
/**
* Xref
* @description Identifier of the incoming record. NOTE(review): presumably the
* same xref that keys the import `resolutions` object; confirm against the backend.
*/
xref: string;
/** Incoming Name */
incoming_name: string;
/**
* Incoming Birth Year
* @description Optional and nullable; typed as a string, not a number.
*/
incoming_birth_year?: string | null;
/**
* Existing Person Id
* Format: uuid
*/
existing_person_id: string;
/** Existing Name */
existing_name: string;
/**
* Existing Birth Year
* @description Optional and nullable; typed as a string, not a number.
*/
existing_birth_year?: string | null;
/**
* Score
* @description Plain string in the schema (no enum). NOTE(review): the GEDCOM
* page only special-cases the value "high"; the full set of values is not
* visible from this file.
*/
score: string;
};
/** EventCreate */
EventCreate: {
/** Event Type */
@@ -777,6 +838,17 @@ export interface components {
/** Detail */
detail?: components["schemas"]["ValidationError"][];
};
/**
* ImportPreview
* @description JSON body of the 200 response of the GEDCOM preview operation.
*/
ImportPreview: {
/**
* Counts
* @description Map from a label to a count. The set of keys is not fixed by
* the schema.
*/
counts: {
[key: string]: number;
};
/**
* Potential Duplicates
* @description List of DuplicateMatch entries.
*/
potential_duplicates: components["schemas"]["DuplicateMatch"][];
/**
* Unmapped Tags
* @description List of strings. NOTE(review): presumably GEDCOM tags the
* importer has no mapping for; confirm against the backend.
*/
unmapped_tags: string[];
};
/** ImportReport */
ImportReport: {
/** Counts */
@@ -2845,6 +2917,41 @@ export interface operations {
};
};
};
// Operation behind POST /api/v1/trees/{tree_id}/gedcom/preview: takes a
// multipart form upload and responds with an ImportPreview.
preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post: {
parameters: {
query?: never;
header?: never;
path: {
// Required path parameter, typed as a plain string here.
tree_id: string;
};
cookie?: never;
};
// The request body is mandatory (no `?`) and is always multipart/form-data.
requestBody: {
content: {
"multipart/form-data": components["schemas"]["Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"];
};
};
responses: {
/** @description Successful Response */
200: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["ImportPreview"];
};
};
/** @description Validation Error */
422: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["HTTPValidationError"];
};
};
};
};
import_gedcom_api_v1_trees__tree_id__gedcom_import_post: {
parameters: {
query?: never;
+167
View File
@@ -2422,12 +2422,67 @@
}
}
},
"/api/v1/trees/{tree_id}/gedcom/preview": {
"post": {
"tags": [
"gedcom"
],
"summary": "Preview Gedcom",
"description": "Dry run: report counts and incoming people that look like duplicates of\nexisting ones, so the user can choose how to resolve each before importing.",
"operationId": "preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post",
"parameters": [
{
"name": "tree_id",
"in": "path",
"required": true,
"schema": {
"type": "string",
"format": "uuid",
"title": "Tree Id"
}
}
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"$ref": "#/components/schemas/Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"
}
}
}
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ImportPreview"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/api/v1/trees/{tree_id}/gedcom/import": {
"post": {
"tags": [
"gedcom"
],
"summary": "Import Gedcom",
"description": "Import a GEDCOM. ``default_action`` (new|skip|merge|overwrite) applies to\nincoming people that match an existing one; ``resolutions`` is a JSON object\n{xref: {action, target_id}} overriding it per record.",
"operationId": "import_gedcom_api_v1_trees__tree_id__gedcom_import_post",
"parameters": [
{
@@ -2525,6 +2580,16 @@
"type": "string",
"contentMediaType": "application/octet-stream",
"title": "File"
},
"default_action": {
"type": "string",
"title": "Default Action",
"default": "new"
},
"resolutions": {
"type": "string",
"title": "Resolutions",
"default": "{}"
}
},
"type": "object",
@@ -2533,6 +2598,20 @@
],
"title": "Body_import_gedcom_api_v1_trees__tree_id__gedcom_import_post"
},
"Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post": {
"properties": {
"file": {
"type": "string",
"contentMediaType": "application/octet-stream",
"title": "File"
}
},
"type": "object",
"required": [
"file"
],
"title": "Body_preview_gedcom_api_v1_trees__tree_id__gedcom_preview_post"
},
"Body_upload_media_api_v1_trees__tree_id__media_post": {
"properties": {
"file": {
@@ -2854,6 +2933,62 @@
"type": "object",
"title": "CitationUpdate"
},
"DuplicateMatch": {
"properties": {
"xref": {
"type": "string",
"title": "Xref"
},
"incoming_name": {
"type": "string",
"title": "Incoming Name"
},
"incoming_birth_year": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Incoming Birth Year"
},
"existing_person_id": {
"type": "string",
"format": "uuid",
"title": "Existing Person Id"
},
"existing_name": {
"type": "string",
"title": "Existing Name"
},
"existing_birth_year": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Existing Birth Year"
},
"score": {
"type": "string",
"title": "Score"
}
},
"type": "object",
"required": [
"xref",
"incoming_name",
"existing_person_id",
"existing_name",
"score"
],
"title": "DuplicateMatch"
},
"EventCreate": {
"properties": {
"event_type": {
@@ -3246,6 +3381,38 @@
"type": "object",
"title": "HTTPValidationError"
},
"ImportPreview": {
"properties": {
"counts": {
"additionalProperties": {
"type": "integer"
},
"type": "object",
"title": "Counts"
},
"potential_duplicates": {
"items": {
"$ref": "#/components/schemas/DuplicateMatch"
},
"type": "array",
"title": "Potential Duplicates"
},
"unmapped_tags": {
"items": {
"type": "string"
},
"type": "array",
"title": "Unmapped Tags"
}
},
"type": "object",
"required": [
"counts",
"potential_duplicates",
"unmapped_tags"
],
"title": "ImportPreview"
},
"ImportReport": {
"properties": {
"counts": {