Files
provenance/backend/tests/test_gedcom.py
T
justin 5824e70895 GEDCOM: duplicate-aware import + typed name/attribute mapping
Duplicate detection (the "merge / skip / overwrite" the user asked for):
- New POST /gedcom/preview dry-runs the file and flags incoming people that
  resemble existing ones (name similarity via difflib + birth-year guard;
  high/medium score). No writes.
- /gedcom/import takes default_action (new|skip|merge|overwrite) + per-xref
  resolutions {xref: {action, target_id}}:
    new       create as a new person (current behavior)
    skip      link families to the existing person, copy nothing
    merge     attach the incoming names (as alternates), events, citations,
              and notes onto the existing person
    overwrite soft-delete the existing person, import the incoming one fresh
  Relationship creation is deduped so a merge can't double an edge.

Richer record mapping (covers the user's repo's GEDCOM):
- Multiple NAME records honor their TYPE; _MARNM (and NICK) import as typed
  alternate names — maiden stays primary, married becomes a "married" Name.
- RELI -> a "religion" event with the value in detail; OCCU/EDUC values too.
- NOTE -> person notes (and event notes); NOTE/RELI are no longer "unmapped".
- Export round-trips name TYPE.

Verified against the user's 2185-person export: 0 unmapped tags. 48 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 10:35:55 -04:00

184 lines
6.2 KiB
Python

"""GEDCOM import + export round-trip."""
from tests.conftest import auth, register
# Minimal three-person GEDCOM fixture: John Smith (@I1@) and Mary Jones (@I2@)
# are the HUSB/WIFE of family @F1@, with Junior Smith (@I3@) as its CHIL.
# It carries two BIRT records (John, Junior) and one MARR record on the family.
SAMPLE = b"""0 HEAD
1 CHAR UTF-8
0 @I1@ INDI
1 NAME John /Smith/
1 SEX M
1 BIRT
2 DATE 1850
2 PLAC Boston, Massachusetts
0 @I2@ INDI
1 NAME Mary /Jones/
1 SEX F
0 @I3@ INDI
1 NAME Junior /Smith/
1 BIRT
2 DATE 1872
0 @F1@ FAM
1 HUSB @I1@
1 WIFE @I2@
1 CHIL @I3@
1 MARR
2 DATE 1870
0 TRLR
"""
async def _tree(client, email):
    """Register ``email``, create a tree named "Imported", return ``(headers, tree_id)``.

    The result of the conftest ``register`` helper is passed through ``auth``;
    whatever ``auth`` returns is sent as the request headers for the tree
    creation call and handed back to the caller for later requests.
    """
    registration = await register(client, email)
    headers = auth(registration)
    created = await client.post(
        "/api/v1/trees",
        json={"name": "Imported"},
        headers=headers,
    )
    tree_id = created.json()["id"]
    return headers, tree_id
async def test_gedcom_import(client):
    """Importing SAMPLE reports the expected counts and the rows are listable."""
    headers, tree_id = await _tree(client, "ged1@example.com")

    upload = {"file": ("sample.ged", SAMPLE, "text/plain")}
    imported = await client.post(
        f"/api/v1/trees/{tree_id}/gedcom/import",
        files=upload,
        headers=headers,
    )
    assert imported.status_code == 200, imported.text

    expected_counts = {
        "persons": 3,
        "families": 1,
        # partnership (1) + parent_child from both parents to the child (2)
        "relationships": 3,
        # 2 births + 1 marriage
        "events": 3,
    }
    counts = imported.json()["counts"]
    for key, want in expected_counts.items():
        assert counts[key] == want

    # The list endpoints must agree with the reported counts.
    people_resp = await client.get(f"/api/v1/trees/{tree_id}/persons", headers=headers)
    assert len(people_resp.json()) == 3
    rels_resp = await client.get(f"/api/v1/trees/{tree_id}/relationships", headers=headers)
    assert len(rels_resp.json()) == 3
async def test_gedcom_export_and_reimport(client):
    """Export an imported tree, then re-import that export into a second tree."""
    headers, source_tree = await _tree(client, "ged2@example.com")
    await client.post(
        f"/api/v1/trees/{source_tree}/gedcom/import",
        files={"file": ("sample.ged", SAMPLE, "text/plain")},
        headers=headers,
    )

    export_resp = await client.get(
        f"/api/v1/trees/{source_tree}/gedcom/export", headers=headers
    )
    assert export_resp.status_code == 200
    gedcom_text = export_resp.text
    for fragment in ("INDI", "FAM", "John /Smith/"):
        assert fragment in gedcom_text

    # Re-import the export into a fresh tree: people are preserved.
    created = await client.post("/api/v1/trees", json={"name": "Round"}, headers=headers)
    target_tree = created.json()["id"]
    reimport = await client.post(
        f"/api/v1/trees/{target_tree}/gedcom/import",
        files={"file": ("rt.ged", gedcom_text.encode(), "text/plain")},
        headers=headers,
    )
    counts = reimport.json()["counts"]
    assert counts["persons"] == 3
    assert counts["relationships"] == 3
# Single-person GEDCOM fixture for the richer tag mapping: a NICK and a _MARNM
# nested under the NAME record, a RELI value, a BIRT date, and a pipe-delimited
# NOTE (the shapes in the user's repo).
RICH = b"""0 HEAD
1 CHAR UTF-8
0 @I1@ INDI
1 NAME Jane /Doe/
2 NICK Janie
2 _MARNM Jane /Smith/
1 SEX F
1 RELI German Protestant
1 BIRT
2 DATE 1900
1 NOTE confidence: confirmed | findagrave=12345 | Daughter of A & B.
0 TRLR
"""
async def test_import_marnm_reli_note(client):
    """Import RICH and check typed names, the religion event, and person notes.

    Verifies that the import reports no unmapped tags, that the maiden name is
    the primary "birth" name carrying the nickname, that the _MARNM value
    becomes a non-primary "married" name, and that RELI becomes a "religion"
    event whose ``detail`` holds the value.
    """
    h, tid = await _tree(client, "ged-rich@example.com")
    resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("rich.ged", RICH, "text/plain")},
        headers=h,
    )
    assert resp.status_code == 200, resp.text
    report = resp.json()
    assert report["unmapped_tags"] == []  # NOTE and RELI are handled now

    person = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()[0]
    pid = person["id"]

    # Maiden name is primary; married name is a typed alternate.
    names = (
        await client.get(f"/api/v1/trees/{tid}/persons/{pid}/names", headers=h)
    ).json()
    by_type = {n["name_type"]: n for n in names}
    assert by_type["birth"]["surname"] == "Doe"
    assert by_type["birth"]["is_primary"] is True
    assert by_type["birth"]["nickname"] == "Janie"
    assert by_type["married"]["surname"] == "Smith"
    assert by_type["married"]["is_primary"] is False

    # Religion imported as an event with the value in detail.
    events = (
        await client.get(f"/api/v1/trees/{tid}/persons/{pid}/events", headers=h)
    ).json()
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # reported as RuntimeError, which hides what actually went wrong.
    reli = next((e for e in events if e["event_type"] == "religion"), None)
    assert reli is not None, events
    assert reli["detail"] == "German Protestant"

    # Notes are optional in the list payload, so only check them when the list
    # actually carries them. The previous `... or True` form could never fail.
    notes = person.get("notes")
    if notes:
        assert "findagrave=12345" in notes
async def test_preview_and_dedupe_merge(client):
    """Preview flags a duplicate; importing with a merge resolution does not duplicate it.

    Seeds a John Smith, checks that /gedcom/preview pairs the incoming John
    with that person, then imports SAMPLE with a per-xref "merge" resolution
    and verifies the merged count and the final number of people.
    """
    import json

    h, tid = await _tree(client, "ged-dupe@example.com")

    # Seed an existing person who will match the incoming one.
    await client.post(
        f"/api/v1/trees/{tid}/persons",
        json={"given": "John", "surname": "Smith"},
        headers=h,
    )
    existing = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()[0]

    # Preview flags @I1@ (John Smith) as a duplicate.
    prev = await client.post(
        f"/api/v1/trees/{tid}/gedcom/preview",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        headers=h,
    )
    assert prev.status_code == 200, prev.text
    dups = prev.json()["potential_duplicates"]
    # Use a default: a bare next() raising StopIteration inside a coroutine is
    # reported as RuntimeError, which hides that no duplicate was flagged.
    john = next((d for d in dups if d["incoming_name"].startswith("John")), None)
    assert john is not None, dups
    assert john["existing_person_id"] == existing["id"]

    # Import, merging John into the existing person; the others come in new.
    resolutions = json.dumps(
        {john["xref"]: {"action": "merge", "target_id": existing["id"]}}
    )
    resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        data={"resolutions": resolutions},
        headers=h,
    )
    assert resp.status_code == 200, resp.text
    counts = resp.json()["counts"]
    assert counts["merged"] == 1

    # 1 existing + Mary + Junior = 3 (John was merged, not duplicated).
    people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()
    assert len(people) == 3
async def test_dedupe_skip_default(client):
    """With ``default_action=skip``, a matching incoming person is not re-created.

    Seeds a John Smith, imports SAMPLE with the default action set to "skip",
    and verifies one person is reported as skipped and the tree ends with
    three people.
    """
    h, tid = await _tree(client, "ged-skip@example.com")

    # Seed the person the incoming John Smith will match. The original URL
    # expression carried a dead `... if False else ...` branch; only this
    # endpoint was ever used.
    await client.post(
        f"/api/v1/trees/{tid}/persons",
        json={"given": "John", "surname": "Smith"},
        headers=h,
    )

    resp = await client.post(
        f"/api/v1/trees/{tid}/gedcom/import",
        files={"file": ("s.ged", SAMPLE, "text/plain")},
        data={"default_action": "skip"},
        headers=h,
    )
    assert resp.status_code == 200, resp.text
    counts = resp.json()["counts"]
    assert counts.get("skipped", 0) == 1

    # John skipped (links to existing), Mary + Junior added = 3 total.
    people = (await client.get(f"/api/v1/trees/{tid}/persons", headers=h)).json()
    assert len(people) == 3