diff --git a/backend/app/api/v1/cleanup.py b/backend/app/api/v1/cleanup.py
index 666649b..1855ddb 100644
--- a/backend/app/api/v1/cleanup.py
+++ b/backend/app/api/v1/cleanup.py
@@ -57,6 +57,17 @@ async def preview_gender(
     return [GenderProposal(**r) for r in rows]
 
 
+@router.get("/{tree_id}/cleanup/gender/guess", response_model=list[GenderProposal])
+async def guess_gender(
+    tree_id: uuid.UUID, session: SessionDep, current: CurrentUser
+) -> list[GenderProposal]:
+    """Best-guess sex from first names (bundled dictionary) for people missing it."""
+    # The docstring above is published as the OpenAPI description; keep it in sync.
+    tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
+    proposals = await cleanup_service.guess_gender_by_name(session, actor=current, tree=tree)
+    return [GenderProposal(**proposal) for proposal in proposals]
+
+
 @router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
 async def apply_gender(
     tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
diff --git a/backend/app/services/cleanup_service.py b/backend/app/services/cleanup_service.py
index 506a4ef..5761728 100644
--- a/backend/app/services/cleanup_service.py
+++ b/backend/app/services/cleanup_service.py
@@ -19,6 +19,7 @@ from app.models.user import User
 from app.services import gedcom, privacy
 from app.services.audit import record_audit
 from app.services.exceptions import Forbidden, NotFound
+from app.services.name_gender_data import guess_sex
 
 
 async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
@@ -160,6 +161,32 @@ async def preview_gender(
     return out
 
 
+async def guess_gender_by_name(
+    session: AsyncSession, *, actor: User, tree: Tree
+) -> list[dict]:
+    """Propose a sex for each person in the tree whose gender is not set.
+
+    The proposal is looked up from the given name of the entry that
+    ``_primary_name_by_person`` holds for the person, using the bundled name
+    dictionary. People without such an entry, and names the dictionary does not
+    classify, are skipped. Returns one dict per proposal with the keys
+    ``person_id``, ``name`` and ``proposed_gender``, ordered by ``name``.
+    """
+    await _require_editor(session, actor=actor, tree=tree)
+    primary_names = await _primary_name_by_person(session, tree.id)
+    proposals: list[dict] = []
+    for person in await _persons(session, tree.id):
+        # A recorded gender is never second-guessed; a person with no name can't be guessed.
+        if person.gender or (name := primary_names.get(person.id)) is None:
+            continue
+        sex = guess_sex(name.given)
+        if not sex:
+            continue
+        proposals.append(
+            {"person_id": str(person.id), "name": _display(name), "proposed_gender": sex}
+        )
+    return sorted(proposals, key=lambda row: row["name"])
+
+
 async def apply_gender(
     session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
 ) -> int:
diff --git a/backend/app/services/name_gender_data.py b/backend/app/services/name_gender_data.py
new file mode 100644
index 0000000..0fd34a1
--- /dev/null
+++ b/backend/app/services/name_gender_data.py
@@ -0,0 +1,69 @@
+"""A curated given-name -> sex lookup for best-guessing a person's sex from
+their first name. Weighted toward English + German names (this codebase's first
+real tree is a German-American family). Deterministic and offline — no model
+needed; the Cleanup tool previews every guess before anything is applied.
+
+Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
+Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
+a human decides those than a coin flip.
+"""
+
+MALE_NAMES: set[str] = {
+    # English / common US
+    "james", "john", "robert", "michael", "william", "david", "richard", "joseph",
+    "thomas", "charles", "christopher", "daniel", "matthew", "anthony", "donald",
+    "mark", "paul", "steven", "andrew", "kenneth", "george", "joshua", "kevin",
+    "brian", "edward", "ronald", "timothy", "jason", "jeffrey", "gary", "ryan",
+    "nicholas", "eric", "stephen", "jacob", "larry", "frank", "jonathan", "scott",
+    "raymond", "gregory", "samuel", "benjamin", "patrick", "jack", "dennis", "jerry",
+    "alexander", "tyler", "henry", "douglas", "peter", "adam", "harold", "albert",
+    "arthur", "carl", "ralph", "roy", "eugene", "louis", "philip", "bobby", "walter",
+    "willie", "wayne", "fred", "howard", "ernest", "earl", "clarence", "leon",
+    "leonard", "lewis", "floyd", "leroy", "elmer", "homer", "orrin", "josias",
+    "emerson", "dale", "bernard", "vernon", "virgil", "wilbur", "russell",
+    "harvey", "herbert", "melvin", "lloyd", "marvin", "norman", "stanley",
+    # German
+    "hans", "karl", "wilhelm", "friedrich", "heinrich", "otto", "hermann", "gustav",
+    "ludwig", "ernst", "fritz", "johann", "conrad", "konrad", "reinhold", "rudolf",
+    "rudolph", "gerhard", "helmut", "horst", "klaus", "kurt", "dieter", "günther",
+    "gunther", "manfred", "siegfried", "hilgard", "christian", "august", "wolfgang",
+    "jürgen", "jurgen", "matthias", "lothar", "bruno", "gottlieb", "reinhard",
+}
+
+FEMALE_NAMES: set[str] = {
+    # English / common US
+    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan",
+    "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra",
+    "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy",
+    "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
+    "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela",
+    "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel",
+    "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria",
+    "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice",  # no "frances": ambiguous
+    "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della",
+    "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha",
+    "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris",
+    "rose", "rita", "norma", "june", "lois", "marjorie",
+    # German
+    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else",
+    "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde",
+    "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna",
+    "katharina", "margarethe", "wilhelmine", "emilie", "auguste",
+}
+
+
+def guess_sex(given: str | None) -> str | None:
+    """Best-guess "male"/"female" from the first token of a given name.
+
+    Returns None for None/blank input and for any name found in neither set
+    (unknown, or deliberately omitted as ambiguous). Matching ignores case.
+    """
+    tokens = (given or "").split()
+    # Drop wrapping punctuation so "Flora," or "Wm." reduce to the bare token.
+    first = tokens[0].lower().strip(".,'\"") if tokens else ""
+    if first in MALE_NAMES:
+        return "male"
+    if first in FEMALE_NAMES:
+        return "female"
+    # Empty, unknown and ambiguous names are all left for a human to decide.
+    return None
diff --git a/backend/tests/test_cleanup.py b/backend/tests/test_cleanup.py
index e452676..61c426a 100644
--- a/backend/tests/test_cleanup.py
+++ b/backend/tests/test_cleanup.py
@@ -87,6 +87,28 @@ async def test_gender_from_source(client):
     assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
 
 
+async def test_guess_gender_from_first_name(client):
+    h, tid = await _tree(client, "cl-guess@example.com")
+    await _person(client, h, tid, "William", "Paul")  # male
+    await _person(client, h, tid, "Flora", "Reier")  # female
+    await _person(client, h, tid, "Marion", "Doe")  # ambiguous -> skipped
+    # Already-gendered person is left alone even if guessable.
+ gendered = await _person(client, h, tid, "James", "Known") + await client.patch( + f"/api/v1/trees/{tid}/persons/{gendered}", json={"gender": "male"}, headers=h + ) + + prev = (await client.get(f"/api/v1/trees/{tid}/cleanup/gender/guess", headers=h)).json() + by = {p["name"]: p["proposed_gender"] for p in prev} + assert by == {"William Paul": "male", "Flora Reier": "female"} + + updates = [{"person_id": p["person_id"], "gender": p["proposed_gender"]} for p in prev] + r = await client.post( + f"/api/v1/trees/{tid}/cleanup/gender", json={"updates": updates}, headers=h + ) + assert r.status_code == 200 and r.json()["updated"] == 2 + + async def test_name_issues_preview_and_fix(client): h, tid = await _tree(client, "cl-name@example.com") # surname got a date; real surname landed in the given name. diff --git a/frontend/app/trees/[id]/cleanup/page.tsx b/frontend/app/trees/[id]/cleanup/page.tsx index addfdc3..d159d62 100644 --- a/frontend/app/trees/[id]/cleanup/page.tsx +++ b/frontend/app/trees/[id]/cleanup/page.tsx @@ -77,6 +77,15 @@ export default function CleanupPage() { setGenSel(new Set(data.map((g) => g.person_id))); } } + async function guessGender() { + setGenMsg(null); + const { data } = await api.GET("/api/v1/trees/{tree_id}/cleanup/gender/guess", { + params: { path: { tree_id: treeId } }, + }); + setGender(data ?? []); + setGenSel(new Set((data ?? []).map((g) => g.person_id))); + } + async function applyGender() { const updates = (gender ?? []) .filter((g) => genSel.has(g.person_id)) @@ -205,9 +214,18 @@ export default function CleanupPage() { onChange={previewGender} className="hidden" /> - +
+ + +
+

+ “Guess from first name” uses a built-in name dictionary for people with no sex set; + ambiguous names (Marion, Frances, …) are left for you to decide. +

{genMsg &&

{genMsg}

} {gender && (
diff --git a/frontend/lib/api/schema.d.ts b/frontend/lib/api/schema.d.ts index 86fffd3..d9cd02b 100644 --- a/frontend/lib/api/schema.d.ts +++ b/frontend/lib/api/schema.d.ts @@ -714,6 +714,26 @@ export interface paths { patch?: never; trace?: never; }; + "/api/v1/trees/{tree_id}/cleanup/gender/guess": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Guess Gender + * @description Best-guess sex from first names (bundled dictionary) for people missing it. + */ + get: operations["guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/v1/trees/{tree_id}/cleanup/gender": { parameters: { query?: never; @@ -3481,6 +3501,37 @@ export interface operations { }; }; }; + guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get: { + parameters: { + query?: never; + header?: never; + path: { + tree_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["GenderProposal"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; apply_gender_api_v1_trees__tree_id__cleanup_gender_post: { parameters: { query?: never; diff --git a/frontend/openapi.json b/frontend/openapi.json index 47c7215..b073713 100644 --- a/frontend/openapi.json +++ b/frontend/openapi.json @@ -2867,6 +2867,54 @@ } } }, + "/api/v1/trees/{tree_id}/cleanup/gender/guess": { + "get": { + "tags": [ + "cleanup" + ], + "summary": "Guess Gender", + "description": "Best-guess sex from first names (bundled dictionary) for people missing it.", + "operationId": 
"guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get", + "parameters": [ + { + "name": "tree_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "format": "uuid", + "title": "Tree Id" + } + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/GenderProposal" + }, + "title": "Response Guess Gender Api V1 Trees Tree Id Cleanup Gender Guess Get" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, "/api/v1/trees/{tree_id}/cleanup/gender": { "post": { "tags": [