Cleanup: best-guess sex from first name (offline dictionary)

A "Guess from first name" option in the Cleanup gender section: a bundled,
curated given-name -> sex dictionary (weighted English + German for the first
real tree) proposes sex for people who don't have it set. Deterministic, offline,
no model. Genuinely ambiguous names (Marion, Frances, Jordan, …) are excluded
from both sets so they're left for a human. Reuses the existing preview/apply
gender flow, so every guess is reviewed before saving.

No migration. 56 backend tests pass; frontend builds.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 10:30:35 -04:00
parent 7405ec762f
commit 6ec852a23a
7 changed files with 243 additions and 3 deletions
+22
View File
@@ -19,6 +19,7 @@ from app.models.user import User
from app.services import gedcom, privacy
from app.services.audit import record_audit
from app.services.exceptions import Forbidden, NotFound
from app.services.name_gender_data import guess_sex
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
@@ -160,6 +161,27 @@ async def preview_gender(
return out
async def guess_gender_by_name(
    session: AsyncSession, *, actor: User, tree: Tree
) -> list[dict]:
    """Propose a sex for every person in ``tree`` whose gender is unset.

    The proposal comes from ``guess_sex`` applied to the given name of the
    person's primary name record. People without a primary name, and names
    for which ``guess_sex`` returns nothing, are left out of the result.

    Returns a list of ``{"person_id", "name", "proposed_gender"}`` dicts
    ordered by ``name``. Nothing is written here; the rows are only proposals.
    """
    # Access check comes first, before any tree data is loaded.
    await _require_editor(session, actor=actor, tree=tree)

    primary_names = await _primary_name_by_person(session, tree.id)
    people = await _persons(session, tree.id)

    suggestions: list[dict] = []
    for person in people:
        # Only people with no gender yet are candidates for a guess.
        name = None if person.gender else primary_names.get(person.id)
        if name is None:
            continue
        sex = guess_sex(name.given)
        if not sex:
            continue
        suggestions.append(
            {
                "person_id": str(person.id),
                "name": _display(name),
                "proposed_gender": sex,
            }
        )
    return sorted(suggestions, key=lambda row: row["name"])
async def apply_gender(
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
) -> int:
+69
View File
@@ -0,0 +1,69 @@
"""A curated given-name -> sex lookup for best-guessing a person's sex from
their first name. Weighted toward English + German names (this codebase's first
real tree is a German-American family). Deterministic and offline — no model
needed; the Cleanup tool previews every guess before anything is applied.
Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
a human decides those than a coin flip.
"""
# Lower-case given names that are guessed as male. Entries are alphabetised
# within each language group so duplicates and gaps are easy to spot.
MALE_NAMES: set[str] = {
    # English / common US
    "adam", "albert", "alexander", "andrew", "anthony", "arthur",
    "benjamin", "bernard", "bobby", "brian",
    "carl", "charles", "christopher", "clarence",
    "dale", "daniel", "david", "dennis", "donald", "douglas",
    "earl", "edward", "elmer", "emerson", "eric", "ernest", "eugene",
    "floyd", "frank", "fred",
    "gary", "george", "gregory",
    "harold", "harvey", "henry", "herbert", "homer", "howard",
    "jack", "jacob", "james", "jason", "jeffrey", "jerry", "john",
    "jonathan", "joseph", "joshua", "josias",
    "kenneth", "kevin",
    "larry", "leon", "leonard", "leroy", "lewis", "lloyd", "louis",
    "mark", "marvin", "matthew", "melvin", "michael",
    "nicholas", "norman",
    "orrin",
    "patrick", "paul", "peter", "philip",
    "ralph", "raymond", "richard", "robert", "ronald", "roy", "russell",
    "ryan",
    "samuel", "scott", "stanley", "stephen", "steven",
    "thomas", "timothy", "tyler",
    "vernon", "virgil",
    "walter", "wayne", "wilbur", "william", "willie",
    # German (umlaut spellings are listed alongside their ASCII variants)
    "august", "bruno", "christian", "conrad", "dieter", "ernst",
    "friedrich", "fritz", "gerhard", "gottlieb", "gunther", "günther",
    "gustav", "hans", "heinrich", "helmut", "hermann", "hilgard", "horst",
    "johann", "jurgen", "jürgen", "karl", "klaus", "konrad", "kurt",
    "lothar", "ludwig", "manfred", "matthias", "otto", "reinhard",
    "reinhold", "rudolf", "rudolph", "siegfried", "wilhelm", "wolfgang",
}
# Lower-case given names that are guessed as female. Entries are alphabetised
# within each language group so duplicates and gaps are easy to spot.
# Ambiguous names are deliberately absent: "frances" (easily confused with
# Francis) is not listed, so such people are left for a human to decide.
FEMALE_NAMES: set[str] = {
    # English / common US
    "alice", "althea", "amanda", "amy", "angela", "anna", "ashley",
    "augusta",
    "barbara", "beatrice", "bertha", "betty", "brenda",
    "carol", "carolyn", "catherine", "christina", "cynthia",
    "deborah", "debra", "della", "diane", "donna", "doris", "dorothy",
    "edith", "elizabeth", "ellen", "emily", "esther", "evelyn",
    "flora", "florence", "florentine",
    "gladys",
    "hazel", "heather", "hedwig", "helen",
    "irene",
    "janet", "jennifer", "jessica", "joan", "joyce", "judith", "julie",
    "june",
    "karen", "katherine", "kathleen", "kelly", "kimberly",
    "laura", "linda", "lisa", "lois", "lucille",
    "margaret", "maria", "marie", "marjorie", "mary", "megan", "melissa",
    "michelle", "mildred",
    "nancy", "nicole", "norma",
    "pamela", "patricia", "pauline",
    "rachel", "rebecca", "rita", "rose", "ruth",
    "sandra", "sarah", "sharon", "shirley", "stephanie", "susan",
    "thelma",
    "victoria", "virginia",
    "wilhelmina",
    "zella",
    # German
    "anneliese", "auguste", "brigitte", "edeltraud", "elke", "else",
    "emilie", "frida", "frieda", "gertrud", "gertrude", "gisela", "greta",
    "helga", "hilda", "hilde", "hildegard", "ilse", "ingrid", "johanna",
    "katharina", "margarethe", "monika", "renate", "sieglinde", "ursula",
    "waltraud", "wilhelmine",
}
def guess_sex(given: str | None) -> str | None:
"""Best-guess "male"/"female" from the first token of a given name, or None
if unknown/ambiguous."""
if not given:
return None
first = given.strip().split()[0].lower() if given.strip() else ""
# Strip trailing punctuation/initials like "wm." -> "wm".
first = first.strip(".,'\"")
if not first:
return None
if first in MALE_NAMES:
return "male"
if first in FEMALE_NAMES:
return "female"
return None