Cleanup: best-guess sex from first name (offline dictionary)
A "Guess from first name" option in the Cleanup gender section: a bundled, curated given-name -> sex dictionary (weighted English + German for the first real tree) proposes sex for people who don't have it set. Deterministic, offline, no model. Genuinely ambiguous names (Marion, Frances, Jordan, …) are excluded from both sets so they're left for a human. Reuses the existing preview/apply gender flow, so every guess is reviewed before saving. No migration. 56 backend tests pass; frontend builds. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ from app.models.user import User
|
||||
from app.services import gedcom, privacy
|
||||
from app.services.audit import record_audit
|
||||
from app.services.exceptions import Forbidden, NotFound
|
||||
from app.services.name_gender_data import guess_sex
|
||||
|
||||
|
||||
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
|
||||
@@ -160,6 +161,27 @@ async def preview_gender(
|
||||
return out
|
||||
|
||||
|
||||
async def guess_gender_by_name(
    session: AsyncSession, *, actor: User, tree: Tree
) -> list[dict]:
    """Propose a sex, guessed from the first given name, for people lacking one.

    People whose ``gender`` is already set, people with no primary name, and
    people whose name the bundled dictionary cannot classify (unknown or
    ambiguous) are left out of the result.

    Returns:
        A list of ``{"person_id", "name", "proposed_gender"}`` dicts sorted by
        the ``"name"`` value. Nothing is modified here; the rows are proposals.
    """
    # Access check first; presumably raises for non-editors (defined elsewhere
    # in this module -- confirm there).
    await _require_editor(session, actor=actor, tree=tree)

    primary_names = await _primary_name_by_person(session, tree.id)
    proposals: list[dict] = []
    for person in await _persons(session, tree.id):
        # Only people without a recorded gender are candidates.
        if person.gender:
            continue
        primary = primary_names.get(person.id)
        if primary is None:
            continue
        guess = guess_sex(primary.given)
        if not guess:
            # Unknown or ambiguous first name: leave it for a human.
            continue
        proposals.append(
            {
                "person_id": str(person.id),
                "name": _display(primary),
                "proposed_gender": guess,
            }
        )
    return sorted(proposals, key=lambda row: row["name"])
|
||||
|
||||
|
||||
async def apply_gender(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
|
||||
) -> int:
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
"""A curated given-name -> sex lookup for best-guessing a person's sex from
|
||||
their first name. Weighted toward English + German names (this codebase's first
|
||||
real tree is a German-American family). Deterministic and offline — no model
|
||||
needed; the Cleanup tool previews every guess before anything is applied.
|
||||
|
||||
Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
|
||||
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
|
||||
a human decides those than a coin flip.
|
||||
"""
|
||||
|
||||
# Given names treated as male. Entries are lower-case because guess_sex()
# lower-cases the name before looking it up. Each group is kept in
# alphabetical order so additions and duplicates are easy to spot.
MALE_NAMES: set[str] = {
    # English / common US
    "adam", "albert", "alexander", "andrew", "anthony", "arthur", "benjamin",
    "bernard", "bobby", "brian", "carl", "charles", "christopher", "clarence",
    "dale", "daniel", "david", "dennis", "donald", "douglas", "earl", "edward",
    "elmer", "emerson", "eric", "ernest", "eugene", "floyd", "frank", "fred",
    "gary", "george", "gregory", "harold", "harvey", "henry", "herbert", "homer",
    "howard", "jack", "jacob", "james", "jason", "jeffrey", "jerry", "john",
    "jonathan", "joseph", "joshua", "josias", "kenneth", "kevin", "larry", "leon",
    "leonard", "leroy", "lewis", "lloyd", "louis", "mark", "marvin", "matthew",
    "melvin", "michael", "nicholas", "norman", "orrin", "patrick", "paul", "peter",
    "philip", "ralph", "raymond", "richard", "robert", "ronald", "roy", "russell",
    "ryan", "samuel", "scott", "stanley", "stephen", "steven", "thomas", "timothy",
    "tyler", "vernon", "virgil", "walter", "wayne", "wilbur", "william", "willie",
    # German
    "august", "bruno", "christian", "conrad", "dieter", "ernst", "friedrich",
    "fritz", "gerhard", "gottlieb", "gunther", "gustav", "günther", "hans",
    "heinrich", "helmut", "hermann", "hilgard", "horst", "johann", "jurgen",
    "jürgen", "karl", "klaus", "konrad", "kurt", "lothar", "ludwig", "manfred",
    "matthias", "otto", "reinhard", "reinhold", "rudolf", "rudolph", "siegfried",
    "wilhelm", "wolfgang",
}
|
||||
|
||||
# Given names treated as female. Entries are lower-case because guess_sex()
# lower-cases the name before looking it up.
# NOTE: "frances" is deliberately absent. Frances/Francis is one of the
# ambiguous names this module leaves for a human to decide, so it must not
# appear in either set.
FEMALE_NAMES: set[str] = {
    # English / common US
    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan",
    "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra",
    "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy",
    "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
    "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela",
    "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel",
    "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria",
    "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice",
    "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della",
    "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha",
    "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris",
    "rose", "rita", "norma", "june", "lois", "marjorie",
    # German
    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else",
    "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde",
    "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna",
    "katharina", "margarethe", "wilhelmine", "emilie", "auguste",
}
|
||||
|
||||
|
||||
def guess_sex(given: str | None) -> str | None:
|
||||
"""Best-guess "male"/"female" from the first token of a given name, or None
|
||||
if unknown/ambiguous."""
|
||||
if not given:
|
||||
return None
|
||||
first = given.strip().split()[0].lower() if given.strip() else ""
|
||||
# Strip trailing punctuation/initials like "wm." -> "wm".
|
||||
first = first.strip(".,'\"")
|
||||
if not first:
|
||||
return None
|
||||
if first in MALE_NAMES:
|
||||
return "male"
|
||||
if first in FEMALE_NAMES:
|
||||
return "female"
|
||||
return None
|
||||
Reference in New Issue
Block a user