Files
justin 6ec852a23a Cleanup: best-guess sex from first name (offline dictionary)
A "Guess from first name" option in the Cleanup gender section: a bundled,
curated given-name -> sex dictionary (weighted English + German for the first
real tree) proposes sex for people who don't have it set. Deterministic, offline,
no model. Genuinely ambiguous names (Marion, Frances, Jordan, …) are excluded
from both sets so they're left for a human. Reuses the existing preview/apply
gender flow, so every guess is reviewed before saving.

No migration. 56 backend tests pass; frontend builds.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 10:30:35 -04:00

70 lines
3.8 KiB
Python

"""A curated given-name -> sex lookup for best-guessing a person's sex from
their first name. Weighted toward English + German names (this codebase's first
real tree is a German-American family). Deterministic and offline — no model
needed; the Cleanup tool previews every guess before anything is applied.
Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
a human decides those than a coin flip.
"""
# Lowercase given names that guess_sex() reports as "male". Each group is
# written as whitespace-separated fragments and split once at import time;
# every fragment ends with a space so neighbouring fragments never fuse two
# names together.
MALE_NAMES: set[str] = set(
    (
        # English / common US
        "james john robert michael william david richard joseph "
        "thomas charles christopher daniel matthew anthony donald "
        "mark paul steven andrew kenneth george joshua kevin "
        "brian edward ronald timothy jason jeffrey gary ryan "
        "nicholas eric stephen jacob larry frank jonathan scott "
        "raymond gregory samuel benjamin patrick jack dennis jerry "
        "alexander tyler henry douglas peter adam harold albert "
        "arthur carl ralph roy eugene louis philip bobby walter "
        "willie wayne fred howard ernest earl clarence leon "
        "leonard lewis floyd leroy elmer homer orrin josias "
        "emerson dale bernard vernon virgil wilbur russell "
        "harvey herbert melvin lloyd marvin norman stanley "
        # German
        "hans karl wilhelm friedrich heinrich otto hermann gustav "
        "ludwig ernst fritz johann conrad konrad reinhold rudolf "
        "rudolph gerhard helmut horst klaus kurt dieter günther "
        "gunther manfred siegfried hilgard christian august wolfgang "
        "jürgen jurgen matthias lothar bruno gottlieb reinhard "
    ).split()
)
# Lowercase given names that guess_sex() reports as "female".
# Names the module documents as ambiguous (Marion, Frances/Francis, Jordan,
# Jamie, Robin, Leslie, Dana, ...) must stay out of this set so that they are
# not guessed and are left for a human to decide.
FEMALE_NAMES: set[str] = {
    # English / common US
    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan",
    "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra",
    "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy",
    "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
    "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela",
    "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel",
    "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria",
    "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice",
    "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della",
    "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha",
    "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris",
    "rose", "rita", "norma", "june", "lois", "marjorie",
    # German
    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else",
    "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde",
    "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna",
    "katharina", "margarethe", "wilhelmine", "emilie", "auguste",
}
def guess_sex(given: str | None) -> str | None:
"""Best-guess "male"/"female" from the first token of a given name, or None
if unknown/ambiguous."""
if not given:
return None
first = given.strip().split()[0].lower() if given.strip() else ""
# Strip trailing punctuation/initials like "wm." -> "wm".
first = first.strip(".,'\"")
if not first:
return None
if first in MALE_NAMES:
return "male"
if first in FEMALE_NAMES:
return "female"
return None