6ec852a23a
A "Guess from first name" option in the Cleanup gender section: a bundled, curated given-name -> sex dictionary (weighted English + German for the first real tree) proposes sex for people who don't have it set. Deterministic, offline, no model. Genuinely ambiguous names (Marion, Frances, Jordan, …) are excluded from both sets so they're left for a human. Reuses the existing preview/apply gender flow, so every guess is reviewed before saving. No migration. 56 backend tests pass; frontend builds. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
70 lines
3.8 KiB
Python
70 lines
3.8 KiB
Python
"""A curated given-name -> sex lookup for best-guessing a person's sex from
their first name. Weighted toward English + German names (this codebase's first
real tree is a German-American family). Deterministic and offline — no model
needed; the Cleanup tool previews every guess before anything is applied.

Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
a human decides those than a coin flip.
"""
|
|
|
|
# Male given names, stored lower-case. Kept as two origin-specific groups that
# are merged into the public MALE_NAMES set below.

# English / common US given names.
_MALE_NAMES_ENGLISH: set[str] = {
    "james", "john", "robert", "michael", "william", "david",
    "richard", "joseph", "thomas", "charles", "christopher", "daniel",
    "matthew", "anthony", "donald", "mark", "paul", "steven",
    "andrew", "kenneth", "george", "joshua", "kevin", "brian",
    "edward", "ronald", "timothy", "jason", "jeffrey", "gary",
    "ryan", "nicholas", "eric", "stephen", "jacob", "larry",
    "frank", "jonathan", "scott", "raymond", "gregory", "samuel",
    "benjamin", "patrick", "jack", "dennis", "jerry", "alexander",
    "tyler", "henry", "douglas", "peter", "adam", "harold",
    "albert", "arthur", "carl", "ralph", "roy", "eugene",
    "louis", "philip", "bobby", "walter", "willie", "wayne",
    "fred", "howard", "ernest", "earl", "clarence", "leon",
    "leonard", "lewis", "floyd", "leroy", "elmer", "homer",
    "orrin", "josias", "emerson", "dale", "bernard", "vernon",
    "virgil", "wilbur", "russell", "harvey", "herbert", "melvin",
    "lloyd", "marvin", "norman", "stanley",
}

# German given names. Umlaut names are listed together with their plain-ASCII
# spelling (günther/gunther, jürgen/jurgen) so either form is recognised.
_MALE_NAMES_GERMAN: set[str] = {
    "hans", "karl", "wilhelm", "friedrich", "heinrich", "otto",
    "hermann", "gustav", "ludwig", "ernst", "fritz", "johann",
    "conrad", "konrad", "reinhold", "rudolf", "rudolph", "gerhard",
    "helmut", "horst", "klaus", "kurt", "dieter", "günther",
    "gunther", "manfred", "siegfried", "hilgard", "christian", "august",
    "wolfgang", "jürgen", "jurgen", "matthias", "lothar", "bruno",
    "gottlieb", "reinhard",
}

MALE_NAMES: set[str] = _MALE_NAMES_ENGLISH | _MALE_NAMES_GERMAN
|
|
|
|
# Female given names, stored lower-case.
#
# "frances" is deliberately NOT listed: Frances/Francis is one of the names
# this module documents as too ambiguous to guess, so it must stay out of both
# the male and the female set and be left for a human to decide.
FEMALE_NAMES: set[str] = {
    # English / common US
    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan",
    "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra",
    "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy",
    "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
    "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela",
    "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel",
    "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria",
    "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice",
    "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della",
    "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha",
    "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris",
    "rose", "rita", "norma", "june", "lois", "marjorie",
    # German
    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else",
    "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde",
    "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna",
    "katharina", "margarethe", "wilhelmine", "emilie", "auguste",
}
|
|
|
|
|
|
def guess_sex(given: str | None) -> str | None:
    """Best-guess "male"/"female" from the first token of a given name.

    Only the first whitespace-separated token is looked at, so middle names
    and initials after it are ignored. The token is lower-cased and has
    surrounding ``.``, ``,``, ``'`` and ``"`` characters removed before it is
    looked up in MALE_NAMES and then FEMALE_NAMES.

    Args:
        given: The person's given name(s), or None.

    Returns:
        "male" or "female" when the first token is in the matching name set;
        None when the input is empty/blank or the token is in neither set.
    """
    # Imported locally so the function carries its own dependency.
    import unicodedata

    if not given:
        return None

    # split() with no argument already discards leading/trailing whitespace
    # and yields an empty list for a blank string.
    tokens = given.split()
    if not tokens:
        return None

    # Strip punctuation on both ends, e.g. "wm." -> "wm", '"fritz"' -> "fritz".
    token = tokens[0].lower().strip(".,'\"")
    if not token:
        return None

    # An umlaut may arrive decomposed (base letter + combining diaeresis),
    # which compares unequal to the precomposed spelling. Try the token as
    # given first, then its NFC-composed form, so canonically equivalent
    # spellings get the same guess and every previous match is preserved.
    composed = unicodedata.normalize("NFC", token)
    for candidate in (token, composed):
        if candidate in MALE_NAMES:
            return "male"
        if candidate in FEMALE_NAMES:
            return "female"
    return None
|