"""A curated given-name -> sex lookup for best-guessing a person's sex from their first name. Weighted toward English + German names (this codebase's first real tree is a German-American family). Deterministic and offline — no model needed; the Cleanup tool previews every guess before anything is applied. Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie, Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better a human decides those than a coin flip. """ MALE_NAMES: set[str] = { # English / common US "james", "john", "robert", "michael", "william", "david", "richard", "joseph", "thomas", "charles", "christopher", "daniel", "matthew", "anthony", "donald", "mark", "paul", "steven", "andrew", "kenneth", "george", "joshua", "kevin", "brian", "edward", "ronald", "timothy", "jason", "jeffrey", "gary", "ryan", "nicholas", "eric", "stephen", "jacob", "larry", "frank", "jonathan", "scott", "raymond", "gregory", "samuel", "benjamin", "patrick", "jack", "dennis", "jerry", "alexander", "tyler", "henry", "douglas", "peter", "adam", "harold", "albert", "arthur", "carl", "ralph", "roy", "eugene", "louis", "philip", "bobby", "walter", "willie", "wayne", "fred", "howard", "ernest", "earl", "clarence", "leon", "leonard", "lewis", "floyd", "leroy", "elmer", "homer", "orrin", "josias", "emerson", "dale", "bernard", "vernon", "virgil", "wilbur", "russell", "harvey", "herbert", "melvin", "lloyd", "marvin", "norman", "stanley", # German "hans", "karl", "wilhelm", "friedrich", "heinrich", "otto", "hermann", "gustav", "ludwig", "ernst", "fritz", "johann", "conrad", "konrad", "reinhold", "rudolf", "rudolph", "gerhard", "helmut", "horst", "klaus", "kurt", "dieter", "günther", "gunther", "manfred", "siegfried", "hilgard", "christian", "august", "wolfgang", "jürgen", "jurgen", "matthias", "lothar", "bruno", "gottlieb", "reinhard", } FEMALE_NAMES: set[str] = { # English / common US "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan", "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra", "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy", "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia", "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela", "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel", "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria", "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice", "frances", "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della", "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha", "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris", "rose", "rita", "norma", "june", "lois", "marjorie", # German "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else", "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde", "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna", "katharina", "margarethe", "wilhelmine", "emilie", "auguste", } def guess_sex(given: str | None) -> str | None: """Best-guess "male"/"female" from the first token of a given name, or None if unknown/ambiguous.""" if not given: return None first = given.strip().split()[0].lower() if given.strip() else "" # Strip trailing punctuation/initials like "wm." -> "wm". first = first.strip(".,'\"") if not first: return None if first in MALE_NAMES: return "male" if first in FEMALE_NAMES: return "female" return None