Merge pull request 'Cleanup: best-guess sex from first name (offline dictionary)' (#32) from gender-name-guess into main
This commit was merged in pull request #32.
This commit is contained in:
@@ -57,6 +57,16 @@ async def preview_gender(
|
||||
return [GenderProposal(**r) for r in rows]
|
||||
|
||||
|
||||
@router.get("/{tree_id}/cleanup/gender/guess", response_model=list[GenderProposal])
async def guess_gender(
    tree_id: uuid.UUID, session: SessionDep, current: CurrentUser
) -> list[GenderProposal]:
    """Best-guess sex from first names (bundled dictionary) for people missing it."""
    # NOTE: the docstring above is surfaced verbatim as the OpenAPI description,
    # so it is kept short and user-facing; implementation notes live in comments.

    # Look the tree up on behalf of the current viewer before asking the
    # cleanup service for proposals.
    tree = await tree_service.get_tree(session, viewer_id=current.id, tree_id=tree_id)
    proposals = await cleanup_service.guess_gender_by_name(
        session, actor=current, tree=tree
    )
    # The service returns plain dicts; wrap each one in the response schema.
    return [GenderProposal(**proposal) for proposal in proposals]
|
||||
|
||||
|
||||
@router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
|
||||
async def apply_gender(
|
||||
tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
|
||||
|
||||
@@ -19,6 +19,7 @@ from app.models.user import User
|
||||
from app.services import gedcom, privacy
|
||||
from app.services.audit import record_audit
|
||||
from app.services.exceptions import Forbidden, NotFound
|
||||
from app.services.name_gender_data import guess_sex
|
||||
|
||||
|
||||
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
|
||||
@@ -160,6 +161,27 @@ async def preview_gender(
|
||||
return out
|
||||
|
||||
|
||||
async def guess_gender_by_name(
    session: AsyncSession, *, actor: User, tree: Tree
) -> list[dict]:
    """Propose a sex for every person in *tree* that has none, based on the name dictionary.

    The `_require_editor` check runs first. People who already have a gender,
    who have no primary name, or whose given name `guess_sex` cannot classify
    are skipped.

    Returns:
        One dict per proposal with the keys ``person_id`` (stringified id),
        ``name`` (display name) and ``proposed_gender``, ordered by ``name``.
    """
    await _require_editor(session, actor=actor, tree=tree)
    primary_names = await _primary_name_by_person(session, tree.id)

    proposals: list[dict] = []
    for person in await _persons(session, tree.id):
        if person.gender:
            # Never override a value that is already set.
            continue
        primary = primary_names.get(person.id)
        guess = guess_sex(primary.given) if primary is not None else None
        if not guess:
            # No primary name, or the name is unknown/ambiguous.
            continue
        proposals.append(
            {
                "person_id": str(person.id),
                "name": _display(primary),
                "proposed_gender": guess,
            }
        )

    return sorted(proposals, key=lambda row: row["name"])
|
||||
|
||||
|
||||
async def apply_gender(
|
||||
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
|
||||
) -> int:
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
"""A curated given-name -> sex lookup for best-guessing a person's sex from
|
||||
their first name. Weighted toward English + German names (this codebase's first
|
||||
real tree is a German-American family). Deterministic and offline — no model
|
||||
needed; the Cleanup tool previews every guess before anything is applied.
|
||||
|
||||
Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
|
||||
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
|
||||
a human decides those than a coin flip.
|
||||
"""
|
||||
|
||||
# Lower-cased given names that guess_sex() classifies as "male".
MALE_NAMES: set[str] = {
    # --- English / common US ---
    "james", "john", "robert", "michael", "william", "david",
    "richard", "joseph", "thomas", "charles", "christopher", "daniel",
    "matthew", "anthony", "donald", "mark", "paul", "steven",
    "andrew", "kenneth", "george", "joshua", "kevin", "brian",
    "edward", "ronald", "timothy", "jason", "jeffrey", "gary",
    "ryan", "nicholas", "eric", "stephen", "jacob", "larry",
    "frank", "jonathan", "scott", "raymond", "gregory", "samuel",
    "benjamin", "patrick", "jack", "dennis", "jerry", "alexander",
    "tyler", "henry", "douglas", "peter", "adam", "harold",
    "albert", "arthur", "carl", "ralph", "roy", "eugene",
    "louis", "philip", "bobby", "walter", "willie", "wayne",
    "fred", "howard", "ernest", "earl", "clarence", "leon",
    "leonard", "lewis", "floyd", "leroy", "elmer", "homer",
    "orrin", "josias", "emerson", "dale", "bernard", "vernon",
    "virgil", "wilbur", "russell", "harvey", "herbert", "melvin",
    "lloyd", "marvin", "norman", "stanley",
    # --- German (umlaut and ASCII spellings are both listed) ---
    "hans", "karl", "wilhelm", "friedrich", "heinrich", "otto",
    "hermann", "gustav", "ludwig", "ernst", "fritz", "johann",
    "conrad", "konrad", "reinhold", "rudolf", "rudolph", "gerhard",
    "helmut", "horst", "klaus", "kurt", "dieter", "günther",
    "gunther", "manfred", "siegfried", "hilgard", "christian", "august",
    "wolfgang", "jürgen", "jurgen", "matthias", "lothar", "bruno",
    "gottlieb", "reinhard",
}
|
||||
|
||||
# Lower-cased given names that guess_sex() classifies as "female".
# "frances" is deliberately NOT listed: the module docstring and the Cleanup UI
# both name Frances/Francis as ambiguous, so it must be left for a human to decide.
FEMALE_NAMES: set[str] = {
    # English / common US
    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara", "susan",
    "jessica", "sarah", "karen", "nancy", "lisa", "betty", "margaret", "sandra",
    "ashley", "kimberly", "emily", "donna", "michelle", "carol", "amanda", "dorothy",
    "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
    "kathleen", "amy", "angela", "shirley", "anna", "ruth", "brenda", "pamela",
    "nicole", "katherine", "virginia", "catherine", "helen", "debra", "rachel",
    "carolyn", "janet", "maria", "heather", "diane", "julie", "joyce", "victoria",
    "kelly", "christina", "joan", "evelyn", "judith", "megan", "alice",
    "marie", "florence", "flora", "zella", "thelma", "ellen", "althea", "della",
    "beatrice", "pauline", "hedwig", "florentine", "wilhelmina", "augusta", "bertha",
    "gladys", "mildred", "lucille", "edith", "esther", "irene", "hazel", "doris",
    "rose", "rita", "norma", "june", "lois", "marjorie",
    # German
    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda", "frida", "else",
    "hilda", "hilde", "hildegard", "ingrid", "helga", "renate", "monika", "sieglinde",
    "brigitte", "gisela", "elke", "anneliese", "waltraud", "edeltraud", "johanna",
    "katharina", "margarethe", "wilhelmine", "emilie", "auguste",
}
|
||||
|
||||
|
||||
def guess_sex(given: str | None) -> str | None:
    """Best-guess "male"/"female" from the first token of a given name.

    Args:
        given: The given-name field. It may contain several whitespace-separated
            names; only the first one is considered. May be None or empty.

    Returns:
        "male" if the first token is in MALE_NAMES, "female" if it is in
        FEMALE_NAMES, otherwise None (unknown or ambiguous name, or no input).
    """
    # Function-scope import: this data module has no import block of its own.
    import unicodedata

    if not given:
        return None
    # NFC-normalise so decomposed input ("u" + combining diaeresis, as some
    # importers produce) matches dictionary entries such as "günther"/"jürgen",
    # which are assumed to be stored in precomposed form.
    tokens = unicodedata.normalize("NFC", given).split()
    if not tokens:
        # Whitespace-only input.
        return None
    # Strip surrounding punctuation from abbreviations/initials: "wm." -> "wm".
    first = tokens[0].lower().strip(".,'\"")
    if not first:
        return None
    if first in MALE_NAMES:
        return "male"
    if first in FEMALE_NAMES:
        return "female"
    return None
|
||||
@@ -87,6 +87,28 @@ async def test_gender_from_source(client):
|
||||
assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
|
||||
|
||||
|
||||
async def test_guess_gender_from_first_name(client):
    """The guess endpoint proposes only for ungendered people with unambiguous names."""
    h, tid = await _tree(client, "cl-guess@example.com")
    await _person(client, h, tid, "William", "Paul")  # dictionary: male
    await _person(client, h, tid, "Flora", "Reier")  # dictionary: female
    await _person(client, h, tid, "Marion", "Doe")  # ambiguous -> no proposal
    # A person whose gender is already set must be left alone even if guessable.
    gendered = await _person(client, h, tid, "James", "Known")
    await client.patch(
        f"/api/v1/trees/{tid}/persons/{gendered}", json={"gender": "male"}, headers=h
    )

    guess_resp = await client.get(f"/api/v1/trees/{tid}/cleanup/gender/guess", headers=h)
    proposals = guess_resp.json()
    proposed_by_name = {row["name"]: row["proposed_gender"] for row in proposals}
    assert proposed_by_name == {"William Paul": "male", "Flora Reier": "female"}

    # Feed every proposal straight back into the apply endpoint.
    updates = [
        {"person_id": row["person_id"], "gender": row["proposed_gender"]}
        for row in proposals
    ]
    apply_resp = await client.post(
        f"/api/v1/trees/{tid}/cleanup/gender", json={"updates": updates}, headers=h
    )
    assert apply_resp.status_code == 200
    assert apply_resp.json()["updated"] == 2
|
||||
|
||||
|
||||
async def test_name_issues_preview_and_fix(client):
|
||||
h, tid = await _tree(client, "cl-name@example.com")
|
||||
# surname got a date; real surname landed in the given name.
|
||||
|
||||
@@ -77,6 +77,15 @@ export default function CleanupPage() {
|
||||
setGenSel(new Set(data.map((g) => g.person_id)));
|
||||
}
|
||||
}
|
||||
// Ask the backend for dictionary-based sex guesses and preselect every proposal.
async function guessGender() {
  setGenMsg(null);
  const response = await api.GET("/api/v1/trees/{tree_id}/cleanup/gender/guess", {
    params: { path: { tree_id: treeId } },
  });
  // A missing payload is treated the same as "no proposals".
  const proposals = response.data ?? [];
  setGender(proposals);
  setGenSel(new Set(proposals.map((proposal) => proposal.person_id)));
}
|
||||
|
||||
async function applyGender() {
|
||||
const updates = (gender ?? [])
|
||||
.filter((g) => genSel.has(g.person_id))
|
||||
@@ -205,9 +214,18 @@ export default function CleanupPage() {
|
||||
onChange={previewGender}
|
||||
className="hidden"
|
||||
/>
|
||||
<Button variant="outline" onClick={() => genFile.current?.click()}>
|
||||
Choose source GEDCOM
|
||||
</Button>
|
||||
<div className="flex flex-wrap gap-2">
|
||||
<Button variant="outline" onClick={() => genFile.current?.click()}>
|
||||
Choose source GEDCOM
|
||||
</Button>
|
||||
<Button variant="outline" onClick={guessGender}>
|
||||
Guess from first name
|
||||
</Button>
|
||||
</div>
|
||||
<p className="text-xs text-[var(--muted)]">
|
||||
“Guess from first name” uses a built-in name dictionary for people with no sex set;
|
||||
ambiguous names (Marion, Frances, …) are left for you to decide.
|
||||
</p>
|
||||
{genMsg && <p className="text-sm text-bronze">{genMsg}</p>}
|
||||
{gender && (
|
||||
<div className="space-y-2">
|
||||
|
||||
Vendored
+51
@@ -714,6 +714,26 @@ export interface paths {
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/v1/trees/{tree_id}/cleanup/gender/guess": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path?: never;
|
||||
cookie?: never;
|
||||
};
|
||||
/**
|
||||
* Guess Gender
|
||||
* @description Best-guess sex from first names (bundled dictionary) for people missing it.
|
||||
*/
|
||||
get: operations["guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get"];
|
||||
put?: never;
|
||||
post?: never;
|
||||
delete?: never;
|
||||
options?: never;
|
||||
head?: never;
|
||||
patch?: never;
|
||||
trace?: never;
|
||||
};
|
||||
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
||||
parameters: {
|
||||
query?: never;
|
||||
@@ -3481,6 +3501,37 @@ export interface operations {
|
||||
};
|
||||
};
|
||||
};
|
||||
guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
header?: never;
|
||||
path: {
|
||||
tree_id: string;
|
||||
};
|
||||
cookie?: never;
|
||||
};
|
||||
requestBody?: never;
|
||||
responses: {
|
||||
/** @description Successful Response */
|
||||
200: {
|
||||
headers: {
|
||||
[name: string]: unknown;
|
||||
};
|
||||
content: {
|
||||
"application/json": components["schemas"]["GenderProposal"][];
|
||||
};
|
||||
};
|
||||
/** @description Validation Error */
|
||||
422: {
|
||||
headers: {
|
||||
[name: string]: unknown;
|
||||
};
|
||||
content: {
|
||||
"application/json": components["schemas"]["HTTPValidationError"];
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
apply_gender_api_v1_trees__tree_id__cleanup_gender_post: {
|
||||
parameters: {
|
||||
query?: never;
|
||||
|
||||
@@ -2867,6 +2867,54 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"/api/v1/trees/{tree_id}/cleanup/gender/guess": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"cleanup"
|
||||
],
|
||||
"summary": "Guess Gender",
|
||||
"description": "Best-guess sex from first names (bundled dictionary) for people missing it.",
|
||||
"operationId": "guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "tree_id",
|
||||
"in": "path",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string",
|
||||
"format": "uuid",
|
||||
"title": "Tree Id"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful Response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/GenderProposal"
|
||||
},
|
||||
"title": "Response Guess Gender Api V1 Trees Tree Id Cleanup Gender Guess Get"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"422": {
|
||||
"description": "Validation Error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/HTTPValidationError"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
||||
"post": {
|
||||
"tags": [
|
||||
|
||||
Reference in New Issue
Block a user