Merge pull request 'Cleanup: best-guess sex from first name (offline dictionary)' (#32) from gender-name-guess into main
This commit was merged in pull request #32.
This commit is contained in:
@@ -57,6 +57,16 @@ async def preview_gender(
|
|||||||
return [GenderProposal(**r) for r in rows]
|
return [GenderProposal(**r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{tree_id}/cleanup/gender/guess", response_model=list[GenderProposal])
async def guess_gender(
    tree_id: uuid.UUID, session: SessionDep, current: CurrentUser
) -> list[GenderProposal]:
    """Best-guess sex from first names (bundled dictionary) for people missing it."""
    # NOTE: the docstring above is published as the OpenAPI description of this
    # operation, so changing it also changes the generated client schema.

    # Look the tree up on behalf of the calling user.
    # NOTE(review): presumably this rejects trees the viewer may not see — confirm
    # in tree_service.get_tree.
    target_tree = await tree_service.get_tree(
        session, viewer_id=current.id, tree_id=tree_id
    )

    # The service returns plain dicts; each one is expanded into a GenderProposal.
    proposals = await cleanup_service.guess_gender_by_name(
        session, actor=current, tree=target_tree
    )
    return [GenderProposal(**proposal) for proposal in proposals]
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
|
@router.post("/{tree_id}/cleanup/gender", response_model=CleanupResult)
|
||||||
async def apply_gender(
|
async def apply_gender(
|
||||||
tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
|
tree_id: uuid.UUID, data: GenderApply, session: SessionDep, current: CurrentUser
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from app.models.user import User
|
|||||||
from app.services import gedcom, privacy
|
from app.services import gedcom, privacy
|
||||||
from app.services.audit import record_audit
|
from app.services.audit import record_audit
|
||||||
from app.services.exceptions import Forbidden, NotFound
|
from app.services.exceptions import Forbidden, NotFound
|
||||||
|
from app.services.name_gender_data import guess_sex
|
||||||
|
|
||||||
|
|
||||||
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
|
async def _require_editor(session: AsyncSession, *, actor: User, tree: Tree) -> None:
|
||||||
@@ -160,6 +161,27 @@ async def preview_gender(
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
async def guess_gender_by_name(
    session: AsyncSession, *, actor: User, tree: Tree
) -> list[dict]:
    """Propose a sex for every person in ``tree`` that has none recorded.

    The proposal comes from ``guess_sex`` applied to the given name of the
    person's primary name. People who already have a gender, have no primary
    name, or whose name yields no guess are left out.

    Returns:
        A list of ``{"person_id", "name", "proposed_gender"}`` dicts ordered by
        display name. Nothing is written; this is a preview only.
    """
    # Runs before any data is read.
    # NOTE(review): presumably raises when the actor lacks edit rights — confirm
    # in _require_editor.
    await _require_editor(session, actor=actor, tree=tree)

    primary_names = await _primary_name_by_person(session, tree.id)
    proposals: list[dict] = []
    for person in await _persons(session, tree.id):
        if person.gender:
            # An existing value is never second-guessed.
            continue
        primary = primary_names.get(person.id)
        if primary is None:
            continue
        guess = guess_sex(primary.given)
        if not guess:
            # Unknown or ambiguous first name: leave it for a human.
            continue
        proposals.append(
            {
                "person_id": str(person.id),
                "name": _display(primary),
                "proposed_gender": guess,
            }
        )
    return sorted(proposals, key=lambda row: row["name"])
|
||||||
|
|
||||||
|
|
||||||
async def apply_gender(
|
async def apply_gender(
|
||||||
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
|
session: AsyncSession, *, actor: User, tree: Tree, updates: list[dict]
|
||||||
) -> int:
|
) -> int:
|
||||||
|
|||||||
@@ -0,0 +1,69 @@
|
|||||||
|
"""A curated given-name -> sex lookup for best-guessing a person's sex from
|
||||||
|
their first name. Weighted toward English + German names (this codebase's first
|
||||||
|
real tree is a German-American family). Deterministic and offline — no model
|
||||||
|
needed; the Cleanup tool previews every guess before anything is applied.
|
||||||
|
|
||||||
|
Genuinely ambiguous names (Marion, Frances/Francis, Jordan, Jamie, Robin, Leslie,
|
||||||
|
Dana, …) are intentionally left out of BOTH sets so they aren't guessed — better
|
||||||
|
a human decides those than a coin flip.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Given names the lookup treats as male. Every entry is lower-case, matching
# the lower-cased first token that guess_sex() looks up.
MALE_NAMES: set[str] = {
    # --- English / common US ---
    "james", "john", "robert", "michael", "william", "david",
    "richard", "joseph", "thomas", "charles", "christopher", "daniel",
    "matthew", "anthony", "donald", "mark", "paul", "steven",
    "andrew", "kenneth", "george", "joshua", "kevin", "brian",
    "edward", "ronald", "timothy", "jason", "jeffrey", "gary",
    "ryan", "nicholas", "eric", "stephen", "jacob", "larry",
    "frank", "jonathan", "scott", "raymond", "gregory", "samuel",
    "benjamin", "patrick", "jack", "dennis", "jerry", "alexander",
    "tyler", "henry", "douglas", "peter", "adam", "harold",
    "albert", "arthur", "carl", "ralph", "roy", "eugene",
    "louis", "philip", "bobby", "walter", "willie", "wayne",
    "fred", "howard", "ernest", "earl", "clarence", "leon",
    "leonard", "lewis", "floyd", "leroy", "elmer", "homer",
    "orrin", "josias", "emerson", "dale", "bernard", "vernon",
    "virgil", "wilbur", "russell", "harvey", "herbert", "melvin",
    "lloyd", "marvin", "norman", "stanley",
    # --- German (umlaut names are listed with and without the umlaut) ---
    "hans", "karl", "wilhelm", "friedrich", "heinrich", "otto",
    "hermann", "gustav", "ludwig", "ernst", "fritz", "johann",
    "conrad", "konrad", "reinhold", "rudolf", "rudolph", "gerhard",
    "helmut", "horst", "klaus", "kurt", "dieter", "günther",
    "gunther", "manfred", "siegfried", "hilgard", "christian", "august",
    "wolfgang", "jürgen", "jurgen", "matthias", "lothar", "bruno",
    "gottlieb", "reinhard",
}
|
||||||
|
|
||||||
|
# Given names the lookup treats as female. Every entry is lower-case, matching
# the lower-cased first token that guess_sex() looks up.
#
# "frances" is deliberately NOT listed: the module docstring and the Cleanup
# page both name Frances/Francis as ambiguous names that are left for a human
# to decide, so it must not be guessed here.
FEMALE_NAMES: set[str] = {
    # --- English / common US ---
    "mary", "patricia", "jennifer", "linda", "elizabeth", "barbara",
    "susan", "jessica", "sarah", "karen", "nancy", "lisa",
    "betty", "margaret", "sandra", "ashley", "kimberly", "emily",
    "donna", "michelle", "carol", "amanda", "dorothy", "melissa",
    "deborah", "stephanie", "rebecca", "sharon", "laura", "cynthia",
    "kathleen", "amy", "angela", "shirley", "anna", "ruth",
    "brenda", "pamela", "nicole", "katherine", "virginia", "catherine",
    "helen", "debra", "rachel", "carolyn", "janet", "maria",
    "heather", "diane", "julie", "joyce", "victoria", "kelly",
    "christina", "joan", "evelyn", "judith", "megan", "alice",
    "marie", "florence", "flora", "zella", "thelma", "ellen",
    "althea", "della", "beatrice", "pauline", "hedwig", "florentine",
    "wilhelmina", "augusta", "bertha", "gladys", "mildred", "lucille",
    "edith", "esther", "irene", "hazel", "doris", "rose",
    "rita", "norma", "june", "lois", "marjorie",
    # --- German ---
    "greta", "ilse", "ursula", "gertrud", "gertrude", "frieda",
    "frida", "else", "hilda", "hilde", "hildegard", "ingrid",
    "helga", "renate", "monika", "sieglinde", "brigitte", "gisela",
    "elke", "anneliese", "waltraud", "edeltraud", "johanna", "katharina",
    "margarethe", "wilhelmine", "emilie", "auguste",
}
|
||||||
|
|
||||||
|
|
||||||
|
def guess_sex(given: str | None) -> str | None:
    """Best-guess "male"/"female" from the first token of a given name.

    Args:
        given: The given name(s) as stored, e.g. ``"Wilhelm Friedrich"``.
            ``None``, empty and whitespace-only values are accepted.

    Returns:
        ``"male"`` if the first whitespace-separated token is in
        ``MALE_NAMES``, ``"female"`` if it is in ``FEMALE_NAMES``, otherwise
        ``None`` (unknown or ambiguous). The token is compared lower-cased and
        with any surrounding ``.``, ``,``, ``'`` or ``"`` removed.
    """
    # Imported here because this module has no top-level import block.
    import unicodedata

    if not given:
        return None

    # Compose to NFC before comparing: the dictionary entries use precomposed
    # letters ("ü"), so a decomposed spelling ("u" + combining diaeresis) of
    # e.g. "Günther" would otherwise never match.
    tokens = unicodedata.normalize("NFC", given).split()
    if not tokens:
        # Whitespace-only input.
        return None

    # Drop surrounding punctuation so abbreviations such as "wm." become "wm"
    # and quoted names such as '"hans"' become "hans".
    first = tokens[0].lower().strip(".,'\"")
    if not first:
        return None

    if first in MALE_NAMES:
        return "male"
    if first in FEMALE_NAMES:
        return "female"
    return None
|
||||||
@@ -87,6 +87,28 @@ async def test_gender_from_source(client):
|
|||||||
assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
|
assert genders["Josias Moody"] == "male" and genders["Flora Paul"] == "female"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_guess_gender_from_first_name(client):
    """Guessing proposes a sex only for ungendered people with unambiguous names."""
    h, tid = await _tree(client, "cl-guess@example.com")

    # William -> male, Flora -> female, Marion -> ambiguous, so no proposal.
    for given, surname in (("William", "Paul"), ("Flora", "Reier"), ("Marion", "Doe")):
        await _person(client, h, tid, given, surname)

    # "James" is guessable, but this person already has a gender and so must
    # not show up in the proposals.
    gendered = await _person(client, h, tid, "James", "Known")
    await client.patch(
        f"/api/v1/trees/{tid}/persons/{gendered}", json={"gender": "male"}, headers=h
    )

    guess_response = await client.get(
        f"/api/v1/trees/{tid}/cleanup/gender/guess", headers=h
    )
    proposals = guess_response.json()
    proposed_by_name = {row["name"]: row["proposed_gender"] for row in proposals}
    assert proposed_by_name == {"William Paul": "male", "Flora Reier": "female"}

    # Feed the preview straight back into the apply endpoint.
    updates = [
        {"person_id": row["person_id"], "gender": row["proposed_gender"]}
        for row in proposals
    ]
    apply_response = await client.post(
        f"/api/v1/trees/{tid}/cleanup/gender", json={"updates": updates}, headers=h
    )
    assert apply_response.status_code == 200
    assert apply_response.json()["updated"] == 2
|
||||||
|
|
||||||
|
|
||||||
async def test_name_issues_preview_and_fix(client):
|
async def test_name_issues_preview_and_fix(client):
|
||||||
h, tid = await _tree(client, "cl-name@example.com")
|
h, tid = await _tree(client, "cl-name@example.com")
|
||||||
# surname got a date; real surname landed in the given name.
|
# surname got a date; real surname landed in the given name.
|
||||||
|
|||||||
@@ -77,6 +77,15 @@ export default function CleanupPage() {
|
|||||||
setGenSel(new Set(data.map((g) => g.person_id)));
|
setGenSel(new Set(data.map((g) => g.person_id)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Ask the API for dictionary-based sex guesses for this tree, show them as the
// current proposal list, and pre-select every proposed person.
async function guessGender() {
  setGenMsg(null);
  const response = await api.GET("/api/v1/trees/{tree_id}/cleanup/gender/guess", {
    params: { path: { tree_id: treeId } },
  });
  // When the call yields no data, fall back to an empty proposal list.
  const proposals = response.data ?? [];
  setGender(proposals);
  setGenSel(new Set(proposals.map((proposal) => proposal.person_id)));
}
|
||||||
|
|
||||||
async function applyGender() {
|
async function applyGender() {
|
||||||
const updates = (gender ?? [])
|
const updates = (gender ?? [])
|
||||||
.filter((g) => genSel.has(g.person_id))
|
.filter((g) => genSel.has(g.person_id))
|
||||||
@@ -205,9 +214,18 @@ export default function CleanupPage() {
|
|||||||
onChange={previewGender}
|
onChange={previewGender}
|
||||||
className="hidden"
|
className="hidden"
|
||||||
/>
|
/>
|
||||||
<Button variant="outline" onClick={() => genFile.current?.click()}>
|
<div className="flex flex-wrap gap-2">
|
||||||
Choose source GEDCOM
|
<Button variant="outline" onClick={() => genFile.current?.click()}>
|
||||||
</Button>
|
Choose source GEDCOM
|
||||||
|
</Button>
|
||||||
|
<Button variant="outline" onClick={guessGender}>
|
||||||
|
Guess from first name
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
<p className="text-xs text-[var(--muted)]">
|
||||||
|
“Guess from first name” uses a built-in name dictionary for people with no sex set;
|
||||||
|
ambiguous names (Marion, Frances, …) are left for you to decide.
|
||||||
|
</p>
|
||||||
{genMsg && <p className="text-sm text-bronze">{genMsg}</p>}
|
{genMsg && <p className="text-sm text-bronze">{genMsg}</p>}
|
||||||
{gender && (
|
{gender && (
|
||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
|
|||||||
Vendored
+51
@@ -714,6 +714,26 @@ export interface paths {
|
|||||||
patch?: never;
|
patch?: never;
|
||||||
trace?: never;
|
trace?: never;
|
||||||
};
|
};
|
||||||
|
"/api/v1/trees/{tree_id}/cleanup/gender/guess": {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path?: never;
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
/**
|
||||||
|
* Guess Gender
|
||||||
|
* @description Best-guess sex from first names (bundled dictionary) for people missing it.
|
||||||
|
*/
|
||||||
|
get: operations["guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get"];
|
||||||
|
put?: never;
|
||||||
|
post?: never;
|
||||||
|
delete?: never;
|
||||||
|
options?: never;
|
||||||
|
head?: never;
|
||||||
|
patch?: never;
|
||||||
|
trace?: never;
|
||||||
|
};
|
||||||
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
@@ -3481,6 +3501,37 @@ export interface operations {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get: {
|
||||||
|
parameters: {
|
||||||
|
query?: never;
|
||||||
|
header?: never;
|
||||||
|
path: {
|
||||||
|
tree_id: string;
|
||||||
|
};
|
||||||
|
cookie?: never;
|
||||||
|
};
|
||||||
|
requestBody?: never;
|
||||||
|
responses: {
|
||||||
|
/** @description Successful Response */
|
||||||
|
200: {
|
||||||
|
headers: {
|
||||||
|
[name: string]: unknown;
|
||||||
|
};
|
||||||
|
content: {
|
||||||
|
"application/json": components["schemas"]["GenderProposal"][];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
/** @description Validation Error */
|
||||||
|
422: {
|
||||||
|
headers: {
|
||||||
|
[name: string]: unknown;
|
||||||
|
};
|
||||||
|
content: {
|
||||||
|
"application/json": components["schemas"]["HTTPValidationError"];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
apply_gender_api_v1_trees__tree_id__cleanup_gender_post: {
|
apply_gender_api_v1_trees__tree_id__cleanup_gender_post: {
|
||||||
parameters: {
|
parameters: {
|
||||||
query?: never;
|
query?: never;
|
||||||
|
|||||||
@@ -2867,6 +2867,54 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/api/v1/trees/{tree_id}/cleanup/gender/guess": {
|
||||||
|
"get": {
|
||||||
|
"tags": [
|
||||||
|
"cleanup"
|
||||||
|
],
|
||||||
|
"summary": "Guess Gender",
|
||||||
|
"description": "Best-guess sex from first names (bundled dictionary) for people missing it.",
|
||||||
|
"operationId": "guess_gender_api_v1_trees__tree_id__cleanup_gender_guess_get",
|
||||||
|
"parameters": [
|
||||||
|
{
|
||||||
|
"name": "tree_id",
|
||||||
|
"in": "path",
|
||||||
|
"required": true,
|
||||||
|
"schema": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid",
|
||||||
|
"title": "Tree Id"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "Successful Response",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/GenderProposal"
|
||||||
|
},
|
||||||
|
"title": "Response Guess Gender Api V1 Trees Tree Id Cleanup Gender Guess Get"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"422": {
|
||||||
|
"description": "Validation Error",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/HTTPValidationError"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
"/api/v1/trees/{tree_id}/cleanup/gender": {
|
||||||
"post": {
|
"post": {
|
||||||
"tags": [
|
"tags": [
|
||||||
|
|||||||
Reference in New Issue
Block a user