Skip to content

Commit

Permalink
[IAMTEAM-111] Fix users' display name casing being mangled during lex…
Browse files Browse the repository at this point in the history
…ical analysis (#127)

* [IAMTEAM-111] Fix users' display name casing being mangled during lexical analysis

* [Bot] Update version to 2.1.5

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
Thomas Thorogood and github-actions[bot] authored Jan 7, 2022
1 parent 5326bb0 commit 5b4b102
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 209 deletions.
182 changes: 1 addition & 181 deletions husky_directory/models/pws.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,14 @@
"""
from __future__ import annotations

import re
from typing import Any, Dict, List, NoReturn, Optional

from inflection import humanize
from pydantic import BaseModel, Extra, Field, validator

from .base import DirectoryBaseModel
from .common import RecordConstraint, UWDepartmentRole
from .enum import AffiliationState
from ..util import camelize

_can_humanize_expr = re.compile("^[a-zA-Z]+$")


def humanize_(val: str) -> str:
    """
    Humanize a value only when it consists solely of ASCII letters.

    Anything else (punctuation, digits, whitespace, the empty string)
    is handed back untouched, because humanizing may make it worse:
    "Anne-Marie" would come back as "Anne-marie".
    """
    purely_alphabetic = _can_humanize_expr.fullmatch(val) is not None
    return humanize(val) if purely_alphabetic else val


class PWSBaseModel(BaseModel):
class Config:
Expand Down Expand Up @@ -157,7 +140,7 @@ class StudentDirectoryListing(PWSBaseModel):
publish_in_directory: bool
phone: Optional[str]
email: Optional[str]
# "class" is a reserved keyword, so we have to name this field somethign else.
# "class" is a reserved keyword, so we have to name this field something else.
class_level: Optional[str] = Field(default=None, alias="Class")
departments: List[str] = []

Expand Down Expand Up @@ -189,144 +172,6 @@ class NamedIdentity(PWSBaseModel):
preferred_middle_name: Optional[str]
preferred_last_name: Optional[str] = Field(None, alias="PreferredSurname")

# These next fields are calculated on instantiation
# to make it easier to work with these names in
# meaningful ways. They are listed as optional
# because they aren't required to create the object,
# but they will always be populated during creation.
displayed_surname: Optional[str]
displayed_first_name: Optional[str]
displayed_middle_name: Optional[str]
name_tokens: List[str] = []
canonical_tokens: List[str] = []
sort_key: Optional[str]

@validator(
    "display_name",
    "registered_name",
    "registered_first_middle_name",
    "registered_surname",
    "preferred_first_name",
    "preferred_middle_name",
    "preferred_last_name",
    always=True,
)
def sanitize_name_fields(cls, value: Optional[str]) -> Optional[str]:
    """
    Normalize a raw name field.

    The value is split on whitespace, each piece is passed through
    humanize_, and the pieces are re-joined with single spaces.
    Falsy values (None or an empty string) are returned as they came in.
    """
    if not value:
        return value
    cleaned_pieces = [humanize_(piece) for piece in value.split()]
    return " ".join(cleaned_pieces)

@validator("displayed_surname", always=True)
def populate_displayed_surname(cls, v: Any, values: Dict):
    """
    Work out which surname the display name is actually showing.

    The preferred last name wins if it occurs in the display name,
    then the registered surname under the same condition. When neither
    applies, the final whitespace-separated token of the display name
    is used.
    """
    full_name = values.get("display_name")

    # Candidates are tried in priority order; the first one that is
    # set and appears inside the display name is the answer.
    for field_name in ("preferred_last_name", "registered_surname"):
        candidate = values.get(field_name)
        if candidate and candidate in full_name:
            return candidate

    # Only reached with dirty data: fall back to assuming a
    # one-token surname at the end of the display name.
    return full_name.split()[-1]

@validator("displayed_first_name", always=True)
def populate_displayed_first_name(cls, v: Any, values: Dict):
    """
    Returns the canonical first name for the identity, if there is one;
    otherwise returns the first token in the user's display name.

    The portion of the display name that precedes the displayed surname
    is examined:
      - if the preferred first name occurs in it, the preferred first
        name is returned;
      - else if the registered first/middle name occurs in it, that
        whole leading portion (stripped) is returned;
      - else the first whitespace-separated token of the display name.
    """
    display_name = values.get("display_name")
    last_name = values.get("displayed_surname")
    # Search from the right: the slice below treats everything before
    # the surname as first/middle names, so we want the LAST occurrence.
    # Using the first occurrence breaks names whose surname text also
    # appears earlier, e.g. "Lee Ann Lee" or "Mary Ann An".
    last_name_index = display_name.rindex(last_name)
    first_middle = display_name[:last_name_index]
    preferred_first_name = values.get("preferred_first_name")
    registered_first_middle = values.get("registered_first_middle_name")

    if preferred_first_name and preferred_first_name in first_middle:
        return preferred_first_name
    elif registered_first_middle and registered_first_middle in first_middle:
        return first_middle.strip()

    # This should only happen if we have dirty data.
    # If nothing makes sense, we'll just assume the
    # default of a one-token name.
    return display_name.split()[0]

@validator("displayed_middle_name", always=True)
def populate_displayed_middle_name(cls, v: Any, values: Dict):
    """
    Returns the canonical middle name for the identity, if
    they have set a preferred middle name. Otherwise, returns
    whatever part of the name is not the first name and is not the last name.

    Returns None when no non-empty middle portion can be found in the
    display name.
    """
    preferred_middle_name = values.get("preferred_middle_name")
    registered_first_middle_name = values.get("registered_first_middle_name")
    displayed_first_name = values.get("displayed_first_name")
    displayed_surname = values.get("displayed_surname")
    display_name = values.get("display_name")

    if preferred_middle_name and preferred_middle_name in display_name:
        return preferred_middle_name

    # Skip past the first name and the single space that follows it.
    splice_index = len(displayed_first_name) + 1

    if (
        registered_first_middle_name
        and displayed_first_name in registered_first_middle_name
    ):
        middle_name = registered_first_middle_name[splice_index:]
    else:
        # Use the last occurrence of the surname so that a surname whose
        # text also appears earlier in the name does not cut the middle
        # portion short.
        surname_index = display_name.rindex(displayed_surname)
        middle_name = display_name[splice_index:surname_index]

    # The slice taken from the display name still carries the space that
    # separates it from the surname; drop it so it does not leak into
    # the name tokens and the sort key.
    middle_name = middle_name.strip()

    if middle_name and middle_name in display_name:
        return middle_name
    return None

@validator("name_tokens", always=True)
def populate_name_tokens(cls, v: Any, values: Dict):
    """
    Build the list of name pieces used for later processing.

    The displayed first, middle and last names are collected in that
    order, skipping any that are unset or empty. Each piece is kept
    whole, so a multi-word piece stays a single token: for instance
    "Ana Mari Cauce" will return ["Ana Mari", "Cauce"].
    """
    ordered_fields = (
        "displayed_first_name",
        "displayed_middle_name",
        "displayed_surname",
    )
    pieces = []
    for field_name in ordered_fields:
        piece = values.get(field_name)
        if piece:
            pieces.append(piece)
    return pieces

@validator("canonical_tokens", always=True)
def populate_sorted_name_tokens(cls, v: Any, values: Dict):
    """
    Reorder the name tokens into sorting order: last token first,
    then the first token, then everything that was in between.

    ["Ana", "Mari", "Cauce"] becomes ["Cauce", "Ana", "Mari"].
    A single token is returned as-is, and a missing or empty token
    list yields an empty list instead of raising IndexError.
    """
    tokens = values.get("name_tokens") or []
    if not tokens:
        return []

    first, *remainder = tokens
    if not remainder:
        return [first]

    *middle, last = remainder
    return [last, first, *middle]

@validator("sort_key", always=True)
def populate_sort_key(cls, v: Any, values: Dict):
    """
    Compute the sort key for the display name ahead of time by
    joining the canonical (sort-ordered) tokens with single spaces.
    "Ana Mari Cauce" becomes "Cauce Ana Mari"
    """
    ordered_tokens = values.get("canonical_tokens")
    return " ".join(ordered_tokens)


class PersonOutput(NamedIdentity):
affiliations: PersonAffiliations = Field(
Expand Down Expand Up @@ -380,28 +225,3 @@ class ListPersonsOutput(ListResponsesOutputWrapper):
)
previous: Optional[ListPersonsInput] = Field(None, description="See `next`")
request_statistics: Optional[ListPersonsRequestStatistics]


class ResultBucket(DirectoryBaseModel):
    """
    A described group of people, split into students and employees.
    """

    description: str
    students: List[PersonOutput] = []
    employees: List[PersonOutput] = []

    # The relevance is an index value to help sort the
    # buckets themselves. The lower the value, the closer
    # to the beginning of a list of buckets this bucket will be.
    relevance: int = 0

    def add_person(self, pws_person: PersonOutput) -> None:
        """
        File the person under every population they are affiliated with.

        A person with both an employee and a student affiliation is
        appended to both lists; a person with neither is not added.
        """
        if pws_person.affiliations.employee:
            self.employees.append(pws_person)
        if pws_person.affiliations.student:
            self.students.append(pws_person)

    @property
    def sorted_students(self) -> List[PersonOutput]:
        """The students as a new list ordered by each person's sort_key."""
        return sorted(self.students, key=lambda p: p.sort_key)

    @property
    def sorted_employees(self) -> List[PersonOutput]:
        """The employees as a new list ordered by each person's sort_key."""
        return sorted(self.employees, key=lambda p: p.sort_key)
30 changes: 30 additions & 0 deletions husky_directory/models/transforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import List, NoReturn

from husky_directory.models.base import DirectoryBaseModel
from husky_directory.models.pws import PersonOutput
from husky_directory.services.name_analyzer import NameAnalyzer


class ResultBucket(DirectoryBaseModel):
    """
    A described group of people, split into students and employees.
    """

    description: str
    students: List[PersonOutput] = []
    employees: List[PersonOutput] = []

    # The relevance is an index value to help sort the
    # buckets themselves. The lower the value, the closer
    # to the beginning of a list of buckets this bucket will be.
    relevance: int = 0

    def add_person(self, pws_person: PersonOutput) -> None:
        """
        File the person under every population they are affiliated with.

        A person with both an employee and a student affiliation is
        appended to both lists; a person with neither is not added.
        """
        if pws_person.affiliations.employee:
            self.employees.append(pws_person)
        if pws_person.affiliations.student:
            self.students.append(pws_person)

    @property
    def sorted_students(self) -> List[PersonOutput]:
        """
        The students as a new list, ordered by the sort_key that
        NameAnalyzer reports for each person.
        """
        return sorted(self.students, key=lambda p: NameAnalyzer(p).sort_key)

    @property
    def sorted_employees(self) -> List[PersonOutput]:
        """
        The employees as a new list, ordered by the sort_key that
        NameAnalyzer reports for each person.
        """
        return sorted(self.employees, key=lambda p: NameAnalyzer(p).sort_key)
Loading

0 comments on commit 5b4b102

Please sign in to comment.