[EDS-606] [EDS-603] Improve query accuracy and efficiency (#120)

* Support toggling experimental features by environment. Defaults to false. * Add experimental feature to toggle search mode for testing the new search. * Add an experimental search type to improve query efficiency - Adds a new experimental search option to allow reviewers to toggle between "classic" and "experimental" search mode. This is not a permanent or customer-facing UX component. - Implements a new search mode that can be selected; only has an effect when searching by name. This new mode reduces request times by about 60% and provides more accurate results. * Add an experimental search type to improve query efficiency - Adds a new experimental search option to allow reviewers to toggle between "classic" and "experimental" search mode. This is not a permanent or customer-facing UX component. - Implements a new search mode that can be selected; only has an effect when searching by name. This new mode reduces request times by about 60% and provides more accurate results. * [Bot] Update version to 2.0.12 Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
UWIT-IAM · Oct 6, 2021 · 3780d16 · 3780d16
1 parent 2ba6a29
commit 3780d16
Show file tree

Hide file tree

Showing 23 changed files with 708 additions and 246 deletions.
diff --git a/docker/husky-directory-base.dockerfile b/docker/husky-directory-base.dockerfile
@@ -1,8 +1,9 @@
 ARG BASE_VERSION=latest
 FROM ghcr.io/uwit-iam/uw-saml-poetry:${BASE_VERSION} as poetry-base
+RUN apt-get update && apt-get -y install gcc
 ARG FINGERPRINT=""
 WORKDIR /app
 COPY poetry.lock pyproject.toml ./
 ENV PATH="$POETRY_HOME/bin:$PATH" \
     UW_HUSKY_DIRECTORY_BASE_FINGERPRINT=${FINGERPRINT}
-RUN poetry install
+RUN poetry install --no-root --no-interaction && apt-get -y remove gcc && apt-get -y autoremove
diff --git a/husky_directory/app_config.py b/husky_directory/app_config.py
@@ -201,6 +201,7 @@ class ApplicationConfig(FlaskConfigurationSettings):
     version: Optional[str] = Field(None, env="HUSKY_DIRECTORY_VERSION")
     start_time: datetime = Field(datetime.now())
     deployment_id: Optional[str] = Field(None, env="DEPLOYMENT_ID")
+    show_experimental: Optional[bool] = Field(None, env="SHOW_EXPERIMENTAL_FEATURES")
 
     # Aggregated Settings
     pws_settings: PWSSettings = PWSSettings()

diff --git a/husky_directory/blueprints/search.py b/husky_directory/blueprints/search.py
@@ -9,6 +9,7 @@
 from werkzeug.exceptions import BadRequest, HTTPException
 from werkzeug.local import LocalProxy
 
+from husky_directory.app_config import ApplicationConfig
 from husky_directory.models.search import (
     DirectoryBaseModel,
     Person,
@@ -33,6 +34,7 @@ class Config(DirectoryBaseModel.Config):
     error: Optional[ErrorModel]
     status_code: int = 200
     uwnetid: Optional[str] = None
+    show_experimental: bool = False
 
 
 @singleton
@@ -54,8 +56,10 @@ def __init__(self, logger: Logger, injector: Injector):
         )
 
     @staticmethod
-    def index(session: LocalProxy):
-        context = RenderingContext(uwnetid=session.get("uwnetid"))
+    def index(session: LocalProxy, settings: ApplicationConfig):
+        context = RenderingContext(
+            uwnetid=session.get("uwnetid"), show_experimental=settings.show_experimental
+        )
         return (
             render_template("views/index.html", **context.dict(exclude_none=True)),
             200,
@@ -124,9 +128,12 @@ def search_listing(
         service: DirectorySearchService,
         logger: Logger,
         session: LocalProxy,
+        settings: ApplicationConfig,
     ):
-        context = RenderingContext.construct()
-        context.uwnetid = session.get("uwnetid")
+        context = RenderingContext.construct(
+            uwnetid=session.get("uwnetid"),
+            show_experimental=settings.show_experimental,
+        )
         try:
             form_input = SearchDirectoryFormInput.parse_obj(request.form)
             context.request_input = form_input

diff --git a/husky_directory/models/enum.py b/husky_directory/models/enum.py
@@ -24,3 +24,13 @@ class PopulationType(Enum):
 class ResultDetail(Enum):
     full = "full"
     summary = "summary"
+
+
+class SearchType(Enum):
+    # Old-school cool. It's inefficient but
+    # very explicit.
+    classic = "classic"
+
+    # In active development, constantly striving to
+    # provide more relevant results, faster!
+    experimental = "experimental"
diff --git a/husky_directory/models/pws.py b/husky_directory/models/pws.py
@@ -6,15 +6,31 @@
 """
 from __future__ import annotations
 
+import re
 from typing import Any, Dict, List, NoReturn, Optional
 
 from inflection import humanize
 from pydantic import BaseModel, Extra, Field, validator
 
+from .base import DirectoryBaseModel
 from .common import RecordConstraint, UWDepartmentRole
 from .enum import AffiliationState
 from ..util import camelize
 
+_can_humanize_expr = re.compile("^[a-zA-Z]+$")
+
+
+def humanize_(val: str) -> str:
+    """
+
+    Don't use the humanize function for names with punctuation,
+    as humanize may make them worse. For instance "Anne-marie"
+    instead of "Anne-Marie".
+    """
+    if re.fullmatch(_can_humanize_expr, val):
+        return humanize(val)
+    return val
+
 
 class PWSBaseModel(BaseModel):
     class Config:
@@ -173,6 +189,18 @@ class NamedIdentity(PWSBaseModel):
     preferred_middle_name: Optional[str]
     preferred_last_name: Optional[str] = Field(None, alias="PreferredSurname")
 
+    # These next fields are calculated on instantiation
+    # to make it easier to work with these names in
+    # meaningful ways. They are listed as optional
+    # because they aren't required to create the object,
+    # but they will always be populated during creation.
+    displayed_surname: Optional[str]
+    displayed_first_name: Optional[str]
+    displayed_middle_name: Optional[str]
+    name_tokens: List[str] = []
+    canonical_tokens: List[str] = []
+    sort_key: Optional[str]
+
     @validator(
         "display_name",
         "registered_name",
@@ -181,19 +209,123 @@ class NamedIdentity(PWSBaseModel):
         "preferred_first_name",
         "preferred_middle_name",
         "preferred_last_name",
+        always=True,
     )
-    def humanize_name_field(cls, value: Optional[str]) -> Optional[str]:
+    def sanitize_name_fields(cls, value: Optional[str]) -> Optional[str]:
         if value:
-            return " ".join(humanize(v) for v in value.split())
+            return " ".join(humanize_(v) for v in value.split())
         return value
 
-    def get_displayed_surname(self):
-        display_name_tokens = self.display_name.split()
-        if self.preferred_last_name and self.preferred_last_name in display_name_tokens:
-            return self.preferred_last_name
-        elif self.registered_surname in display_name_tokens:
-            return self.registered_surname
-        return self.display_name.split()[-1]
+    @validator("displayed_surname", always=True)
+    def populate_displayed_surname(cls, v: Any, values: Dict):
+        """
+        Returns the canonical surname for the identity, if there is one;
+        otherwise returns the last token in the user's display name.
+        """
+        display_name = values.get("display_name")
+        preferred_last_name = values.get("preferred_last_name")
+        registered_surname = values.get("registered_surname")
+
+        if preferred_last_name and preferred_last_name in display_name:
+            return preferred_last_name
+        elif registered_surname and registered_surname in display_name:
+            return registered_surname
+
+        # This should only happen if we have dirty data.
+        # If nothing makes sense, we'll just assume the
+        # default of a one-token surname.
+        return display_name.split()[-1]
+
+    @validator("displayed_first_name", always=True)
+    def populate_displayed_first_name(cls, v: Any, values: Dict):
+        """
+        Returns the canonical first name for the identity, if there is one;
+        otherwise returns the first token in the user's display name.
+        """
+        display_name = values.get("display_name")
+        last_name = values.get("displayed_surname")
+        last_name_index = display_name.index(last_name)
+        first_middle = display_name[:last_name_index]
+        preferred_first_name = values.get("preferred_first_name")
+        registered_first_middle = values.get("registered_first_middle_name")
+
+        if preferred_first_name and preferred_first_name in first_middle:
+            return preferred_first_name
+        elif registered_first_middle and registered_first_middle in first_middle:
+            return first_middle.strip()
+
+        # This should only happen if we have dirty data.
+        # If nothing makes sense, we'll just assume the
+        # default of a one-token name.
+        return display_name.split()[0]
+
+    @validator("displayed_middle_name", always=True)
+    def populate_displayed_middle_name(cls, v: Any, values: Dict):
+        """
+        Returns the canonical middle name for the identity, if
+        they have set a preferred middle name. Otherwise, returns
+        whatever part of the name is not the first name and is not the last name.
+        """
+        preferred_middle_name = values.get("preferred_middle_name")
+        registered_first_middle_name = values.get("registered_first_middle_name")
+        displayed_first_name = values.get("displayed_first_name")
+        displayed_surname = values.get("displayed_surname")
+        display_name = values.get("display_name")
+
+        if preferred_middle_name and preferred_middle_name in display_name:
+            return preferred_middle_name
+
+        splice_index = len(displayed_first_name) + 1
+
+        if (
+            registered_first_middle_name
+            and displayed_first_name in registered_first_middle_name
+        ):
+            middle_name = registered_first_middle_name[splice_index:]
+        else:
+            surname_index = display_name.index(displayed_surname)
+            middle_name = display_name[splice_index:surname_index]
+
+        if middle_name and middle_name in display_name:
+            return middle_name
+        return None
+
+    @validator("name_tokens", always=True)
+    def populate_name_tokens(cls, v: Any, values: Dict):
+        """
+        Populates a name tokens field to be used for processing,
+        grouping together tokens that are multi-token name pieces.
+        For instance "Ana Mari Cauce" will return ["Ana Mari", "Cauce"]
+        """
+        return [
+            values.get(field)
+            for field in (
+                "displayed_first_name",
+                "displayed_middle_name",
+                "displayed_surname",
+            )
+            if values.get(field)
+        ]
+
+    @validator("canonical_tokens", always=True)
+    def populate_sorted_name_tokens(cls, v: Any, values: Dict):
+        tokens = values.get("name_tokens")
+        result = [tokens[0]]
+        if len(tokens) > 1:
+            result.insert(0, tokens[-1])
+            if len(tokens) > 2:
+                result.extend(tokens[1:-1])
+        return result
+
+    @validator("sort_key", always=True)
+    def populate_sort_key(cls, v: Any, values: Dict):
+        """
+        Pre-calculates the sort key for the display name,
+        using the grouped name tokens.
+
+        "Ana Mari Cauce" becomes "Cauce Ana Mari"
+        """
+        return " ".join(values.get("canonical_tokens"))
 
 
 class PersonOutput(NamedIdentity):
@@ -248,3 +380,28 @@ class ListPersonsOutput(ListResponsesOutputWrapper):
     )
     previous: Optional[ListPersonsInput] = Field(None, description="See `next`")
     request_statistics: Optional[ListPersonsRequestStatistics]
+
+
+class ResultBucket(DirectoryBaseModel):
+    description: str
+    students: List[PersonOutput] = []
+    employees: List[PersonOutput] = []
+
+    # The relevance is an index value to help sort the
+    # buckets themselves. The lower the value, the closer
+    # to the beginning of a list of buckets this bucket will be.
+    relevance: int = 0
+
+    def add_person(self, pws_person: PersonOutput) -> NoReturn:
+        if pws_person.affiliations.employee:
+            self.employees.append(pws_person)
+        if pws_person.affiliations.student:
+            self.students.append(pws_person)
+
+    @property
+    def sorted_students(self) -> List[PersonOutput]:
+        return sorted(self.students, key=lambda p: p.sort_key)
+
+    @property
+    def sorted_employees(self) -> List[PersonOutput]:
+        return sorted(self.employees, key=lambda p: p.sort_key)
diff --git a/husky_directory/models/search.py b/husky_directory/models/search.py
@@ -11,7 +11,7 @@
 
 from husky_directory.models.base import DirectoryBaseModel
 from husky_directory.models.common import UWDepartmentRole
-from husky_directory.models.enum import PopulationType, ResultDetail
+from husky_directory.models.enum import PopulationType, ResultDetail, SearchType
 
 # There is no direct dependency on the model here, we only need it for type checking;
 # this protects us from accidentally creating a cyclic dependency between the modules.
@@ -59,6 +59,7 @@ class SearchDirectoryFormInput(DirectoryBaseModel):
     render_query: Optional[str]
     render_population: Optional[PopulationType]
     render_length: Optional[ResultDetail]
+    search_type: SearchType = SearchType.classic
 
     include_test_identities: bool = False  # Not currently supported
 
@@ -135,6 +136,7 @@ class SearchDirectoryInput(DirectoryBaseModel):
     population: PopulationType = PopulationType.employees
     include_test_identities: bool = False
     person_href: Optional[str]
+    search_type: SearchType = SearchType.classic
 
     @classmethod
     def search_methods(cls) -> List[str]:
@@ -210,6 +212,7 @@ class Person(DirectoryBaseModel):
     box_number: Optional[str]
     departments: List[UWDepartmentRole] = []
     sort_key: Optional[str]
+    populations: List[PopulationType] = []
 
     href: str