specific list of characters we convert and remove unneeded tests

praekeltfoundation · Feb 18, 2025 · 3b890da · 3b890da
1 parent 3206672
commit 3b890da
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 54 deletions.
diff --git a/home/import_assessments.py b/home/import_assessments.py
@@ -13,7 +13,6 @@
 
 from home.import_helpers import (
     ImportException,
-    convert_headers_to_snake_case,
     validate_using_form,
 )
 from home.import_helpers import (
@@ -116,17 +115,9 @@ def parse_file(self) -> list["AssessmentRow"]:
         rows = [row for _, row in row_iterator]
 
         original_headers = rows[0].keys()
-        headers_mapping = convert_headers_to_snake_case(list(original_headers))
-        snake_case_headers = list(headers_mapping.values())
-        self.validate_headers(snake_case_headers, row_num=1)
-        transformed_rows = [
-            {headers_mapping[key]: value for key, value in row.items()} for row in rows
-        ]
+        self.validate_headers(list(original_headers), row_num=1)
 
-        return [
-            AssessmentRow.from_flat(row, i + 2)
-            for i, row in enumerate(transformed_rows)
-        ]
+        return [AssessmentRow.from_flat(row, i + 2) for i, row in enumerate(rows)]
 
     def set_progress(self, message: str, progress: int) -> None:
         self.progress_queue.put_nowait(progress)

diff --git a/home/import_helpers.py b/home/import_helpers.py
@@ -25,6 +25,45 @@
 
 from .xlsx_helpers import get_active_sheet
 
+TYPO_KEYWORDS = [
+    "question type",
+    "question-type",
+    "high result page",
+    "high-result-page",
+    "high inflection",
+    "high-inflection",
+    "medium result page",
+    "medium-result-page",
+    "medium inflection",
+    "medium-inflection",
+    "low result page",
+    "low-result-page",
+    "skip threshold",
+    "skip-threshold",
+    "skip high result page",
+    "skip-high-result-page",
+    "generic error",
+    "generic_error",
+    "answer semantic ids",
+    "answer-semantic-id",
+    "question semantic id",
+    "question-semantic-id",
+    "answer responses",
+    "answer-responses",
+]
+"""
+Header spellings that are accepted as variants of a snake_case field name.
+
+Users commonly write CMS-Forms headers with spaces or hyphens instead of
+underscores. A normalised header that exactly matches an entry in this list is
+converted to snake_case on import; every other header is left unchanged.
+
+Variants of headers from Content Pages and other importers that need the same
+correction should be appended to this list so that all imports behave consistently.
+Contentset headers use Pascal casing, so the application may need to change before
+those variants can be included here.
+"""
+
 
 class ImportException(Exception):
     """
@@ -192,29 +231,39 @@ def to_snake_case(s: str) -> str:
 
 def fix_rows(rows: Iterator[dict[str | Any, Any]]) -> Iterator[dict[str, str | None]]:
     """
-    Fix keys for all rows by lowercasing keys and removing whitespace from keys and values
+    Fix keys for all rows by lowercasing keys, converting keys that match an entry
+    in TYPO_KEYWORDS to snake_case, and removing whitespace from keys and values.
     """
+
     try:
         first_row = next(rows)
     except StopIteration:
         return iter([])
 
-    if len(first_row) != len(fix_row(first_row)):
+    if len(first_row) != len(fix_row(first_row, TYPO_KEYWORDS)):
         raise ImportException(
             "Invalid format. Please check that there are no duplicate headers."
         )
-    yield fix_row(first_row)
+    yield fix_row(first_row, TYPO_KEYWORDS)
 
     for row in rows:
-        yield fix_row(row)
+        yield fix_row(row, TYPO_KEYWORDS)
 
 
-def fix_row(row: dict[str, str | None]) -> dict[str, str | None]:
+def fix_row(row: dict[str, str | None], keywords: list[str]) -> dict[str, str | None]:
     """
-    Fix a single row by lowercasing the key and removing whitespace from the key and value
+    Fix a single row by lowercasing the key, converting it to snake_case
+    if it matches an entry in keywords, and removing whitespace from the key and value.
     """
     try:
-        return {_normalise_key(k): _normalise_value(v) for k, v in row.items()}
+        return {
+            (
+                to_snake_case(_normalise_key(k))
+                if _normalise_key(k) in keywords
+                else _normalise_key(k)
+            ): _normalise_value(v)
+            for k, v in row.items()
+        }
     except AttributeError:
         raise ImportException(
             "Invalid format. Please check that all row values have headers."

diff --git a/home/tests/import-export-data/assessments_missing_generic_error.csv b/home/tests/import-export-data/assessments_missing_generic_error.csv
diff --git a/home/tests/import-export-data/assessments_missing_locale.csv b/home/tests/import-export-data/assessments_missing_locale.csv
diff --git a/home/tests/import-export-data/broken_assessment.csv b/home/tests/import-export-data/broken_assessment.csv
@@ -1,2 +1,2 @@
 this,is,not,a,valid,content,csv
-"For real, it's totally not.",,,,,,
+"For real, it's totally not."
diff --git a/home/tests/test_assessment_import_export.py b/home/tests/test_assessment_import_export.py
@@ -678,29 +678,6 @@ def test_missing_title(self, csv_impexp: ImportExport) -> None:
         assert e.value.message == "The import file is missing required fields: title"
         assert e.value.row_num == 4
 
-    def test_missing_locale(self, csv_impexp: ImportExport) -> None:
-        """
-        Importing a CSV with a missing locale field should return an error
-        that a locale is mmissing
-        """
-        with pytest.raises(ImportAssessmentException) as e:
-            csv_impexp.import_file("assessments_missing_locale.csv")
-        assert e.value.message == "The import file is missing required fields: locale"
-        assert e.value.row_num == 5
-
-    def test_missing_generic_error(self, csv_impexp: ImportExport) -> None:
-        """
-        Importing a CSV with a missing generic error field should return an error
-        that a generic error is mmissing
-        """
-        with pytest.raises(ImportAssessmentException) as e:
-            csv_impexp.import_file("assessments_missing_generic_error.csv")
-        assert (
-            e.value.message
-            == "The import file is missing required fields: generic_error"
-        )
-        assert e.value.row_num == 2
-
     def test_empty_rows(self, csv_impexp: ImportExport) -> None:
         """
         Importing an empty CSV should return an error that the