Add check for invalid characters in headers, an exception and a test to handle it
praekeltfoundation · Feb 18, 2025 · cc507cd · cc507cd
1 parent 3b890da
commit cc507cd
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 62 deletions.
diff --git a/home/import_assessments.py b/home/import_assessments.py
@@ -13,6 +13,7 @@
 
 from home.import_helpers import (
     ImportException,
+    convert_headers_to_snake_case,
     validate_using_form,
 )
 from home.import_helpers import (
@@ -115,9 +116,19 @@ def parse_file(self) -> list["AssessmentRow"]:
         rows = [row for _, row in row_iterator]
 
         original_headers = rows[0].keys()
-        self.validate_headers(list(original_headers), row_num=1)
+        headers_mapping = convert_headers_to_snake_case(
+            list(original_headers), row_num=1
+        )
+        snake_case_headers = list(headers_mapping.values())
+        self.validate_headers(snake_case_headers, row_num=1)
+        transformed_rows = [
+            {headers_mapping[key]: value for key, value in row.items()} for row in rows
+        ]
 
-        return [AssessmentRow.from_flat(row, i + 2) for i, row in enumerate(rows)]
+        return [
+            AssessmentRow.from_flat(row, i + 2)
+            for i, row in enumerate(transformed_rows)
+        ]
 
     def set_progress(self, message: str, progress: int) -> None:
         self.progress_queue.put_nowait(progress)

diff --git a/home/import_helpers.py b/home/import_helpers.py
@@ -25,44 +25,32 @@
 
 from .xlsx_helpers import get_active_sheet
 
-TYPO_KEYWORDS = [
-    "question type",
-    "question-type",
-    "high result page",
-    "high-result-page",
-    "high inflection",
-    "high-inflection",
-    "medium result page",
-    "medium-result-page",
-    "medium inflection",
-    "medium-inflection",
-    "low result page",
-    "low-result-page",
-    "skip threshold",
-    "skip-threshold",
-    "skip high result page",
-    "skip-high-result-page",
-    "generic error",
-    "generic_error",
-    "answer semantic ids",
-    "answer-semantic-id",
-    "question semantic id",
-    "question-semantic-id",
-    "answer responses",
-    "answer-responses",
-]
-"""
-List of keywords known to be common user typos or formatting inconsistencies.
-
-These keywords are identified as common variations or errors in user input that
-should be corrected by converting them to snake_case format. The list contains
-different representations of header titles from CMS-Forms for conversion to snake_case.
-
-Any additional keywords from Content Pages and other import applications that need
-similar corrections should be appended to this list to maintain uniformity in data processing.
-Contentset uses Pascal casing so changes to the application may be needed first before including
-those variations in the list.
-"""
+INVALID_CHARACTERS = {
+    ":",
+    ";",
+    "!",
+    "@",
+    "#",
+    "$",
+    "%",
+    "^",
+    "&",
+    "*",
+    "(",
+    ")",
+    "+",
+    "=",
+    "{",
+    "}",
+    "[",
+    "]",
+    "|",
+    "\\",
+    "/",
+    "?",
+    ">",
+    "<",
+}
 
 
 class ImportException(Exception):
@@ -215,55 +203,50 @@ def check_empty_rows(rows: list[dict[str, Any]], row_num: int) -> None:
         )
 
 
-def convert_headers_to_snake_case(headers: list[str]) -> dict[str, str]:
+def convert_headers_to_snake_case(headers: list[str], row_num: int) -> dict[str, str]:
     """
     Converts a list of headers to snake_case and returns a mapping.
     """
-    return {header: to_snake_case(header) for header in headers}
+    return {header: to_snake_case(header, row_num) for header in headers}
 
 
-def to_snake_case(s: str) -> str:
+def to_snake_case(s: str, row_num: int) -> str:
     """
-    Converts string to snake_case.
+    Replaces each run of non-word characters or underscores with one underscore.
+    Raises ImportException if the header contains any of INVALID_CHARACTERS.
     """
-    return re.sub(r"[\W_]+", "_", s).lower().strip("_")
+    if any(char in s for char in INVALID_CHARACTERS):
+        raise ImportException(
+            f"Invalid header: '{s}' contains invalid characters.", row_num=row_num
+        )
+    return re.sub(r"[\W_]+", "_", s).strip("_")
 
 
 def fix_rows(rows: Iterator[dict[str | Any, Any]]) -> Iterator[dict[str, str | None]]:
     """
-    Fix keys for all rows by lowercasing keys, optionally converting to snake_case
-    if header text matches typo_keywords, and removing whitespace from keys and values.
+    Fix keys for all rows by lowercasing keys and removing whitespace from keys and values
     """
-
     try:
         first_row = next(rows)
     except StopIteration:
         return iter([])
 
-    if len(first_row) != len(fix_row(first_row, TYPO_KEYWORDS)):
+    if len(first_row) != len(fix_row(first_row)):
         raise ImportException(
             "Invalid format. Please check that there are no duplicate headers."
         )
-    yield fix_row(first_row, TYPO_KEYWORDS)
+    yield fix_row(first_row)
 
     for row in rows:
-        yield fix_row(row, TYPO_KEYWORDS)
+        yield fix_row(row)
 
 
-def fix_row(row: dict[str, str | None], keywords: list[str]) -> dict[str, str | None]:
+def fix_row(row: dict[str, str | None]) -> dict[str, str | None]:
     """
-    Fix a single row by lowercasing the key, converting it to snake_case
-    if it matches a typo_keyword, and removing whitespace from the key and value.
+    Fix a single row by lowercasing the key and removing whitespace from the key and value
     """
     try:
-        return {
-            (
-                to_snake_case(_normalise_key(k))
-                if _normalise_key(k) in keywords
-                else _normalise_key(k)
-            ): _normalise_value(v)
-            for k, v in row.items()
-        }
+        return {_normalise_key(k): _normalise_value(v) for k, v in row.items()}
     except AttributeError:
         raise ImportException(
             "Invalid format. Please check that all row values have headers."

diff --git a/home/tests/import-export-data/invalid_character.csv b/home/tests/import-export-data/invalid_character.csv
@@ -0,0 +1,5 @@
+title,question type:,tags,Slug,version,locale,High result page,High inflection,Medium result page,Medium inflection,Low result page,Skip threshold,Skip high result page,generic_error,question,explainer,error,min,max,answers,scores,Answer semantic ids,Question semantic id,Answer responses
+Freetext Question,freetext_question,draft-assessment,Draft-assessment,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
+Test min max range,integer_question,test-min-max-range,test-min-max-range,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error,Lowest temeprature you're experienced,We need to know some things,This is an error message,0,30,,,,lowest-temperature,
+Weather Trivia,integer_question,weather-trivia,weather-trivia,v1.0,en,high-inflection,5,medium-score,1,low-score,0,,"Sorry, we didn't quite get that.",What's the coldest weather you're experienced?,We need to know some things,Your reply should be between {min} and {max},50,70,,,,coldest-weather,
+Draft Assessment 2,freetext_question,draft-assessment,Draft-assessment-2,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
diff --git a/home/tests/test_assessment_import_export.py b/home/tests/test_assessment_import_export.py
@@ -690,6 +690,19 @@ def test_empty_rows(self, csv_impexp: ImportExport) -> None:
         ]
         assert e.value.row_num == 1
 
+    def test_invalid_character(self, csv_impexp: ImportExport) -> None:
+        """
+        Importing a CSV with an invalid character in a header
+        should raise an error with the row number and the header
+        that was wrongly entered.
+        """
+        with pytest.raises(ImportException) as e:
+            csv_impexp.import_file("invalid_character.csv")
+        assert e.value.message == [
+            "Invalid header: 'question type:' contains invalid characters."
+        ]
+        assert e.value.row_num == 1
+
 
 @pytest.mark.usefixtures("result_content_pages")
 @pytest.mark.django_db()