diff --git a/home/import_assessments.py b/home/import_assessments.py index 50779595..f5d5cbe0 100644 --- a/home/import_assessments.py +++ b/home/import_assessments.py @@ -13,6 +13,7 @@ from home.import_helpers import ( ImportException, + convert_headers_to_snake_case, validate_using_form, ) from home.import_helpers import ( @@ -115,9 +116,19 @@ def parse_file(self) -> list["AssessmentRow"]: rows = [row for _, row in row_iterator] original_headers = rows[0].keys() - self.validate_headers(list(original_headers), row_num=1) + headers_mapping = convert_headers_to_snake_case( + list(original_headers), row_num=1 + ) + snake_case_headers = list(headers_mapping.values()) + self.validate_headers(snake_case_headers, row_num=1) + transformed_rows = [ + {headers_mapping[key]: value for key, value in row.items()} for row in rows + ] - return [AssessmentRow.from_flat(row, i + 2) for i, row in enumerate(rows)] + return [ + AssessmentRow.from_flat(row, i + 2) + for i, row in enumerate(transformed_rows) + ] def set_progress(self, message: str, progress: int) -> None: self.progress_queue.put_nowait(progress) diff --git a/home/import_helpers.py b/home/import_helpers.py index c45355a2..8467a814 100644 --- a/home/import_helpers.py +++ b/home/import_helpers.py @@ -25,44 +25,32 @@ from .xlsx_helpers import get_active_sheet -TYPO_KEYWORDS = [ - "question type", - "question-type", - "high result page", - "high-result-page", - "high inflection", - "high-inflection", - "medium result page", - "medium-result-page", - "medium inflection", - "medium-inflection", - "low result page", - "low-result-page", - "skip threshold", - "skip-threshold", - "skip high result page", - "skip-high-result-page", - "generic error", - "generic_error", - "answer semantic ids", - "answer-semantic-id", - "question semantic id", - "question-semantic-id", - "answer responses", - "answer-responses", -] -""" -List of keywords known to be common user typos or formatting inconsistencies. 
- -These keywords are identified as common variations or errors in user input that -should be corrected by converting them to snake_case format. The list contains -different representations of header titles from CMS-Forms for conversion to snake_case. - -Any additional keywords from Content Pages and other import applications that need -similar corrections should be appended to this list to maintain uniformity in data processing. -Contentset uses Pascal casing so changes to the application may be needed first before including -those variations in the list. -""" +INVALID_CHARACTERS = { + ":", + ";", + "!", + "@", + "#", + "$", + "%", + "^", + "&", + "*", + "(", + ")", + "+", + "=", + "{", + "}", + "[", + "]", + "|", + "\\", + "/", + "?", + ">", + "<", +} class ImportException(Exception): @@ -215,55 +203,50 @@ def check_empty_rows(rows: list[dict[str, Any]], row_num: int) -> None: ) -def convert_headers_to_snake_case(headers: list[str]) -> dict[str, str]: +def convert_headers_to_snake_case(headers: list[str], row_num: int) -> dict[str, str]: """ Converts a list of headers to snake_case and returns a mapping. """ - return {header: to_snake_case(header) for header in headers} + return {header: to_snake_case(header, row_num) for header in headers} -def to_snake_case(s: str) -> str: +def to_snake_case(s: str, row_num: int) -> str: """ - Converts string to snake_case. + Converts a given string to snake_case if it contains spaces or hyphens. + Throws an exception for invalid headers containing special characters. 
""" - return re.sub(r"[\W_]+", "_", s).lower().strip("_") + if any(char in s for char in INVALID_CHARACTERS): + raise ImportException( + f"Invalid header: '{s}' contains invalid characters.", row_num=row_num + ) + return re.sub(r"[\W_]+", "_", s).strip("_") def fix_rows(rows: Iterator[dict[str | Any, Any]]) -> Iterator[dict[str, str | None]]: """ - Fix keys for all rows by lowercasing keys, optionally converting to snake_case - if header text matches typo_keywords, and removing whitespace from keys and values. + Fix keys for all rows by lowercasing keys and removing whitespace from keys and values """ - try: first_row = next(rows) except StopIteration: return iter([]) - if len(first_row) != len(fix_row(first_row, TYPO_KEYWORDS)): + if len(first_row) != len(fix_row(first_row)): raise ImportException( "Invalid format. Please check that there are no duplicate headers." ) - yield fix_row(first_row, TYPO_KEYWORDS) + yield fix_row(first_row) for row in rows: - yield fix_row(row, TYPO_KEYWORDS) + yield fix_row(row) -def fix_row(row: dict[str, str | None], keywords: list[str]) -> dict[str, str | None]: +def fix_row(row: dict[str, str | None]) -> dict[str, str | None]: """ - Fix a single row by lowercasing the key, converting it to snake_case - if it matches a typo_keyword, and removing whitespace from the key and value. + Fix a single row by lowercasing the key and removing whitespace from the key and value """ try: - return { - ( - to_snake_case(_normalise_key(k)) - if _normalise_key(k) in keywords - else _normalise_key(k) - ): _normalise_value(v) - for k, v in row.items() - } + return {_normalise_key(k): _normalise_value(v) for k, v in row.items()} except AttributeError: raise ImportException( "Invalid format. Please check that all row values have headers." 
diff --git a/home/tests/import-export-data/invalid_character.csv b/home/tests/import-export-data/invalid_character.csv new file mode 100644 index 00000000..88aac940 --- /dev/null +++ b/home/tests/import-export-data/invalid_character.csv @@ -0,0 +1,5 @@ +title,question type:,tags,Slug,version,locale,High result page,High inflection,Medium result page,Medium inflection,Low result page,Skip threshold,Skip high result page,generic_error,question,explainer,error,min,max,answers,scores,Answer semantic ids,Question semantic id,Answer responses +Freetext Question,freetext_question,draft-assessment,Draft-assessment,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment, +Test min max range,integer_question,test-min-max-range,test-min-max-range,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error,Lowest temeprature you're experienced,We need to know some things,This is an error message,0,30,,,,lowest-temperature, +Weather Trivia,integer_question,weather-trivia,weather-trivia,v1.0,en,high-inflection,5,medium-score,1,low-score,0,,"Sorry, we didn't quite get that.",What's the coldest weather you're experienced?,We need to know some things,Your reply should be between {min} and {max},50,70,,,,coldest-weather, +Draft Assessment 2,freetext_question,draft-assessment,Draft-assessment-2,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment, diff --git a/home/tests/test_assessment_import_export.py b/home/tests/test_assessment_import_export.py index 0645076b..08214fc4 100644 --- a/home/tests/test_assessment_import_export.py +++ b/home/tests/test_assessment_import_export.py @@ -690,6 +690,19 @@ def test_empty_rows(self, csv_impexp: ImportExport) -> None: ] assert e.value.row_num == 1 + def test_invalid_character(self, csv_impexp: ImportExport) -> None: + """ + Importing a CSV with an invalid character in the 
header + should return an error with the line number and header + that was wrongly entered. + """ + with pytest.raises(ImportException) as e: + csv_impexp.import_file("invalid_character.csv") + assert e.value.message == [ + "Invalid header: 'question type:' contains invalid characters." + ] + assert e.value.row_num == 1 + @pytest.mark.usefixtures("result_content_pages") @pytest.mark.django_db()