Skip to content

Commit

Permalink
Add check for invalid characters in headers, an exception and a test …
Browse files Browse the repository at this point in the history
…to handle it
  • Loading branch information
DevChima committed Feb 18, 2025
1 parent 3b890da commit cc507cd
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 62 deletions.
15 changes: 13 additions & 2 deletions home/import_assessments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from home.import_helpers import (
ImportException,
convert_headers_to_snake_case,
validate_using_form,
)
from home.import_helpers import (
Expand Down Expand Up @@ -115,9 +116,19 @@ def parse_file(self) -> list["AssessmentRow"]:
rows = [row for _, row in row_iterator]

original_headers = rows[0].keys()
self.validate_headers(list(original_headers), row_num=1)
headers_mapping = convert_headers_to_snake_case(
list(original_headers), row_num=1
)
snake_case_headers = list(headers_mapping.values())
self.validate_headers(snake_case_headers, row_num=1)
transformed_rows = [
{headers_mapping[key]: value for key, value in row.items()} for row in rows
]

return [AssessmentRow.from_flat(row, i + 2) for i, row in enumerate(rows)]
return [
AssessmentRow.from_flat(row, i + 2)
for i, row in enumerate(transformed_rows)
]

def set_progress(self, message: str, progress: int) -> None:
self.progress_queue.put_nowait(progress)
Expand Down
103 changes: 43 additions & 60 deletions home/import_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,44 +25,32 @@

from .xlsx_helpers import get_active_sheet

TYPO_KEYWORDS = [
"question type",
"question-type",
"high result page",
"high-result-page",
"high inflection",
"high-inflection",
"medium result page",
"medium-result-page",
"medium inflection",
"medium-inflection",
"low result page",
"low-result-page",
"skip threshold",
"skip-threshold",
"skip high result page",
"skip-high-result-page",
"generic error",
"generic_error",
"answer semantic ids",
"answer-semantic-id",
"question semantic id",
"question-semantic-id",
"answer responses",
"answer-responses",
]
"""
List of keywords known to be common user typos or formatting inconsistencies.
These keywords are identified as common variations or errors in user input that
should be corrected by converting them to snake_case format. The list contains
different representations of header titles from CMS-Forms for conversion to snake_case.
Any additional keywords from Content Pages and other import applications that need
similar corrections should be appended to this list to maintain uniformity in data processing.
Contentset uses Pascal casing so changes to the application may be needed first before including
those variations in the list.
"""
# Characters that are not allowed in import file headers. to_snake_case
# raises an ImportException when a header contains any one of them.
# Built from a single string; each character becomes one set member.
INVALID_CHARACTERS = set(":;!@#$%^&*()+={}[]|\\/?><")


class ImportException(Exception):
Expand Down Expand Up @@ -215,55 +203,50 @@ def check_empty_rows(rows: list[dict[str, Any]], row_num: int) -> None:
)


def convert_headers_to_snake_case(headers: list[str]) -> dict[str, str]:
def convert_headers_to_snake_case(headers: list[str], row_num: int) -> dict[str, str]:
"""
Converts a list of headers to snake_case and returns a mapping.
"""
return {header: to_snake_case(header) for header in headers}
return {header: to_snake_case(header, row_num) for header in headers}


def to_snake_case(s: str) -> str:
def to_snake_case(s: str, row_num: int) -> str:
"""
Converts string to snake_case.
Converts a given string to snake_case if it contains spaces or hyphens.
Raises an ImportException for headers containing invalid special characters.
"""
return re.sub(r"[\W_]+", "_", s).lower().strip("_")
if any(char in s for char in INVALID_CHARACTERS):
raise ImportException(
f"Invalid header: '{s}' contains invalid characters.", row_num=row_num
)
return re.sub(r"[\W_]+", "_", s).strip("_")


def fix_rows(rows: Iterator[dict[str | Any, Any]]) -> Iterator[dict[str, str | None]]:
"""
Fix keys for all rows by lowercasing keys, optionally converting to snake_case
if header text matches typo_keywords, and removing whitespace from keys and values.
Fix keys for all rows by lowercasing keys and removing whitespace from keys and values
"""

try:
first_row = next(rows)
except StopIteration:
return iter([])

if len(first_row) != len(fix_row(first_row, TYPO_KEYWORDS)):
if len(first_row) != len(fix_row(first_row)):
raise ImportException(
"Invalid format. Please check that there are no duplicate headers."
)
yield fix_row(first_row, TYPO_KEYWORDS)
yield fix_row(first_row)

for row in rows:
yield fix_row(row, TYPO_KEYWORDS)
yield fix_row(row)


def fix_row(row: dict[str, str | None], keywords: list[str]) -> dict[str, str | None]:
def fix_row(row: dict[str, str | None]) -> dict[str, str | None]:
"""
Fix a single row by lowercasing the key, converting it to snake_case
if it matches a typo_keyword, and removing whitespace from the key and value.
Fix a single row by lowercasing the key and removing whitespace from the key and value
"""
try:
return {
(
to_snake_case(_normalise_key(k))
if _normalise_key(k) in keywords
else _normalise_key(k)
): _normalise_value(v)
for k, v in row.items()
}
return {_normalise_key(k): _normalise_value(v) for k, v in row.items()}
except AttributeError:
raise ImportException(
"Invalid format. Please check that all row values have headers."
Expand Down
5 changes: 5 additions & 0 deletions home/tests/import-export-data/invalid_character.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
title,question type:,tags,Slug,version,locale,High result page,High inflection,Medium result page,Medium inflection,Low result page,Skip threshold,Skip high result page,generic_error,question,explainer,error,min,max,answers,scores,Answer semantic ids,Question semantic id,Answer responses
Freetext Question,freetext_question,draft-assessment,Draft-assessment,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
Test min max range,integer_question,test-min-max-range,test-min-max-range,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error,Lowest temeprature you're experienced,We need to know some things,This is an error message,0,30,,,,lowest-temperature,
Weather Trivia,integer_question,weather-trivia,weather-trivia,v1.0,en,high-inflection,5,medium-score,1,low-score,0,,"Sorry, we didn't quite get that.",What's the coldest weather you're experienced?,We need to know some things,Your reply should be between {min} and {max},50,70,,,,coldest-weather,
Draft Assessment 2,freetext_question,draft-assessment,Draft-assessment-2,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
13 changes: 13 additions & 0 deletions home/tests/test_assessment_import_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,19 @@ def test_empty_rows(self, csv_impexp: ImportExport) -> None:
]
assert e.value.row_num == 1

def test_invalid_character(self, csv_impexp: ImportExport) -> None:
"""
Importing a CSV with an invalid character in the header
should return an error with the line number and header
that was wrongly entered.
"""
with pytest.raises(ImportException) as e:
csv_impexp.import_file("invalid_character.csv")
assert e.value.message == [
"Invalid header: 'question type:' contains invalid characters."
]
assert e.value.row_num == 1


@pytest.mark.usefixtures("result_content_pages")
@pytest.mark.django_db()
Expand Down

0 comments on commit cc507cd

Please sign in to comment.