Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CMS Forms Flexible Imports #419

Merged
merged 12 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions home/import_assessments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from home.import_helpers import (
ImportException,
convert_headers_to_snake_case,
validate_using_form,
)
from home.import_helpers import (
Expand Down Expand Up @@ -115,9 +116,19 @@ def parse_file(self) -> list["AssessmentRow"]:
rows = [row for _, row in row_iterator]

original_headers = rows[0].keys()
self.validate_headers(list(original_headers), row_num=1)
headers_mapping = convert_headers_to_snake_case(
list(original_headers), row_num=1
)
snake_case_headers = list(headers_mapping.values())
self.validate_headers(snake_case_headers, row_num=1)
transformed_rows = [
{headers_mapping[key]: value for key, value in row.items()} for row in rows
]

return [AssessmentRow.from_flat(row, i + 2) for i, row in enumerate(rows)]
return [
AssessmentRow.from_flat(row, i + 2)
for i, row in enumerate(transformed_rows)
]

def set_progress(self, message: str, progress: int) -> None:
self.progress_queue.put_nowait(progress)
Expand Down
103 changes: 43 additions & 60 deletions home/import_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,44 +25,32 @@

from .xlsx_helpers import get_active_sheet

TYPO_KEYWORDS = [
"question type",
"question-type",
"high result page",
"high-result-page",
"high inflection",
"high-inflection",
"medium result page",
"medium-result-page",
"medium inflection",
"medium-inflection",
"low result page",
"low-result-page",
"skip threshold",
"skip-threshold",
"skip high result page",
"skip-high-result-page",
"generic error",
"generic_error",
"answer semantic ids",
"answer-semantic-id",
"question semantic id",
"question-semantic-id",
"answer responses",
"answer-responses",
]
"""
List of keywords known to be common user typos or formatting inconsistencies.

These keywords are identified as common variations or errors in user input that
should be corrected by converting them to snake_case format. The list contains
different representations of header titles from CMS-Forms for conversion to snake_case.

Any additional keywords from Content Pages and other import applications that need
similar corrections should be appended to this list to maintain uniformity in data processing.
Contentset uses Pascal casing so changes to the application may be needed first before including
those variations in the list.
"""
INVALID_CHARACTERS = {
":",
";",
"!",
"@",
"#",
"$",
"%",
"^",
"&",
"*",
"(",
")",
"+",
"=",
"{",
"}",
"[",
"]",
"|",
"\\",
"/",
"?",
">",
"<",
}


class ImportException(Exception):
Expand Down Expand Up @@ -215,55 +203,50 @@ def check_empty_rows(rows: list[dict[str, Any]], row_num: int) -> None:
)


def convert_headers_to_snake_case(headers: list[str]) -> dict[str, str]:
def convert_headers_to_snake_case(headers: list[str], row_num: int) -> dict[str, str]:
"""
Converts a list of headers to snake_case and returns a mapping.
"""
return {header: to_snake_case(header) for header in headers}
return {header: to_snake_case(header, row_num) for header in headers}


def to_snake_case(s: str) -> str:
def to_snake_case(s: str, row_num: int) -> str:
"""
Converts string to snake_case.
Converts a given string to snake_case if it contains spaces or hyphens.
Raises an ImportException for headers that contain invalid special characters.
"""
return re.sub(r"[\W_]+", "_", s).lower().strip("_")
if any(char in s for char in INVALID_CHARACTERS):
raise ImportException(
f"Invalid header: '{s}' contains invalid characters.", row_num=row_num
)
return re.sub(r"[\W_]+", "_", s).strip("_")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than maintaining a long list of invalid non-word characters (some of which we may want to allow in the future) and replacing every other non-word character with _, we can replace only the characters that are equivalent to _ (spaces and hyphens):

Suggested change
if any(char in s for char in INVALID_CHARACTERS):
raise ImportException(
f"Invalid header: '{s}' contains invalid characters.", row_num=row_num
)
return re.sub(r"[\W_]+", "_", s).strip("_")
return s.replace(" ", "_").replace("-", "_").strip("_")

(I typed this directly into the PR comment, so check that it works before accepting the suggestion.)



def fix_rows(rows: Iterator[dict[str | Any, Any]]) -> Iterator[dict[str, str | None]]:
"""
Fix keys for all rows by lowercasing keys, optionally converting to snake_case
if header text matches typo_keywords, and removing whitespace from keys and values.
Fix keys for all rows by lowercasing keys and removing whitespace from keys and values
"""

try:
first_row = next(rows)
except StopIteration:
return iter([])

if len(first_row) != len(fix_row(first_row, TYPO_KEYWORDS)):
if len(first_row) != len(fix_row(first_row)):
raise ImportException(
"Invalid format. Please check that there are no duplicate headers."
)
yield fix_row(first_row, TYPO_KEYWORDS)
yield fix_row(first_row)

for row in rows:
yield fix_row(row, TYPO_KEYWORDS)
yield fix_row(row)


def fix_row(row: dict[str, str | None], keywords: list[str]) -> dict[str, str | None]:
def fix_row(row: dict[str, str | None]) -> dict[str, str | None]:
"""
Fix a single row by lowercasing the key, converting it to snake_case
if it matches a typo_keyword, and removing whitespace from the key and value.
Fix a single row by lowercasing the key and removing whitespace from the key and value
"""
try:
return {
(
to_snake_case(_normalise_key(k))
if _normalise_key(k) in keywords
else _normalise_key(k)
): _normalise_value(v)
for k, v in row.items()
}
return {_normalise_key(k): _normalise_value(v) for k, v in row.items()}
except AttributeError:
raise ImportException(
"Invalid format. Please check that all row values have headers."
Expand Down
5 changes: 5 additions & 0 deletions home/tests/import-export-data/invalid_character.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
title,question type:,tags,Slug,version,locale,High result page,High inflection,Medium result page,Medium inflection,Low result page,Skip threshold,Skip high result page,generic_error,question,explainer,error,min,max,answers,scores,Answer semantic ids,Question semantic id,Answer responses
Freetext Question,freetext_question,draft-assessment,Draft-assessment,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
Test min max range,integer_question,test-min-max-range,test-min-max-range,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error,Lowest temeprature you're experienced,We need to know some things,This is an error message,0,30,,,,lowest-temperature,
Weather Trivia,integer_question,weather-trivia,weather-trivia,v1.0,en,high-inflection,5,medium-score,1,low-score,0,,"Sorry, we didn't quite get that.",What's the coldest weather you're experienced?,We need to know some things,Your reply should be between {min} and {max},50,70,,,,coldest-weather,
Draft Assessment 2,freetext_question,draft-assessment,Draft-assessment-2,v1.0,en,high-inflection,5,medium-score,3,,0,,This is a generic error for draft page,Is this a draft assessment,,,,,,,,draf-assessment,
13 changes: 13 additions & 0 deletions home/tests/test_assessment_import_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,19 @@ def test_empty_rows(self, csv_impexp: ImportExport) -> None:
]
assert e.value.row_num == 1

def test_invalid_character(self, csv_impexp: ImportExport) -> None:
"""
Importing a CSV with an invalid character in the header
should return an error with the line number and header
that was wrongly entered.
"""
with pytest.raises(ImportException) as e:
csv_impexp.import_file("invalid_character.csv")
assert e.value.message == [
"Invalid header: 'question type:' contains invalid characters."
]
assert e.value.row_num == 1


@pytest.mark.usefixtures("result_content_pages")
@pytest.mark.django_db()
Expand Down