diff --git a/taipy/core/data/_file_datanode_mixin.py b/taipy/core/data/_file_datanode_mixin.py
index 4521922d25..d24356dfaa 100644
--- a/taipy/core/data/_file_datanode_mixin.py
+++ b/taipy/core/data/_file_datanode_mixin.py
@@ -14,11 +14,13 @@
 import shutil
 from datetime import datetime
 from os.path import isfile
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 from taipy.config.config import Config
+from taipy.logger._taipy_logger import _TaipyLogger
 
 from .._entity._reload import _self_reload
+from ..reason import InvalidUploadFile, ReasonCollection, UploadFileCanNotBeRead
 from .data_node import DataNode
 from .data_node_id import Edit
 
@@ -34,6 +36,8 @@ class _FileDataNodeMixin(object):
     _DEFAULT_PATH_KEY = "default_path"
     _IS_GENERATED_KEY = "is_generated"
 
+    __logger = _TaipyLogger._get_logger()
+
     def __init__(self, properties: Dict) -> None:
         self._path: str = properties.get(self._PATH_KEY, properties.get(self._DEFAULT_PATH_KEY))
         self._is_generated: bool = properties.get(self._IS_GENERATED_KEY, self._path is None)
@@ -92,3 +96,56 @@ def _migrate_path(self, storage_type, old_path) -> str:
         if os.path.exists(old_path):
             shutil.move(old_path, new_path)
         return new_path
+
+    def _get_downloadable_path(self) -> str:
+        """Get the downloadable path of the file data of the data node.
+
+        Returns:
+            The downloadable path of the file data of the data node if it exists, otherwise an empty string.
+        """
+        if os.path.exists(self.path) and isfile(self._path):
+            return self.path
+
+        return ""
+
+    def _upload(self, path: str, upload_checker: Optional[Callable[[str, Any], bool]] = None) -> ReasonCollection:
+        """Upload a file's data to the data node.
+
+        Parameters:
+            path (str): The path of the file to upload to the data node.
+            upload_checker (Optional[Callable[[str, Any], bool]]): A function to check if the upload is allowed.
+                The function takes the file name of the uploaded file and the data read from it as arguments and
+                returns True if the upload is allowed, otherwise False.
+
+        Returns:
+            A `ReasonCollection` listing the reasons the upload was rejected, or an empty collection if it succeeded.
+ """ + from ._data_manager_factory import _DataManagerFactory + + reason_collection = ReasonCollection() + + upload_path = pathlib.Path(path) + + try: + upload_data = self._read_from_path(str(upload_path)) + except Exception as err: + self.__logger.error(f"Error while uploading {upload_path.name} to data node {self.id}:") # type: ignore[attr-defined] + self.__logger.error(f"Error: {err}") + reason_collection._add_reason(self.id, UploadFileCanNotBeRead(upload_path.name, self.id)) # type: ignore[attr-defined] + return reason_collection + + if upload_checker is not None: + if not upload_checker(upload_path.name, upload_data): + reason_collection._add_reason(self.id, InvalidUploadFile(upload_path.name, self.id)) # type: ignore[attr-defined] + return reason_collection + + shutil.copy(upload_path, self.path) + + self.track_edit(timestamp=datetime.now()) # type: ignore[attr-defined] + self.unlock_edit() # type: ignore[attr-defined] + _DataManagerFactory._build_manager()._set(self) # type: ignore[arg-type] + + return reason_collection + + def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + raise NotImplementedError diff --git a/taipy/core/data/_tabular_datanode_mixin.py b/taipy/core/data/_tabular_datanode_mixin.py index 0cf50ad420..cf6cbd1b90 100644 --- a/taipy/core/data/_tabular_datanode_mixin.py +++ b/taipy/core/data/_tabular_datanode_mixin.py @@ -29,15 +29,16 @@ class _TabularDataNodeMixin(object): _VALID_STRING_EXPOSED_TYPES = [_EXPOSED_TYPE_PANDAS, _EXPOSED_TYPE_NUMPY] def __init__(self, **kwargs) -> None: - self._decoder: Union[Callable[[List[Any]], Any], Callable[[Dict[Any, Any]], Any]] + self._decoder: Union[Callable, Any] self.custom_document = kwargs.get(self._EXPOSED_TYPE_PROPERTY) - if kwargs.get(self._HAS_HEADER_PROPERTY, True): - self._decoder = self._default_decoder_with_header - else: - self._decoder = self._default_decoder_without_header + custom_decoder = getattr(self.custom_document, "decode", None) if callable(custom_decoder): self._decoder = custom_decoder + elif kwargs.get(self._HAS_HEADER_PROPERTY, True): + self._decoder = self._default_decoder_with_header + else: + self._decoder = self._default_decoder_without_header self._encoder = self._default_encoder custom_encoder = getattr(self.custom_document, "encode", None) diff --git a/taipy/core/data/csv.py b/taipy/core/data/csv.py index 3015390491..30cd710625 100644 --- a/taipy/core/data/csv.py +++ b/taipy/core/data/csv.py @@ -137,41 +137,48 @@ def storage_type(cls) -> str: return cls.__STORAGE_TYPE def _read(self): + return self._read_from_path() + + def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + if path is None: + path = self._path + properties = self.properties if properties[self._EXPOSED_TYPE_PROPERTY] == self._EXPOSED_TYPE_PANDAS: - return self._read_as_pandas_dataframe() + return self._read_as_pandas_dataframe(path=path) if properties[self._EXPOSED_TYPE_PROPERTY] == self._EXPOSED_TYPE_NUMPY: - return self._read_as_numpy() - return self._read_as() + return self._read_as_numpy(path=path) + return self._read_as(path=path) - def _read_as(self): + def _read_as(self, path: str): properties = self.properties - with open(self._path, encoding=properties[self.__ENCODING_KEY]) as csvFile: + with open(path, encoding=properties[self.__ENCODING_KEY]) as csvFile: if properties[self._HAS_HEADER_PROPERTY]: - reader = csv.DictReader(csvFile) - else: - reader = csv.reader(csvFile) + reader_with_header = csv.DictReader(csvFile) + return [self._decoder(line) for line in 
reader_with_header] - return [self._decoder(line) for line in reader] + reader_without_header = csv.reader(csvFile) + return [self._decoder(line) for line in reader_without_header] - def _read_as_numpy(self) -> np.ndarray: - return self._read_as_pandas_dataframe().to_numpy() + def _read_as_numpy(self, path: str) -> np.ndarray: + return self._read_as_pandas_dataframe(path=path).to_numpy() def _read_as_pandas_dataframe( - self, usecols: Optional[List[int]] = None, column_names: Optional[List[str]] = None + self, + path: str, + usecols: Optional[List[int]] = None, + column_names: Optional[List[str]] = None, ) -> pd.DataFrame: try: properties = self.properties if properties[self._HAS_HEADER_PROPERTY]: if column_names: - return pd.read_csv(self._path, encoding=properties[self.__ENCODING_KEY])[column_names] - return pd.read_csv(self._path, encoding=properties[self.__ENCODING_KEY]) + return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY])[column_names] + return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY]) else: if usecols: - return pd.read_csv( - self._path, encoding=properties[self.__ENCODING_KEY], header=None, usecols=usecols - ) - return pd.read_csv(self._path, encoding=properties[self.__ENCODING_KEY], header=None) + return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY], header=None, usecols=usecols) + return pd.read_csv(path, encoding=properties[self.__ENCODING_KEY], header=None) except pd.errors.EmptyDataError: return pd.DataFrame() diff --git a/taipy/core/data/excel.py b/taipy/core/data/excel.py index 0042c225ad..05c07ae8b7 100644 --- a/taipy/core/data/excel.py +++ b/taipy/core/data/excel.py @@ -10,7 +10,7 @@ # specific language governing permissions and limitations under the License. from datetime import datetime, timedelta -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Union import numpy as np import pandas as pd @@ -150,39 +150,45 @@ def _check_exposed_type(exposed_type): _TabularDataNodeMixin._check_exposed_type(t) def _read(self): + return self._read_from_path() + + def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + if path is None: + path = self._path + exposed_type = self.properties[self._EXPOSED_TYPE_PROPERTY] if exposed_type == self._EXPOSED_TYPE_PANDAS: - return self._read_as_pandas_dataframe() + return self._read_as_pandas_dataframe(path=path) if exposed_type == self._EXPOSED_TYPE_NUMPY: - return self._read_as_numpy() - return self._read_as() + return self._read_as_numpy(path=path) + return self._read_as(path=path) def _read_sheet_with_exposed_type( - self, sheet_exposed_type: str, sheet_name: str + self, path: str, sheet_exposed_type: str, sheet_name: str ) -> Optional[Union[np.ndarray, pd.DataFrame]]: if sheet_exposed_type == self._EXPOSED_TYPE_NUMPY: - return self._read_as_pandas_dataframe(sheet_name).to_numpy() # type: ignore + return self._read_as_numpy(path, sheet_name) elif sheet_exposed_type == self._EXPOSED_TYPE_PANDAS: - return self._read_as_pandas_dataframe(sheet_name) + return self._read_as_pandas_dataframe(path, sheet_name) return None - def _read_as(self): + def _read_as(self, path: str): try: properties = self.properties - excel_file = load_workbook(self._path) + excel_file = load_workbook(path) exposed_type = properties[self._EXPOSED_TYPE_PROPERTY] work_books = {} sheet_names = excel_file.sheetnames user_provided_sheet_names = properties.get(self.__SHEET_NAME_PROPERTY) or [] - if not isinstance(user_provided_sheet_names, (List, 
Set, Tuple)): + if not isinstance(user_provided_sheet_names, (list, set, tuple)): user_provided_sheet_names = [user_provided_sheet_names] provided_sheet_names = user_provided_sheet_names or sheet_names for sheet_name in provided_sheet_names: if sheet_name not in sheet_names: - raise NonExistingExcelSheet(sheet_name, self._path) + raise NonExistingExcelSheet(sheet_name, path) if isinstance(exposed_type, List): if len(provided_sheet_names) != len(exposed_type): @@ -201,7 +207,7 @@ def _read_as(self): sheet_exposed_type = exposed_type[i] if isinstance(sheet_exposed_type, str): - sheet_data = self._read_sheet_with_exposed_type(sheet_exposed_type, sheet_name) + sheet_data = self._read_sheet_with_exposed_type(path, sheet_exposed_type, sheet_name) if sheet_data is not None: work_books[sheet_name] = sheet_data continue @@ -223,14 +229,16 @@ def _read_as(self): return work_books - def _read_as_numpy(self): - sheets = self._read_as_pandas_dataframe() + def _read_as_numpy(self, path: str, sheet_names=None): + sheets = self._read_as_pandas_dataframe(path=path, sheet_names=sheet_names) if isinstance(sheets, dict): return {sheet_name: df.to_numpy() for sheet_name, df in sheets.items()} return sheets.to_numpy() - def _do_read_excel(self, sheet_names, kwargs) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]: - return pd.read_excel(self._path, sheet_name=sheet_names, **kwargs) + def _do_read_excel( + self, path: str, sheet_names, kwargs + ) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]: + return pd.read_excel(path, sheet_name=sheet_names, **kwargs) def __get_sheet_names_and_header(self, sheet_names): kwargs = {} @@ -241,10 +249,12 @@ def __get_sheet_names_and_header(self, sheet_names): kwargs["header"] = None return sheet_names, kwargs - def _read_as_pandas_dataframe(self, sheet_names=None) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]: + def _read_as_pandas_dataframe( + self, path: str, sheet_names=None + ) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]: sheet_names, kwargs = self.__get_sheet_names_and_header(sheet_names) try: - return self._do_read_excel(sheet_names, kwargs) + return self._do_read_excel(path, sheet_names, kwargs) except pd.errors.EmptyDataError: return pd.DataFrame() diff --git a/taipy/core/data/json.py b/taipy/core/data/json.py index f361bc9e60..3790787e8d 100644 --- a/taipy/core/data/json.py +++ b/taipy/core/data/json.py @@ -150,7 +150,13 @@ def decoder(self, decoder: json.JSONDecoder): self.properties[self._DECODER_KEY] = decoder def _read(self): - with open(self._path, "r", encoding=self.properties[self.__ENCODING_KEY]) as f: + return self._read_from_path() + + def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + if path is None: + path = self._path + + with open(path, "r", encoding=self.properties[self.__ENCODING_KEY]) as f: return json.load(f, cls=self._decoder) def _append(self, data: Any): diff --git a/taipy/core/data/parquet.py b/taipy/core/data/parquet.py index 31dedafb5e..57729c5001 100644 --- a/taipy/core/data/parquet.py +++ b/taipy/core/data/parquet.py @@ -11,7 +11,7 @@ from datetime import datetime, timedelta from os.path import isdir, isfile -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Set import numpy as np import pandas as pd @@ -181,18 +181,43 @@ def storage_type(cls) -> str: return cls.__STORAGE_TYPE def _read(self): - return self.read_with_kwargs() + return self._read_from_path() - def _read_as(self, read_kwargs: Dict): + def 
_read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + if path is None: + path = self._path + + # return None if data was never written + if not self.last_edit_date: + self._DataNode__logger.warning( + f"Data node {self.id} from config {self.config_id} is being read but has never been written." + ) + return None + + kwargs = self.properties[self.__READ_KWARGS_PROPERTY] + kwargs.update( + { + self.__ENGINE_PROPERTY: self.properties[self.__ENGINE_PROPERTY], + } + ) + kwargs.update(read_kwargs) + + if self.properties[self._EXPOSED_TYPE_PROPERTY] == self._EXPOSED_TYPE_PANDAS: + return self._read_as_pandas_dataframe(path, kwargs) + if self.properties[self._EXPOSED_TYPE_PROPERTY] == self._EXPOSED_TYPE_NUMPY: + return self._read_as_numpy(path, kwargs) + return self._read_as(path, kwargs) + + def _read_as(self, path: str, read_kwargs: Dict): custom_class = self.properties[self._EXPOSED_TYPE_PROPERTY] - list_of_dicts = self._read_as_pandas_dataframe(read_kwargs).to_dict(orient="records") + list_of_dicts = self._read_as_pandas_dataframe(path, read_kwargs).to_dict(orient="records") return [custom_class(**dct) for dct in list_of_dicts] - def _read_as_numpy(self, read_kwargs: Dict) -> np.ndarray: - return self._read_as_pandas_dataframe(read_kwargs).to_numpy() + def _read_as_numpy(self, path: str, read_kwargs: Dict) -> np.ndarray: + return self._read_as_pandas_dataframe(path, read_kwargs).to_numpy() - def _read_as_pandas_dataframe(self, read_kwargs: Dict) -> pd.DataFrame: - return pd.read_parquet(self._path, **read_kwargs) + def _read_as_pandas_dataframe(self, path: str, read_kwargs: Dict) -> pd.DataFrame: + return pd.read_parquet(path, **read_kwargs) def _append(self, data: Any): self.write_with_kwargs(data, engine="fastparquet", append=True) @@ -237,28 +262,4 @@ def read_with_kwargs(self, **read_kwargs): **read_kwargs (dict[str, any]): The keyword arguments passed to the function `pandas.read_parquet()`. """ - # return None if data was never written - if not self.last_edit_date: - self._DataNode__logger.warning( - f"Data node {self.id} from config {self.config_id} is being read but has never been written." 
- ) - return None - - properties = self.properties - exposed_type = properties[self._EXPOSED_TYPE_PROPERTY] - kwargs = properties[self.__READ_KWARGS_PROPERTY] - kwargs.update( - { - self.__ENGINE_PROPERTY: properties[self.__ENGINE_PROPERTY], - } - ) - kwargs.update(read_kwargs) - - return self._do_read_with_kwargs(exposed_type, kwargs) - - def _do_read_with_kwargs(self, exposed_type, read_kwargs) -> Union[pd.DataFrame, np.ndarray, List]: - if exposed_type == self._EXPOSED_TYPE_PANDAS: - return self._read_as_pandas_dataframe(read_kwargs) - if exposed_type == self._EXPOSED_TYPE_NUMPY: - return self._read_as_numpy(read_kwargs) - return self._read_as(read_kwargs) + return self._read_from_path(**read_kwargs) diff --git a/taipy/core/data/pickle.py b/taipy/core/data/pickle.py index 7c7693ff68..fd66992d28 100644 --- a/taipy/core/data/pickle.py +++ b/taipy/core/data/pickle.py @@ -11,7 +11,7 @@ import pickle from datetime import datetime, timedelta -from typing import List, Optional, Set +from typing import Any, List, Optional, Set from taipy.config.common.scope import Scope @@ -116,7 +116,13 @@ def storage_type(cls) -> str: return cls.__STORAGE_TYPE def _read(self): - with open(self._path, "rb") as pf: + return self._read_from_path() + + def _read_from_path(self, path: Optional[str] = None, **read_kwargs) -> Any: + if path is None: + path = self._path + + with open(path, "rb") as pf: return pickle.load(pf) def _write(self, data): diff --git a/taipy/core/reason/__init__.py b/taipy/core/reason/__init__.py index d32434a44f..4d1cade6ac 100644 --- a/taipy/core/reason/__init__.py +++ b/taipy/core/reason/__init__.py @@ -13,8 +13,10 @@ DataNodeEditInProgress, DataNodeIsNotWritten, EntityIsNotSubmittableEntity, + InvalidUploadFile, NotGlobalScope, Reason, + UploadFileCanNotBeRead, WrongConfigType, ) from .reason_collection import ReasonCollection diff --git a/taipy/core/reason/reason.py b/taipy/core/reason/reason.py index f627e2a0b8..dc7cf56fcc 100644 --- a/taipy/core/reason/reason.py +++ b/taipy/core/reason/reason.py @@ -123,3 +123,35 @@ class NotGlobalScope(Reason): def __init__(self, config_id: str): Reason.__init__(self, f'Data node config "{config_id}" does not have GLOBAL scope') + + +class UploadFileCanNotBeRead(Reason, _DataNodeReasonMixin): + """ + The uploaded file can not be read, therefore is not a valid data file for the data node. + + Attributes: + file_name (str): The name of the file that was uploaded. + datanode_id (str): The datanode id that the file is intended to upload to. + """ + + def __init__(self, file_name: str, datanode_id: str): + Reason.__init__( + self, + f"The uploaded file {file_name} can not be read, " + f'therefore is not a valid data file for data node "{datanode_id}"', + ) + _DataNodeReasonMixin.__init__(self, datanode_id) + + +class InvalidUploadFile(Reason, _DataNodeReasonMixin): + """ + The uploaded file has invalid data, therefore is not a valid data file for the data node. + + Attributes: + file_name (str): The name of the file that was uploaded. + datanode_id (str): The datanode id that the file is intended to upload to. 
+ """ + + def __init__(self, file_name: str, datanode_id: str): + Reason.__init__(self, f'The uploaded file {file_name} has invalid data for data node "{datanode_id}"') + _DataNodeReasonMixin.__init__(self, datanode_id) diff --git a/tests/core/data/test_csv_data_node.py b/tests/core/data/test_csv_data_node.py index 54a3b9c156..f64e994a2c 100644 --- a/tests/core/data/test_csv_data_node.py +++ b/tests/core/data/test_csv_data_node.py @@ -13,11 +13,14 @@ import os import pathlib import uuid -from datetime import datetime +from datetime import datetime, timedelta from time import sleep +import freezegun +import numpy as np import pandas as pd import pytest +from pandas.testing import assert_frame_equal from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -190,3 +193,125 @@ def test_migrate_to_new_path(self, tmp_path): assert ".data" not in dn.path assert os.path.exists(dn.path) + + def test_get_downloadable_path(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv") + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "pandas"}) + assert dn._get_downloadable_path() == path + + def test_get_downloadable_path_with_not_existing_file(self): + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": "NOT_EXISTING.csv", "exposed_type": "pandas"}) + assert dn._get_downloadable_path() == "" + + def test_upload(self, csv_file, tmpdir_factory): + old_csv_path = tmpdir_factory.mktemp("data").join("df.csv").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": old_csv_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + upload_content = pd.read_csv(csv_file) + + with freezegun.freeze_time(old_last_edit_date + timedelta(seconds=1)): + dn._upload(csv_file) + + assert_frame_equal(dn.read(), upload_content) # The content of the dn should change to the uploaded content + assert dn.last_edit_date > old_last_edit_date + assert dn.path == old_csv_path # The path of the dn should not change + + def test_upload_with_upload_check_pandas(self, csv_file, tmpdir_factory): + old_csv_path = tmpdir_factory.mktemp("data").join("df.csv").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": old_csv_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_column(upload_path, upload_data): + return upload_path.endswith(".csv") and upload_data.columns.tolist() == ["a", "b", "c"] + + not_exists_csv_path = tmpdir_factory.mktemp("data").join("not_exists.csv").strpath + reasons = dn._upload(not_exists_csv_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.csv can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + not_csv_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_csv").strpath + old_data.to_csv(not_csv_path, index=False) + # The upload should fail when the file is not a csv + reasons = dn._upload(not_csv_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_csv has invalid data for data node "{dn.id}"' + ) + + wrong_format_csv_path = 
tmpdir_factory.mktemp("data").join("wrong_format_df.csv").strpath + pd.DataFrame([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}]).to_csv(wrong_format_csv_path, index=False) + # The upload should fail when check_data_column() return False + reasons = dn._upload(wrong_format_csv_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.csv has invalid data for data node "{dn.id}"' + ) + + assert_frame_equal(dn.read(), old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_csv_path # The path of the dn should not change + + # The upload should succeed when check_data_column() return True + assert dn._upload(csv_file, upload_checker=check_data_column) + + def test_upload_with_upload_check_numpy(self, tmpdir_factory): + old_csv_path = tmpdir_factory.mktemp("data").join("df.csv").strpath + old_data = np.array([[1, 2, 3], [4, 5, 6]]) + + new_csv_path = tmpdir_factory.mktemp("data").join("new_upload_data.csv").strpath + new_data = np.array([[1, 2, 3], [4, 5, 6]]) + pd.DataFrame(new_data).to_csv(new_csv_path, index=False) + + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": old_csv_path, "exposed_type": "numpy"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_is_positive(upload_path, upload_data): + return upload_path.endswith(".csv") and np.all(upload_data > 0) + + not_exists_csv_path = tmpdir_factory.mktemp("data").join("not_exists.csv").strpath + reasons = dn._upload(not_exists_csv_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.csv can not be read" + f', therefore is not a valid data file for data node "{dn.id}"' + ) + + not_csv_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_csv").strpath + pd.DataFrame(old_data).to_csv(not_csv_path, index=False) + # The upload should fail when the file is not a csv + reasons = dn._upload(not_csv_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_csv has invalid data for data node "{dn.id}"' + ) + + wrong_format_csv_path = tmpdir_factory.mktemp("data").join("wrong_format_df.csv").strpath + pd.DataFrame(np.array([[-1, 2, 3], [-4, -5, -6]])).to_csv(wrong_format_csv_path, index=False) + # The upload should fail when check_data_is_positive() return False + reasons = dn._upload(wrong_format_csv_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.csv has invalid data for data node "{dn.id}"' + ) + + np.array_equal(dn.read(), old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_csv_path # The path of the dn should not change + + # The upload should succeed when check_data_is_positive() return True + assert dn._upload(new_csv_path, upload_checker=check_data_is_positive) diff --git a/tests/core/data/test_excel_data_node.py b/tests/core/data/test_excel_data_node.py index d86612ac07..903022813d 100644 --- a/tests/core/data/test_excel_data_node.py +++ 
b/tests/core/data/test_excel_data_node.py @@ -12,13 +12,15 @@ import os import pathlib import uuid -from datetime import datetime +from datetime import datetime, timedelta from time import sleep from typing import Dict +import freezegun import numpy as np import pandas as pd import pytest +from pandas.testing import assert_frame_equal from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -406,3 +408,127 @@ def test_migrate_to_new_path(self, tmp_path): assert ".data" not in dn.path assert os.path.exists(dn.path) + + def test_get_download_path(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx") + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "pandas"}) + assert dn._get_downloadable_path() == path + + def test_get_downloadable_path_with_not_existing_file(self): + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": "NOT_EXISTING.xlsx", "exposed_type": "pandas"}) + assert dn._get_downloadable_path() == "" + + def test_upload(self, excel_file, tmpdir_factory): + old_xlsx_path = tmpdir_factory.mktemp("data").join("df.xlsx").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": old_xlsx_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + upload_content = pd.read_excel(excel_file) + + with freezegun.freeze_time(old_last_edit_date + timedelta(seconds=1)): + dn._upload(excel_file) + + assert_frame_equal(dn.read()["Sheet1"], upload_content) # The data of dn should change to the uploaded content + assert dn.last_edit_date > old_last_edit_date + assert dn.path == old_xlsx_path # The path of the dn should not change + + def test_upload_with_upload_check_pandas(self, excel_file, tmpdir_factory): + old_xlsx_path = tmpdir_factory.mktemp("data").join("df.xlsx").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": old_xlsx_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_column(upload_path, upload_data): + """Check if the uploaded data has the correct file format and + the sheet named "Sheet1" has the correct columns. 
+ """ + return upload_path.endswith(".xlsx") and upload_data["Sheet1"].columns.tolist() == ["a", "b", "c"] + + not_exists_xlsx_path = tmpdir_factory.mktemp("data").join("not_exists.xlsx").strpath + reasons = dn._upload(not_exists_xlsx_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.xlsx can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + not_xlsx_path = tmpdir_factory.mktemp("data").join("wrong_format_df.xlsm").strpath + old_data.to_excel(not_xlsx_path, index=False) + # The upload should fail when the file is not a xlsx + reasons = dn._upload(not_xlsx_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.xlsm has invalid data for data node "{dn.id}"' + ) + + wrong_format_xlsx_path = tmpdir_factory.mktemp("data").join("wrong_format_df.xlsx").strpath + pd.DataFrame([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}]).to_excel(wrong_format_xlsx_path, index=False) + # The upload should fail when check_data_column() return False + reasons = dn._upload(wrong_format_xlsx_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.xlsx has invalid data for data node "{dn.id}"' + ) + + assert_frame_equal(dn.read()["Sheet1"], old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_xlsx_path # The path of the dn should not change + + # The upload should succeed when check_data_column() return True + assert dn._upload(excel_file, upload_checker=check_data_column) + + def test_upload_with_upload_check_numpy(self, tmpdir_factory): + old_excel_path = tmpdir_factory.mktemp("data").join("df.xlsx").strpath + old_data = np.array([[1, 2, 3], [4, 5, 6]]) + + new_excel_path = tmpdir_factory.mktemp("data").join("new_upload_data.xlsx").strpath + new_data = np.array([[1, 2, 3], [4, 5, 6]]) + pd.DataFrame(new_data).to_excel(new_excel_path, index=False) + + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": old_excel_path, "exposed_type": "numpy"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_is_positive(upload_path, upload_data): + return upload_path.endswith(".xlsx") and np.all(upload_data["Sheet1"] > 0) + + not_exists_xlsx_path = tmpdir_factory.mktemp("data").join("not_exists.xlsx").strpath + reasons = dn._upload(not_exists_xlsx_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.xlsx can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + wrong_format_not_excel_path = tmpdir_factory.mktemp("data").join("wrong_format_df.xlsm").strpath + pd.DataFrame(old_data).to_excel(wrong_format_not_excel_path, index=False) + # The upload should fail when the file is not a excel + reasons = dn._upload(wrong_format_not_excel_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.xlsm has invalid data for data node "{dn.id}"' + ) + + not_xlsx_path = tmpdir_factory.mktemp("data").join("wrong_format_df.xlsx").strpath + 
pd.DataFrame(np.array([[-1, 2, 3], [-4, -5, -6]])).to_excel(not_xlsx_path, index=False) + # The upload should fail when check_data_is_positive() return False + reasons = dn._upload(not_xlsx_path, upload_checker=check_data_is_positive) + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.xlsx has invalid data for data node "{dn.id}"' + ) + + np.array_equal(dn.read()["Sheet1"], old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_excel_path # The path of the dn should not change + + # The upload should succeed when check_data_is_positive() return True + assert dn._upload(new_excel_path, upload_checker=check_data_is_positive) diff --git a/tests/core/data/test_json_data_node.py b/tests/core/data/test_json_data_node.py index 4c086deb49..fa803b355f 100644 --- a/tests/core/data/test_json_data_node.py +++ b/tests/core/data/test_json_data_node.py @@ -18,6 +18,7 @@ from enum import Enum from time import sleep +import freezegun import numpy as np import pandas as pd import pytest @@ -390,3 +391,79 @@ def test_migrate_to_new_path(self, tmp_path): assert ".data" not in dn.path assert os.path.exists(dn.path) + + def test_get_download_path(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/json/example_dict.json") + dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": path}) + assert dn._get_downloadable_path() == path + + def test_get_download_path_with_not_existed_file(self): + dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": "NOT_EXISTED.json"}) + assert dn._get_downloadable_path() == "" + + def test_upload(self, json_file, tmpdir_factory): + old_json_path = tmpdir_factory.mktemp("data").join("df.json").strpath + old_data = [{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}] + + dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": old_json_path}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + with open(json_file, "r") as f: + upload_content = json.load(f) + + with freezegun.freeze_time(old_last_edit_date + datetime.timedelta(seconds=1)): + dn._upload(json_file) + + assert dn.read() == upload_content # The content of the dn should change to the uploaded content + assert dn.last_edit_date > old_last_edit_date + assert dn.path == old_json_path # The path of the dn should not change + + def test_upload_with_upload_check(self, json_file, tmpdir_factory): + old_json_path = tmpdir_factory.mktemp("data").join("df.json").strpath + old_data = [{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}] + + dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": old_json_path}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_keys(upload_path, upload_data): + all_column_is_abc = all(data.keys() == {"a", "b", "c"} for data in upload_data) + return upload_path.endswith(".json") and all_column_is_abc + + not_exists_json_path = tmpdir_factory.mktemp("data").join("not_exists.json").strpath + reasons = dn._upload(not_exists_json_path, upload_checker=check_data_keys) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.json can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + not_json_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_json").strpath + with open(not_json_path, "w") as f: + json.dump([{"a": 1, "b": 2, 
"d": 3}, {"a": 4, "b": 5, "d": 6}], f) + # The upload should fail when the file is not a json + reasons = dn._upload(not_json_path, upload_checker=check_data_keys) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_json has invalid data for data node "{dn.id}"' + ) + + wrong_format_json_path = tmpdir_factory.mktemp("data").join("wrong_format_df.json").strpath + with open(wrong_format_json_path, "w") as f: + json.dump([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}], f) + # The upload should fail when check_data_keys() return False + reasons = dn._upload(wrong_format_json_path, upload_checker=check_data_keys) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.json has invalid data for data node "{dn.id}"' + ) + + assert dn.read() == old_data # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_json_path # The path of the dn should not change + + # The upload should succeed when check_data_keys() return True + assert dn._upload(json_file, upload_checker=check_data_keys) diff --git a/tests/core/data/test_parquet_data_node.py b/tests/core/data/test_parquet_data_node.py index d389327aa4..610d1b8a50 100644 --- a/tests/core/data/test_parquet_data_node.py +++ b/tests/core/data/test_parquet_data_node.py @@ -12,12 +12,15 @@ import os import pathlib import uuid -from datetime import datetime +from datetime import datetime, timedelta from importlib import util from time import sleep +import freezegun +import numpy as np import pandas as pd import pytest +from pandas.testing import assert_frame_equal from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -230,3 +233,132 @@ def test_migrate_to_new_path(self, tmp_path): assert ".data" not in dn.path assert os.path.exists(dn.path) + + def test_get_downloadable_path(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.parquet") + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "pandas"}) + assert dn._get_downloadable_path() == path + + def test_get_downloadable_path_with_not_existing_file(self): + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": "NOT_EXISTING.parquet"}) + assert dn._get_downloadable_path() == "" + + def test_get_downloadable_path_as_directory_should_return_nothing(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/parquet_example") + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": path}) + assert dn._get_downloadable_path() == "" + + def test_upload(self, parquet_file_path, tmpdir_factory): + old_parquet_path = tmpdir_factory.mktemp("data").join("df.parquet").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": old_parquet_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + upload_content = pd.read_parquet(parquet_file_path) + + with freezegun.freeze_time(old_last_edit_date + timedelta(seconds=1)): + dn._upload(parquet_file_path) + + assert_frame_equal(dn.read(), upload_content) # The content of the dn should change to the uploaded content + assert dn.last_edit_date > old_last_edit_date + assert dn.path == old_parquet_path # The path of the 
dn should not change + + def test_upload_with_upload_check_pandas(self, parquet_file_path, tmpdir_factory): + old_parquet_path = tmpdir_factory.mktemp("data").join("df.parquet").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": old_parquet_path, "exposed_type": "pandas"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_column(upload_path, upload_data): + return upload_path.endswith(".parquet") and upload_data.columns.tolist() == ["a", "b", "c"] + + not_exists_parquet_path = tmpdir_factory.mktemp("data").join("not_exists.parquet").strpath + reasons = dn._upload(not_exists_parquet_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.parquet can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + not_parquet_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_parquet").strpath + old_data.to_parquet(not_parquet_path, index=False) + # The upload should fail when the file is not a parquet + reasons = dn._upload(not_parquet_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_parquet has invalid data for data node "{dn.id}"' + ) + + wrong_format_parquet_path = tmpdir_factory.mktemp("data").join("wrong_format_df.parquet").strpath + pd.DataFrame([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}]).to_parquet( + wrong_format_parquet_path, index=False + ) + # The upload should fail when check_data_column() return False + reasons = dn._upload(wrong_format_parquet_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.parquet has invalid data for data node "{dn.id}"' + ) + + assert_frame_equal(dn.read(), old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_parquet_path # The path of the dn should not change + + # The upload should succeed when check_data_column() return True + assert dn._upload(parquet_file_path, upload_checker=check_data_column) + + def test_upload_with_upload_check_numpy(self, tmpdir_factory): + old_parquet_path = tmpdir_factory.mktemp("data").join("df.parquet").strpath + old_data = np.array([[1, 2, 3], [4, 5, 6]]) + + new_parquet_path = tmpdir_factory.mktemp("data").join("new_upload_data.parquet").strpath + new_data = np.array([[1, 2, 3], [4, 5, 6]]) + pd.DataFrame(new_data, columns=["a", "b", "c"]).to_parquet(new_parquet_path, index=False) + + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": old_parquet_path, "exposed_type": "numpy"}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_is_positive(upload_path, upload_data): + return upload_path.endswith(".parquet") and np.all(upload_data > 0) + + not_exists_parquet_path = tmpdir_factory.mktemp("data").join("not_exists.parquet").strpath + reasons = dn._upload(not_exists_parquet_path, upload_checker=check_data_is_positive) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.parquet can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + 
not_parquet_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_parquet").strpath + pd.DataFrame(old_data, columns=["a", "b", "c"]).to_parquet(not_parquet_path, index=False) + # The upload should fail when the file is not a parquet + reasons = dn._upload(not_parquet_path, upload_checker=check_data_is_positive) + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_parquet has invalid data for data node "{dn.id}"' + ) + + wrong_format_parquet_path = tmpdir_factory.mktemp("data").join("wrong_format_df.parquet").strpath + pd.DataFrame(np.array([[-1, 2, 3], [-4, -5, -6]]), columns=["a", "b", "c"]).to_parquet( + wrong_format_parquet_path, index=False + ) + # The upload should fail when check_data_is_positive() return False + reasons = dn._upload(wrong_format_parquet_path, upload_checker=check_data_is_positive) + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.parquet has invalid data for data node "{dn.id}"' + ) + + np.array_equal(dn.read(), old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_parquet_path # The path of the dn should not change + + # The upload should succeed when check_data_is_positive() return True + assert dn._upload(new_parquet_path, upload_checker=check_data_is_positive) diff --git a/tests/core/data/test_pickle_data_node.py b/tests/core/data/test_pickle_data_node.py index 15e35a8ea2..f76ae24710 100644 --- a/tests/core/data/test_pickle_data_node.py +++ b/tests/core/data/test_pickle_data_node.py @@ -11,11 +11,14 @@ import os import pathlib -from datetime import datetime +import pickle +from datetime import datetime, timedelta from time import sleep +import freezegun import pandas as pd import pytest +from pandas.testing import assert_frame_equal from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -201,3 +204,77 @@ def test_migrate_to_new_path(self, tmp_path): assert ".data" not in dn.path assert os.path.exists(dn.path) + + def test_get_download_path(self): + path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.p") + dn = PickleDataNode("foo", Scope.SCENARIO, properties={"path": path}) + assert dn._get_downloadable_path() == path + + def test_get_download_path_with_not_existed_file(self): + dn = PickleDataNode("foo", Scope.SCENARIO, properties={"path": "NOT_EXISTED.p"}) + assert dn._get_downloadable_path() == "" + + def test_upload(self, pickle_file_path, tmpdir_factory): + old_pickle_path = tmpdir_factory.mktemp("data").join("df.p").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) + + dn = PickleDataNode("foo", Scope.SCENARIO, properties={"path": old_pickle_path}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + upload_content = pd.read_pickle(pickle_file_path) + + with freezegun.freeze_time(old_last_edit_date + timedelta(seconds=1)): + dn._upload(pickle_file_path) + + assert_frame_equal(dn.read(), upload_content) # The content of the dn should change to the uploaded content + assert dn.last_edit_date > old_last_edit_date + assert dn.path == old_pickle_path # The path of the dn should not change + + def test_upload_with_upload_check(self, pickle_file_path, tmpdir_factory): + old_pickle_path = tmpdir_factory.mktemp("data").join("df.p").strpath + old_data = pd.DataFrame([{"a": 0, "b": 1, "c": 2}, {"a": 3, "b": 4, "c": 5}]) 
+ + dn = PickleDataNode("foo", Scope.SCENARIO, properties={"path": old_pickle_path}) + dn.write(old_data) + old_last_edit_date = dn.last_edit_date + + def check_data_column(upload_path, upload_data): + return upload_path.endswith(".p") and upload_data.columns.tolist() == ["a", "b", "c"] + + not_exists_json_path = tmpdir_factory.mktemp("data").join("not_exists.json").strpath + reasons = dn._upload(not_exists_json_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) == "The uploaded file not_exists.json can not be read," + f' therefore is not a valid data file for data node "{dn.id}"' + ) + + not_pickle_path = tmpdir_factory.mktemp("data").join("wrong_format_df.not_pickle").strpath + with open(str(not_pickle_path), "wb") as f: + pickle.dump(pd.DataFrame([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}]), f) + # The upload should fail when the file is not a pickle + reasons = dn._upload(not_pickle_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.not_pickle has invalid data for data node "{dn.id}"' + ) + + wrong_format_pickle_path = tmpdir_factory.mktemp("data").join("wrong_format_df.p").strpath + with open(str(wrong_format_pickle_path), "wb") as f: + pickle.dump(pd.DataFrame([{"a": 1, "b": 2, "d": 3}, {"a": 4, "b": 5, "d": 6}]), f) + # The upload should fail when check_data_column() return False + reasons = dn._upload(wrong_format_pickle_path, upload_checker=check_data_column) + assert bool(reasons) is False + assert ( + str(list(reasons._reasons[dn.id])[0]) + == f'The uploaded file wrong_format_df.p has invalid data for data node "{dn.id}"' + ) + + assert_frame_equal(dn.read(), old_data) # The content of the dn should not change when upload fails + assert dn.last_edit_date == old_last_edit_date # The last edit date should not change when upload fails + assert dn.path == old_pickle_path # The path of the dn should not change + + # The upload should succeed when check_data_column() return True + assert dn._upload(pickle_file_path, upload_checker=check_data_column) diff --git a/tests/core/data/test_read_excel_data_node.py b/tests/core/data/test_read_excel_data_node.py index 4efe509765..420f069bc3 100644 --- a/tests/core/data/test_read_excel_data_node.py +++ b/tests/core/data/test_read_excel_data_node.py @@ -58,6 +58,7 @@ def __init__(self, id, integer, text): excel_file_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx") sheet_names = ["Sheet1", "Sheet2"] custom_class_dict = {"Sheet1": MyCustomObject1, "Sheet2": MyCustomObject2} +custom_pandas_numpy_exposed_type_dict = {"Sheet1": "pandas", "Sheet2": "numpy"} def test_raise_no_data_with_header(): @@ -400,7 +401,7 @@ def test_read_multi_sheet_with_header_single_custom_exposed_type(): assert row_custom_no_sheet_name.text == row_custom.text -def test_read_multi_sheet_with_header_multiple_custom_exposed_type(): +def test_read_multi_sheet_with_header_multiple_custom_object_exposed_type(): data_pandas = pd.read_excel(excel_file_path, sheet_name=sheet_names) # With sheet name @@ -461,6 +462,48 @@ def test_read_multi_sheet_with_header_multiple_custom_exposed_type(): assert row_custom_no_sheet_name.text == row_custom.text +def test_read_multi_sheet_with_header_multiple_custom_pandas_numpy_exposed_type(): + # With sheet name + excel_dn_as_pandas_numpy = ExcelDataNode( + "bar", + Scope.SCENARIO, + properties={ + "path": 
excel_file_path, + "sheet_name": sheet_names, + "exposed_type": custom_pandas_numpy_exposed_type_dict, + }, + ) + assert excel_dn_as_pandas_numpy.properties["exposed_type"] == custom_pandas_numpy_exposed_type_dict + multi_data_custom = excel_dn_as_pandas_numpy.read() + assert isinstance(multi_data_custom["Sheet1"], pd.DataFrame) + assert isinstance(multi_data_custom["Sheet2"], np.ndarray) + + excel_dn_as_pandas_numpy = ExcelDataNode( + "bar", + Scope.SCENARIO, + properties={ + "path": excel_file_path, + "sheet_name": sheet_names, + "exposed_type": ["pandas", "numpy"], + }, + ) + assert excel_dn_as_pandas_numpy.properties["exposed_type"] == ["pandas", "numpy"] + multi_data_custom = excel_dn_as_pandas_numpy.read() + assert isinstance(multi_data_custom["Sheet1"], pd.DataFrame) + assert isinstance(multi_data_custom["Sheet2"], np.ndarray) + + # Without sheet name + excel_dn_as_pandas_numpy_no_sheet_name = ExcelDataNode( + "bar", + Scope.SCENARIO, + properties={"path": excel_file_path, "exposed_type": custom_pandas_numpy_exposed_type_dict}, + ) + assert excel_dn_as_pandas_numpy_no_sheet_name.properties["exposed_type"] == custom_pandas_numpy_exposed_type_dict + multi_data_custom_no_sheet_name = excel_dn_as_pandas_numpy_no_sheet_name.read() + assert isinstance(multi_data_custom_no_sheet_name["Sheet1"], pd.DataFrame) + assert isinstance(multi_data_custom_no_sheet_name["Sheet2"], np.ndarray) + + def test_read_multi_sheet_without_header_pandas(): # With sheet name excel_data_node_as_pandas = ExcelDataNode( @@ -525,7 +568,7 @@ def test_read_multi_sheet_without_header_numpy(): assert np.array_equal(data_numpy[key], data_numpy_no_sheet_name[key]) -def test_read_multi_sheet_without_header_single_custom_exposed_type(): +def test_read_multi_sheet_without_header_single_custom_object_exposed_type(): data_pandas = pd.read_excel(excel_file_path, header=None, sheet_name=sheet_names) # With sheet name @@ -579,7 +622,7 @@ def test_read_multi_sheet_without_header_single_custom_exposed_type(): assert row_custom_no_sheet_name.text == row_custom.text -def test_read_multi_sheet_without_header_multiple_custom_exposed_type(): +def test_read_multi_sheet_without_header_multiple_custom_object_exposed_type(): data_pandas = pd.read_excel(excel_file_path, header=None, sheet_name=sheet_names) # With sheet names @@ -643,3 +686,51 @@ def test_read_multi_sheet_without_header_multiple_custom_exposed_type(): assert row_custom_no_sheet_name.id == row_custom.id assert row_custom_no_sheet_name.integer == row_custom.integer assert row_custom_no_sheet_name.text == row_custom.text + + +def test_read_multi_sheet_without_header_multiple_custom_pandas_numpy_exposed_type(): + # With sheet names + excel_dn_as_pandas_numpy = ExcelDataNode( + "bar", + Scope.SCENARIO, + properties={ + "path": excel_file_path, + "sheet_name": sheet_names, + "exposed_type": custom_pandas_numpy_exposed_type_dict, + "has_header": False, + }, + ) + assert excel_dn_as_pandas_numpy.properties["exposed_type"] == custom_pandas_numpy_exposed_type_dict + multi_data_custom = excel_dn_as_pandas_numpy.read() + assert isinstance(multi_data_custom["Sheet1"], pd.DataFrame) + assert isinstance(multi_data_custom["Sheet2"], np.ndarray) + + excel_dn_as_pandas_numpy = ExcelDataNode( + "bar", + Scope.SCENARIO, + properties={ + "path": excel_file_path, + "sheet_name": sheet_names, + "exposed_type": ["pandas", "numpy"], + "has_header": False, + }, + ) + assert excel_dn_as_pandas_numpy.properties["exposed_type"] == ["pandas", "numpy"] + multi_data_custom = excel_dn_as_pandas_numpy.read() 
+    assert isinstance(multi_data_custom["Sheet1"], pd.DataFrame)
+    assert isinstance(multi_data_custom["Sheet2"], np.ndarray)
+
+    # Without sheet names
+    excel_dn_as_pandas_numpy_no_sheet_name = ExcelDataNode(
+        "bar",
+        Scope.SCENARIO,
+        properties={
+            "path": excel_file_path,
+            "has_header": False,
+            "exposed_type": custom_pandas_numpy_exposed_type_dict,
+        },
+    )
+    assert excel_dn_as_pandas_numpy_no_sheet_name.properties["exposed_type"] == custom_pandas_numpy_exposed_type_dict
+    multi_data_custom_no_sheet_name = excel_dn_as_pandas_numpy_no_sheet_name.read()
+    assert isinstance(multi_data_custom_no_sheet_name["Sheet1"], pd.DataFrame)
+    assert isinstance(multi_data_custom_no_sheet_name["Sheet2"], np.ndarray)
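
For reference, a minimal usage sketch (not part of the diff) of the upload and download helpers introduced in _FileDataNodeMixin, mirroring the added CSV tests. The node id "dataset", the file names, and the check_columns checker are illustrative only.

    import pandas as pd

    from taipy.config.common.scope import Scope
    from taipy.core.data.csv import CSVDataNode

    # A CSV-backed data node with some initial content, as in the tests above.
    dn = CSVDataNode("dataset", Scope.SCENARIO, properties={"path": "dataset.csv", "exposed_type": "pandas"})
    dn.write(pd.DataFrame([{"a": 0, "b": 1, "c": 2}]))

    # A candidate file a user wants to upload into the data node.
    pd.DataFrame([{"a": 3, "b": 4, "c": 5}]).to_csv("candidate.csv", index=False)

    def check_columns(file_name, data):
        # Accept only .csv files whose columns are exactly a, b, c.
        return file_name.endswith(".csv") and data.columns.tolist() == ["a", "b", "c"]

    reasons = dn._upload("candidate.csv", upload_checker=check_columns)
    if reasons:  # an empty ReasonCollection is truthy: the upload was accepted
        print(dn.read())                    # the data node now holds the uploaded content
        print(dn._get_downloadable_path())  # local path that can be served for download
    else:
        print(reasons._reasons[dn.id])      # the reasons the file was rejected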