From 479e19375bf590307987cb13455bb3962b3a7282 Mon Sep 17 00:00:00 2001 From: Sangjoon Bob Lee Date: Wed, 19 Jun 2024 00:03:49 -0400 Subject: [PATCH] Implement CIF Ensemble preprocessing print --- example.ipynb | 40 +++++++++------- src/cifkit/preprocessors/error.py | 66 ++++++++++++++------------ tests/core/preprocessors/test_error.py | 14 +++--- 3 files changed, 67 insertions(+), 53 deletions(-) diff --git a/example.ipynb b/example.ipynb index 0ee49e6..eee2cf9 100644 --- a/example.ipynb +++ b/example.ipynb @@ -6,30 +6,38 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2024-06-18 12:45:47,573 - INFO - Preprocessing tests/data/cif/folder/300169.cif\n", - "2024-06-18 12:45:47,583 - INFO - Parsing .cif file and generating a supercell\n", - "2024-06-18 12:45:47,594 - INFO - Computing atomic environments in the supercell\n", - "2024-06-18 12:45:47,756 - INFO - Preprocessing tests/data/cif/folder/300171.cif\n", - "2024-06-18 12:45:47,758 - INFO - Parsing .cif file and generating a supercell\n", - "2024-06-18 12:45:47,763 - INFO - Computing atomic environments in the supercell\n", - "2024-06-18 12:45:47,877 - INFO - Preprocessing tests/data/cif/folder/300170.cif\n", - "2024-06-18 12:45:47,878 - INFO - Parsing .cif file and generating a supercell\n", - "2024-06-18 12:45:47,882 - INFO - Computing atomic environments in the supercell\n" + "\n", + "CIF Preprocessing has begun...\n", + "\n", + "Preprocessing tests/data/cif/ensemble_test/300169.cif (1/6)\n", + "Preprocessing tests/data/cif/ensemble_test/260171.cif (2/6)\n", + "Preprocessing tests/data/cif/ensemble_test/250697.cif (3/6)\n", + "Preprocessing tests/data/cif/ensemble_test/250709.cif (4/6)\n", + "Preprocessing tests/data/cif/ensemble_test/300171.cif (5/6)\n", + "Preprocessing tests/data/cif/ensemble_test/300170.cif (6/6)\n", + "\n", + "File movement summary:\n", + "The returned loop tags do not match the expected tags.: 0 files moved\n", + "Wrong number of values in loop _atom_site_*: 0 files moved\n", + "The file contains duplicate atom site labels.: 0 files moved\n", + "The element was not correctly parsed from the site label.: 0 files moved\n", + "An error occurred while processing symmetry operation: 0 files moved\n", + "Missing atomic coordinates: 0 files moved\n", + "other_error: 0 files moved\n" ] } ], "source": [ - "from cifkit import Cif\n", + "from cifkit import Cif, CifEnsemble\n", "from cifkit.utils import folder\n", "\n", "# Example usage\n", - "file_paths = folder.get_file_path_list(\"tests/data/cif/folder\")\n", "\n", - "for file_path in file_paths:\n", - " cif = Cif(file_path, logging_enabled=True)\n", + "cif_ensemble = CifEnsemble(\"tests/data/cif/ensemble_test\")\n", + "\n", "\n", " # Print output\n", " # print(\"Formula:\", cif.formula)\n", @@ -39,8 +47,8 @@ " # print(\"Space group:\", cif.space_group_name)\n", "\n", " # Set the cut-off radius\n", - " cut_off_radius = 4\n", - " cif.compute_connections(cut_off_radius)\n", + " # cut_off_radius = 4\n", + " # cif.compute_connections(cut_off_radius)\n", " # print(cif.bond_fraction_CN)\n", " # print(cif.bond_counts_CN)\n", " # print(cif.all_bond_pairs)\n", diff --git a/src/cifkit/preprocessors/error.py b/src/cifkit/preprocessors/error.py index bc708eb..8775fa1 100644 --- a/src/cifkit/preprocessors/error.py +++ b/src/cifkit/preprocessors/error.py @@ -21,43 +21,49 @@ def move_files_based_on_errors(dir_path): # Dictionary to hold directory paths for each error type error_directories = { - CifParserError.SYMMETRY_OPERATION_ERROR: dir_path / "bad_op", - CifParserError.WRONG_LOOP_VALUE_COUNT: dir_path / "wrong_loop_value", - CifParserError.MISSING_COORDINATES: dir_path / "bad_coords", - CifParserError.INVALID_PARSED_ELEMENT: dir_path / "invalid_label", - CifParserError.DUPLICATE_LABELS: dir_path / "duplicate_labels", - "other_error": dir_path / "other_error", + "error_operations": dir_path / "error_operations", + "error_duplicate_labels": dir_path / "error_duplicate_labels", + "error_wrong_loop_value": dir_path / "error_wrong_loop_value", + "error_coords": dir_path / "error_coords", + "error_invalid_label": dir_path / "error_invalid_label", + "error_others": dir_path / "error_others", } # Ensure all direct - num_files = {key.value: 0 for key in CifParserError} - num_files["other_error"] = 0 - + file_paths = list(dir_path.glob("*.cif")) + num_files_moved = {key: 0 for key in error_directories.keys()} file_paths = folder.get_file_paths(str(dir_path)) - for file_path in file_paths: + for i, file_path in enumerate(file_paths, start=1): filename = os.path.basename(file_path) - moved = False - + print(f"Preprocessing {file_path} ({i}/{len(file_paths)})") try: Cif(file_path) except Exception as e: error_message = str(e) - for error, message in CifParserError.__members__.items(): - if message.value in error_message: - - make_directory_and_move( - file_path, error_directories[message], filename - ) - num_files[message.value] += 1 - moved = True - break - - if not moved: - make_directory_and_move( - file_path, error_directories["other_error"], filename - ) - num_files["other_error"] += 1 - - print(f"File {filename} moved due to error: {error_message}") - print(num_files) + # Example of handling specific errors, adjust as needed + if "symmetry operation" in error_message: + error_type = "error_operations" + elif "contains duplicate atom site labels" in error_message: + error_type = "error_duplicate_labels" + elif "Wrong number of values in loop" in error_message: + error_type = "error_wrong_loop_value" + elif "missing atomic coordinates" in error_message: + error_type = "error_coords" + elif "incorrectly parsed element" in error_message: + error_type = "error_invalid_label" + else: + error_type = "error_others" + + make_directory_and_move( + file_path, error_directories[error_type], filename + ) + num_files_moved[error_type] += 1 + print( + f"File {filename} moved to '{error_type}' due to: {error_message}" + ) + + # Display the number of files moved to each folder + print("\nSUMMARY") + for error_type, count in num_files_moved.items(): + print(f"# of files moved to '{error_type}' folder: {count}") diff --git a/tests/core/preprocessors/test_error.py b/tests/core/preprocessors/test_error.py index e101918..1575f54 100644 --- a/tests/core/preprocessors/test_error.py +++ b/tests/core/preprocessors/test_error.py @@ -21,16 +21,16 @@ def test_move_files_based_on_errors(tmp_path): # Define expected directories expected_dirs = { - "duplicate_labels": tmp_dir / "duplicate_labels", - "wrong_loop_value": tmp_dir / "wrong_loop_value", - "invalid_label": tmp_dir / "invalid_label", - "other_error": tmp_dir / "other_error", + "error_duplicate_labels": tmp_dir / "error_duplicate_labels", + "error_wrong_loop_value": tmp_dir / "error_wrong_loop_value", + "error_invalid_label": tmp_dir / "error_invalid_label", + "error_others": tmp_dir / "error_others", } # Run the function with the paths in the temporary directory move_files_based_on_errors(str(tmp_dir)) # Assert the number of files in each directory - assert get_file_count(expected_dirs["duplicate_labels"]) == 1 - assert get_file_count(expected_dirs["wrong_loop_value"]) == 1 - # assert get_file_count(expected_dirs["other_error"]) == 1 + assert get_file_count(expected_dirs["error_wrong_loop_value"]) == 1 + assert get_file_count(expected_dirs["error_duplicate_labels"]) == 1 + assert get_file_count(expected_dirs["error_duplicate_labels"]) == 1