From c0caa08b562aa8293317208e6acc4e46b82e0811 Mon Sep 17 00:00:00 2001 From: Han Lin Mai Date: Fri, 14 Jun 2024 23:04:26 +0200 Subject: [PATCH] black formatting + fixed tests --- actual_usage/VaspReconvergeExample.py | 14 +- actual_usage/build_vasp_database.py | 73 +- actual_usage/compression.py | 14 +- actual_usage/summarise_vasp_database.py | 44 +- actual_usage/update_vasp_db.py | 51 +- setup.py | 34 +- tests/test_generic.py | 370 ++++-- tests/test_vasp.py | 14 +- utils/GNN_calculators/mace.py | 69 +- utils/StructureManipulator/cleave.py | 190 ++- utils/StructureManipulator/interstitial.py | 155 ++- utils/ace_descriptor_utils.py | 298 ++--- utils/analysis_functions.py | 1033 +++++++++++------ utils/chargemol.py | 575 +++++---- utils/custom_custodian_handlers.py | 32 +- utils/functions.py | 425 ++++--- utils/generic.py | 316 +++-- utils/jobfile.py | 95 +- .../CustodianScripts/template_BASE.py | 12 +- .../CustodianScripts/template_DRS.py | 87 +- .../CustodianScripts/template_SDRS.py | 122 +- .../CustodianScripts/template_SDRS_KPOINTS.py | 121 +- .../CustodianScripts/template_Static.py | 34 +- utils/parallel.py | 18 +- utils/periodic_table.py | 486 +++++--- utils/plotters/grid_plots.py | 66 +- utils/plotters/structure_plots.py | 108 +- utils/structure_featuriser.py | 121 +- utils/training_data_nequip.py | 41 +- utils/vasp/database.py | 661 +++++++---- utils/vasp/job.py | 189 +-- utils/vasp/parser/outcar.py | 76 +- utils/vasp/parser/output.py | 240 ++-- utils/vasp/resubmitter.py | 205 +++- .../vasp/vasp_potential_training_database.py | 1 - 35 files changed, 4171 insertions(+), 2219 deletions(-) diff --git a/actual_usage/VaspReconvergeExample.py b/actual_usage/VaspReconvergeExample.py index a2b53ff..9367903 100644 --- a/actual_usage/VaspReconvergeExample.py +++ b/actual_usage/VaspReconvergeExample.py @@ -1,10 +1,12 @@ from utils.vasp.vasp_resubmitter import CalculationConverger import os -vasp_resubmitter = CalculationConverger(parent_dir=os.getcwd(), - script_template_dir="/home/hmai/CustodianJobfiles", - max_submissions = 1000, - submission_command = "sbatch", - username="hmai") +vasp_resubmitter = CalculationConverger( + parent_dir=os.getcwd(), + script_template_dir="/home/hmai/CustodianJobfiles", + max_submissions=1000, + submission_command="sbatch", + username="hmai", +) -vasp_resubmitter.reconverge_all() \ No newline at end of file +vasp_resubmitter.reconverge_all() diff --git a/actual_usage/build_vasp_database.py b/actual_usage/build_vasp_database.py index c666fc8..76511d8 100644 --- a/actual_usage/build_vasp_database.py +++ b/actual_usage/build_vasp_database.py @@ -3,21 +3,43 @@ import warnings from multiprocessing import cpu_count + def main(): warnings.filterwarnings("ignore") - + # Initialize argument parser - parser = argparse.ArgumentParser(description='Find and compress directories based on specified criteria.') - parser.add_argument('directory', metavar='DIR', type=str, help='the directory to operate on') - parser.add_argument('--extract', action='store_true', help='Extract directories during database generation') - parser.add_argument('--max_dir_count', type=int, help='Maximum directory count for database generation') - parser.add_argument('--read_all_runs_in_dir', action='store_true', default=False, help='Read all runs in directory') - parser.add_argument('--read_error_runs_in_dir', action='store_true', default=False, help='Read directories with errors') + parser = argparse.ArgumentParser( + description="Find and compress directories based on specified criteria." 
+ ) + parser.add_argument( + "directory", metavar="DIR", type=str, help="the directory to operate on" + ) + parser.add_argument( + "--extract", + action="store_true", + help="Extract directories during database generation", + ) + parser.add_argument( + "--max_dir_count", + type=int, + help="Maximum directory count for database generation", + ) + parser.add_argument( + "--read_all_runs_in_dir", + action="store_true", + default=False, + help="Read all runs in directory", + ) + parser.add_argument( + "--read_error_runs_in_dir", + action="store_true", + default=False, + help="Read directories with errors", + ) args = parser.parse_args() - datagen = DatabaseGenerator(args.directory, - max_workers=cpu_count()) - + datagen = DatabaseGenerator(args.directory, max_workers=cpu_count()) + # Check if max_dir_count is provided as an argument if args.max_dir_count is not None: max_dir_count = args.max_dir_count @@ -25,18 +47,21 @@ def main(): max_dir_count = 2000 # Default value # Call the build_database function with the updated parameters - df = datagen.build_database(extract_directories=args.extract, - read_multiple_runs_in_dir=args.read_all_runs_in_dir, - read_error_dirs=args.read_error_runs_in_dir, - max_dir_count=max_dir_count, - tarball_extensions=(".tar.gz", ".tar.bz2"), - cleanup=False, - keep_filenames_after_cleanup=[], - keep_filename_patterns_after_cleanup=[], - filenames_to_qualify=["OUTCAR"],#, "vasprun.xml"], - all_present=True, - df_filename=None, - df_compression=True) - -if __name__ == '__main__': + df = datagen.build_database( + extract_directories=args.extract, + read_multiple_runs_in_dir=args.read_all_runs_in_dir, + read_error_dirs=args.read_error_runs_in_dir, + max_dir_count=max_dir_count, + tarball_extensions=(".tar.gz", ".tar.bz2"), + cleanup=False, + keep_filenames_after_cleanup=[], + keep_filename_patterns_after_cleanup=[], + filenames_to_qualify=["OUTCAR"], # , "vasprun.xml"], + all_present=True, + df_filename=None, + df_compression=True, + ) + + +if __name__ == "__main__": main() diff --git a/actual_usage/compression.py b/actual_usage/compression.py index 0f1399d..9c8f64d 100644 --- a/actual_usage/compression.py +++ b/actual_usage/compression.py @@ -2,9 +2,14 @@ from utils.generic import find_and_compress_directories_parallel import os + def main(): - parser = argparse.ArgumentParser(description='Find and compress directories based on specified criteria.') - parser.add_argument('directory', metavar='DIR', type=str, help='the directory to operate on') + parser = argparse.ArgumentParser( + description="Find and compress directories based on specified criteria." 
+ ) + parser.add_argument( + "directory", metavar="DIR", type=str, help="the directory to operate on" + ) args = parser.parse_args() find_and_compress_directories_parallel( @@ -16,8 +21,9 @@ def main(): files=[], file_patterns=[], print_msg=True, - inside_dir=True + inside_dir=True, ) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/actual_usage/summarise_vasp_database.py b/actual_usage/summarise_vasp_database.py index 4f4090b..10acbb7 100644 --- a/actual_usage/summarise_vasp_database.py +++ b/actual_usage/summarise_vasp_database.py @@ -2,20 +2,23 @@ import argparse import pandas as pd + def analyze_vasp_database(folder_path, output_compression=False): # Initialize paths for both potential database files - database_file_pkl = os.path.join(folder_path, 'vasp_database.pkl') - database_file_gz = os.path.join(folder_path, 'vasp_database.pkl.gz') + database_file_pkl = os.path.join(folder_path, "vasp_database.pkl") + database_file_gz = os.path.join(folder_path, "vasp_database.pkl.gz") # Determine which file exists and set the appropriate path and compression option if os.path.exists(database_file_gz): database_file = database_file_gz - compression_option = 'gzip' + compression_option = "gzip" elif os.path.exists(database_file_pkl): database_file = database_file_pkl compression_option = None else: - print("Error: neither 'vasp_database.pkl' nor 'vasp_database.pkl.gz' found in the specified folder.") + print( + "Error: neither 'vasp_database.pkl' nor 'vasp_database.pkl.gz' found in the specified folder." + ) return # Load the database into a DataFrame with or without compression @@ -29,20 +32,41 @@ def analyze_vasp_database(folder_path, output_compression=False): converged_jobs = df[df["convergence"] == True] # Determine compression option for output based on the user input - output_compression_option = 'gzip' if output_compression else None + output_compression_option = "gzip" if output_compression else None # Write the failed_jobs and converged_jobs DataFrames to separate pickle files with optional compression - failed_jobs.to_pickle(os.path.join(folder_path, 'failed_jobs.pkl.gz' if output_compression else 'failed_jobs.pkl'), compression=output_compression_option) - converged_jobs.to_pickle(os.path.join(folder_path, 'converged_jobs.pkl.gz' if output_compression else 'converged_jobs.pkl'), compression=output_compression_option) + failed_jobs.to_pickle( + os.path.join( + folder_path, + "failed_jobs.pkl.gz" if output_compression else "failed_jobs.pkl", + ), + compression=output_compression_option, + ) + converged_jobs.to_pickle( + os.path.join( + folder_path, + "converged_jobs.pkl.gz" if output_compression else "converged_jobs.pkl", + ), + compression=output_compression_option, + ) # Print the counts print(f"The number of failed jobs is: {len(failed_jobs)}") print(f"The number of successful jobs is: {len(converged_jobs)}") print(f"The total number of jobs is: {len(df)}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Analyze VASP database") - parser.add_argument("folder_path", type=str, help="Folder path containing 'vasp_database.pkl' or 'vasp_database.pkl.gz'") - parser.add_argument("--output_compression", action="store_true", help="Enable gzip compression for output pkl files") + parser.add_argument( + "folder_path", + type=str, + help="Folder path containing 'vasp_database.pkl' or 'vasp_database.pkl.gz'", + ) + parser.add_argument( + "--output_compression", + action="store_true", + help="Enable gzip compression for output pkl files", + ) 
args = parser.parse_args() - analyze_vasp_database(args.folder_path, args.output_compression) \ No newline at end of file + analyze_vasp_database(args.folder_path, args.output_compression) diff --git a/actual_usage/update_vasp_db.py b/actual_usage/update_vasp_db.py index 1b7aaea..10334b3 100644 --- a/actual_usage/update_vasp_db.py +++ b/actual_usage/update_vasp_db.py @@ -3,20 +3,38 @@ import warnings from multiprocessing import cpu_count + def main(): warnings.filterwarnings("ignore") - + # Initialize argument parser - parser = argparse.ArgumentParser(description='Find and compress directories based on specified criteria.') - parser.add_argument('directory', metavar='DIR', type=str, help='the directory to operate on') - parser.add_argument('--max_dir_count', type=int, help='Maximum directory count for database generation') - parser.add_argument('--read_all_runs_in_dir', action='store_true', default=False, help='Read all runs in directory') - parser.add_argument('--read_error_runs_in_dir', action='store_true', default=False, help='Read directories with errors') + parser = argparse.ArgumentParser( + description="Find and compress directories based on specified criteria." + ) + parser.add_argument( + "directory", metavar="DIR", type=str, help="the directory to operate on" + ) + parser.add_argument( + "--max_dir_count", + type=int, + help="Maximum directory count for database generation", + ) + parser.add_argument( + "--read_all_runs_in_dir", + action="store_true", + default=False, + help="Read all runs in directory", + ) + parser.add_argument( + "--read_error_runs_in_dir", + action="store_true", + default=False, + help="Read directories with errors", + ) args = parser.parse_args() - datagen = DatabaseGenerator(args.directory, - max_workers=cpu_count()) - + datagen = DatabaseGenerator(args.directory, max_workers=cpu_count()) + # Check if max_dir_count is provided as an argument if args.max_dir_count is not None: max_dir_count = args.max_dir_count @@ -24,11 +42,14 @@ def main(): max_dir_count = 2000 # Default value # Call the update_failed_jobs_in_database function with the updated parameters - df = datagen.update_failed_jobs_in_database(df_path=args.directory, - read_error_dirs=args.read_error_runs_in_dir, - read_multiple_runs_in_dir=args.read_all_runs_in_dir, - max_dir_count=max_dir_count, - df_compression=True) + df = datagen.update_failed_jobs_in_database( + df_path=args.directory, + read_error_dirs=args.read_error_runs_in_dir, + read_multiple_runs_in_dir=args.read_all_runs_in_dir, + max_dir_count=max_dir_count, + df_compression=True, + ) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/setup.py b/setup.py index e14f33d..7b443fa 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,25 @@ from setuptools import setup, find_packages setup( - name='utils', - version='0.1', - packages=find_packages(where='utils'), - package_dir={'': 'utils'}, + name="utils", + version="0.1", + packages=find_packages(where="utils"), + package_dir={"": "utils"}, install_requires=[ - 'pandas', - 'numpy', - 'pymatgen', + "pandas", + "numpy", + "pymatgen", ], scripts=[ - 'actual_usage/check_jobdir', - 'actual_usage/memory_check', - 'actual_usage/slurm_list_jobdir', - 'actual_usage/build_and_show_db', - 'actual_usage/compress_here', - 'actual_usage/qstat_slurm', - 'actual_usage/summarise_db', - 'actual_usage/setonix_refresh_mamba', - 'actual_usage/setonix_refresh_mamba', - 'actual_usage/update_failed_jobs_db', + "actual_usage/check_jobdir", + "actual_usage/memory_check", + 
"actual_usage/slurm_list_jobdir", + "actual_usage/build_and_show_db", + "actual_usage/compress_here", + "actual_usage/qstat_slurm", + "actual_usage/summarise_db", + "actual_usage/setonix_refresh_mamba", + "actual_usage/setonix_refresh_mamba", + "actual_usage/update_failed_jobs_db", ], ) diff --git a/tests/test_generic.py b/tests/test_generic.py index adb1b8a..0a71e9a 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -7,21 +7,24 @@ import filecmp # Import the function to be tested -from utils.generic import (chunk_list, - search_line_in_file, - parse_lines, - find_directories_with_files, - extract_tarball, - find_and_extract_tarballs_parallel, - extract_files_from_tarball, - extract_files_from_tarballs_parallel, - find_and_extract_files_from_tarballs_parallel, - compress_directory, - compress_directory_parallel, - cleanup_dir, - compress_and_cleanup, - find_and_compress_directories_parallel, - is_line_in_file) +from utils.generic import ( + chunk_list, + search_line_in_file, + parse_lines, + find_directories_with_files, + extract_tarball, + find_and_extract_tarballs_parallel, + extract_files_from_tarball, + extract_files_from_tarballs_parallel, + find_and_extract_files_from_tarballs_parallel, + compress_directory, + compress_directory_parallel, + cleanup_dir, + compress_and_cleanup, + find_and_compress_directories_parallel, + is_line_in_file, +) + class TestChunkList(unittest.TestCase): def test_chunk_list(self): @@ -48,21 +51,30 @@ def test_chunk_list(self): # Test with a large list and large n value lst5 = list(range(1, 1001)) result5 = chunk_list(lst5, 100) - expected_result5 = [list(range(1, 101)), list(range(101, 201)), list(range(201, 301)), - list(range(301, 401)), list(range(401, 501)), list(range(501, 601)), - list(range(601, 701)), list(range(701, 801)), list(range(801, 901)), - list(range(901, 1001))] + expected_result5 = [ + list(range(1, 101)), + list(range(101, 201)), + list(range(201, 301)), + list(range(301, 401)), + list(range(401, 501)), + list(range(501, 601)), + list(range(601, 701)), + list(range(701, 801)), + list(range(801, 901)), + list(range(901, 1001)), + ] self.assertEqual(result5, expected_result5) + class TestSearchLineInFile(unittest.TestCase): def setUp(self): # Create a temporary file and write some contents for testing self.temp_dir = tempfile.mkdtemp() - self.temp_file = os.path.join(self.temp_dir, 'test_file.txt') - with open(self.temp_file, 'w') as file: - file.write('This is the first line.\n') - file.write('This is the second line.\n') - file.write('This is the third line.\n') + self.temp_file = os.path.join(self.temp_dir, "test_file.txt") + with open(self.temp_file, "w") as file: + file.write("This is the first line.\n") + file.write("This is the second line.\n") + file.write("This is the third line.\n") def tearDown(self): # Remove the temporary directory and files after the test @@ -70,28 +82,31 @@ def tearDown(self): def test_search_line_in_file(self): # Test searching for an existing line in the file - result = search_line_in_file(self.temp_file, 'second line') + result = search_line_in_file(self.temp_file, "second line") self.assertTrue(result) # Test searching for a non-existing line in the file - result = search_line_in_file(self.temp_file, 'fourth line') + result = search_line_in_file(self.temp_file, "fourth line") self.assertFalse(result) def test_search_line_in_file_with_depth(self): # Test searching for a line with a specified depth - result = search_line_in_file(self.temp_file, 'first line', search_depth=2, reverse=True) + 
result = search_line_in_file( + self.temp_file, "first line", search_depth=2, reverse=True + ) self.assertFalse(result) def test_search_line_in_file_reverse(self): # Test searching for a line in reverse order - result = search_line_in_file(self.temp_file, 'third line', reverse=True) + result = search_line_in_file(self.temp_file, "third line", reverse=True) self.assertTrue(result) def test_search_line_in_file_file_not_found(self): # Test handling of file not found scenario - result = search_line_in_file('non_existing_file.txt', 'line') + result = search_line_in_file("non_existing_file.txt", "line") self.assertFalse(result) + class TestParseLines(unittest.TestCase): def test_parse_lines(self): flist = [ @@ -100,14 +115,14 @@ def test_parse_lines(self): "1.0 2.0 3.0\n", "4.0 5.0 6.0\n", "Trigger End line\n", - "Footer line\n" + "Footer line\n", ] trigger_start = "Trigger Start" trigger_end = "Trigger End" result = parse_lines(flist, trigger_start, trigger_end) expected = [["1.0 2.0 3.0\n", "4.0 5.0 6.0\n"]] - + np.testing.assert_array_equal(result, expected) def test_parse_lines_triggers_but_no_data(self): @@ -115,43 +130,36 @@ def test_parse_lines_triggers_but_no_data(self): "Header line\n", "Trigger Start line\n", "Trigger End line\n", - "Footer line\n" + "Footer line\n", ] trigger_start = "Trigger Start" trigger_end = "Trigger End" result = parse_lines(flist, trigger_start, trigger_end) expected = [[]] - + np.testing.assert_array_equal(result, expected) - + def test_parse_lines_no_data(self): - flist = [ - "Header line\n", - "Footer line\n" - ] + flist = ["Header line\n", "Footer line\n"] trigger_start = "Trigger Start" trigger_end = "Trigger End" result = parse_lines(flist, trigger_start, trigger_end) expected = [] - + np.testing.assert_array_equal(result, expected) - + def test_parse_lines_no_endtrigger(self): - flist = [ - "Header line\n", - "Trigger Start", - "1.0 2.0 3.0\n" - ] + flist = ["Header line\n", "Trigger Start", "1.0 2.0 3.0\n"] trigger_start = "Trigger Start" trigger_end = "Trigger End" result = parse_lines(flist, trigger_start, trigger_end) expected = [["1.0 2.0 3.0\n"]] - + np.testing.assert_array_equal(result, expected) - + def test_parse_lines_multiple_blocks(self): flist = [ "Header line\n", @@ -164,19 +172,22 @@ def test_parse_lines_multiple_blocks(self): "7.0 8.0 9.0\n", "10.0 11.0 12.0\n", "Trigger End line\n", - "Footer line\n" + "Footer line\n", ] trigger_start = "Trigger Start" trigger_end = "Trigger End" result = parse_lines(flist, trigger_start, trigger_end) - expected = [["1.0 2.0 3.0\n", "4.0 5.0 6.0\n"], - ["7.0 8.0 9.0\n", "10.0 11.0 12.0\n"]] - + expected = [ + ["1.0 2.0 3.0\n", "4.0 5.0 6.0\n"], + ["7.0 8.0 9.0\n", "10.0 11.0 12.0\n"], + ] + np.testing.assert_array_equal(result, expected) - + + class TestFindDirectoriesWithFiles(unittest.TestCase): - + def setUp(self): self.temp_dir = tempfile.mkdtemp() self.create_test_files() @@ -194,54 +205,71 @@ def create_test_files(self): dir2 = os.path.join(self.temp_dir, "dir2") dir3 = os.path.join(self.temp_dir, "dir3") dir4 = os.path.join(self.temp_dir, "dir4") - + os.makedirs(dir1) os.makedirs(dir2) os.makedirs(dir3) os.makedirs(dir4) - - with open(os.path.join(dir1, "file1.txt"), 'w') as file: + + with open(os.path.join(dir1, "file1.txt"), "w") as file: file.write("This is file 1 in dir 1") - - with open(os.path.join(dir1, "file2.txt"), 'w') as file: + + with open(os.path.join(dir1, "file2.txt"), "w") as file: file.write("This is file 2 in dir 1") - - with open(os.path.join(dir2, "file2.txt"), 'w') as file: + 
+ with open(os.path.join(dir2, "file2.txt"), "w") as file: file.write("This is file 2") - with open(os.path.join(dir3, "file3.txt"), 'w') as file: + with open(os.path.join(dir3, "file3.txt"), "w") as file: file.write("This is file 3") with open(os.path.join(dir4, "file9.txt"), "w") as file: file.write("This is file 9") def test_find_all_files_present(self): - result = find_directories_with_files(self.temp_dir, ["file1.txt", "file2.txt", "file3.txt"], all_present=False) - expected = [os.path.join(self.temp_dir, "dir1"), os.path.join(self.temp_dir, "dir2"), os.path.join(self.temp_dir, "dir3")] + result = find_directories_with_files( + self.temp_dir, ["file1.txt", "file2.txt", "file3.txt"], all_present=False + ) + expected = [ + os.path.join(self.temp_dir, "dir1"), + os.path.join(self.temp_dir, "dir2"), + os.path.join(self.temp_dir, "dir3"), + ] self.assertTrue(set(result) == set(expected)) def test_find_some_files_present(self): - result = find_directories_with_files(self.temp_dir, ["file1.txt", "file2.txt"], all_present=True) + result = find_directories_with_files( + self.temp_dir, ["file1.txt", "file2.txt"], all_present=True + ) expected = [os.path.join(self.temp_dir, "dir1")] self.assertTrue(set(result) == set(expected)) def test_find_any_files_present(self): - result = find_directories_with_files(self.temp_dir, ["file2.txt", "file3.txt"], all_present=False) - expected = [os.path.join(self.temp_dir, "dir1"), os.path.join(self.temp_dir, "dir2"), os.path.join(self.temp_dir, "dir3")] + result = find_directories_with_files( + self.temp_dir, ["file2.txt", "file3.txt"], all_present=False + ) + expected = [ + os.path.join(self.temp_dir, "dir1"), + os.path.join(self.temp_dir, "dir2"), + os.path.join(self.temp_dir, "dir3"), + ] self.assertTrue(set(result) == set(expected)) def test_find_no_files_present(self): - result = find_directories_with_files(self.temp_dir, ["file4.txt", "file5.txt"], all_present=True) + result = find_directories_with_files( + self.temp_dir, ["file4.txt", "file5.txt"], all_present=True + ) expected = [] self.assertTrue(set(result) == set(expected)) + class TestExtractTarball(unittest.TestCase): - + def setUp(self): self.temp_dir = tempfile.mkdtemp() - self.temp_file = os.path.join(self.temp_dir, 'test_file.txt') - with open(self.temp_file, 'w') as file: - file.write('This is the first line.\n') + self.temp_file = os.path.join(self.temp_dir, "test_file.txt") + with open(self.temp_file, "w") as file: + file.write("This is the first line.\n") self.create_test_tarball() def tearDown(self): @@ -264,13 +292,14 @@ def test_extract_tarball(self): extracted_file_path = os.path.join(extraction_path, "dir1", "test_file.txt") self.assertTrue(os.path.exists(extracted_file_path)) + class TestFindAndExtractTarballsParallel(unittest.TestCase): - + def setUp(self): self.temp_dir = tempfile.mkdtemp() - self.temp_file = os.path.join(self.temp_dir, 'test_file.txt') - with open(self.temp_file, 'w') as file: - file.write('This is the first line.\n') + self.temp_file = os.path.join(self.temp_dir, "test_file.txt") + with open(self.temp_file, "w") as file: + file.write("This is the first line.\n") self.create_test_tarballs() def tearDown(self): @@ -301,22 +330,23 @@ def create_test_tarballs(self): test_tarball_path3 = os.path.join(dir2, "test3.tar.bz2") with tarfile.open(test_tarball_path3, "w:bz2") as tar: tar.add(self.temp_file, arcname="test_file3.txt") - + def test_find_and_extract_tarballs_parallel(self): parent_dir = self.temp_dir tarball_extension = ".tar.gz" 
find_and_extract_tarballs_parallel(parent_dir, tarball_extension) - + extracted_file_path1 = os.path.join(self.temp_dir, "dir1", "test_file1.txt") self.assertTrue(os.path.exists(extracted_file_path1)) extracted_file_path2 = os.path.join(self.temp_dir, "dir2", "test_file2.txt") self.assertTrue(os.path.exists(extracted_file_path2)) - + extracted_file_path3 = os.path.join(self.temp_dir, "dir2", "test_file3.txt") self.assertFalse(os.path.exists(extracted_file_path3)) - + + class TestExtractFilesFromTarball(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -334,10 +364,10 @@ def tearDown(self): def create_test_tarball(self): self.test_tarball_path = os.path.join(self.temp_dir, "test.tar.gz") test_file_path = os.path.join(os.path.dirname(__file__), "test_file.txt") - + with open(test_file_path, "w") as file: file.write("This is the content of the test file.") - + with tarfile.open(self.test_tarball_path, "w:gz") as tar: tar.add(test_file_path, arcname="dir1/test_file.txt") @@ -346,18 +376,23 @@ def test_extract_files_from_tarball(self): filenames = ["test_file.txt"] suffix = None - extracted_filepaths = extract_files_from_tarball(tarball_filepath, filenames, suffix) + extracted_filepaths = extract_files_from_tarball( + tarball_filepath, filenames, suffix + ) extracted_file_path = os.path.join(self.temp_dir, "dir1", "test_file.txt") self.assertTrue(os.path.exists(extracted_file_path)) self.assertListEqual(extracted_filepaths, [extracted_file_path]) - + + class TestExtractFilesFromTarballsParallel(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() self.create_test_tarballs() - self.tarball_paths = [os.path.join(self.temp_dir, "dir1", "test1.tar.gz"), - os.path.join(self.temp_dir, "dir2", "test2.tar.gz")] + self.tarball_paths = [ + os.path.join(self.temp_dir, "dir1", "test1.tar.gz"), + os.path.join(self.temp_dir, "dir2", "test2.tar.gz"), + ] def tearDown(self): for root, dirs, files in os.walk(self.temp_dir, topdown=False): @@ -375,7 +410,7 @@ def create_test_tarballs(self): # Create test tarball 1 (.tar.gz) test_tarball_path1 = os.path.join(dir1, "test1.tar.gz") - with open(os.path.join(dir1, "file1.txt"), 'w') as file: + with open(os.path.join(dir1, "file1.txt"), "w") as file: file.write("This is file 1") with tarfile.open(test_tarball_path1, "w:gz") as tar: @@ -383,7 +418,7 @@ def create_test_tarballs(self): # Create test tarball 2 (.tar.gz) test_tarball_path2 = os.path.join(dir2, "test2.tar.gz") - with open(os.path.join(dir2, "file2.txt"), 'w') as file: + with open(os.path.join(dir2, "file2.txt"), "w") as file: file.write("This is file 2") with tarfile.open(test_tarball_path2, "w:gz") as tar: @@ -396,7 +431,9 @@ def test_extract_files_from_tarballs_parallel(self): ] suffix = False - extract_files_from_tarballs_parallel(self.tarball_paths, filenames, suffix=suffix) + extract_files_from_tarballs_parallel( + self.tarball_paths, filenames, suffix=suffix + ) extracted_file_path1 = os.path.join(self.temp_dir, "dir1", "file1.txt") self.assertTrue(os.path.exists(extracted_file_path1)) @@ -413,7 +450,9 @@ def test_extract_files_with_leading_dot(self): extract_files_from_tarballs_parallel(self.tarball_paths, filenames) for i, filename in enumerate(filenames): - extracted_filepath = os.path.join(os.path.dirname(self.tarball_paths[i]), filename[2:]) + extracted_filepath = os.path.join( + os.path.dirname(self.tarball_paths[i]), filename[2:] + ) self.assertTrue(os.path.exists(extracted_filepath)) def test_extract_files_no_suffix(self): @@ -425,9 +464,12 @@ def 
test_extract_files_no_suffix(self): extract_files_from_tarballs_parallel(self.tarball_paths, filenames) for i, filename in enumerate(filenames): - extracted_filepath = os.path.join(os.path.dirname(self.tarball_paths[i]), filename) + extracted_filepath = os.path.join( + os.path.dirname(self.tarball_paths[i]), filename + ) self.assertTrue(os.path.exists(extracted_filepath)) + class TestFindAndExtractFilesFromTarballsParallel(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -449,7 +491,7 @@ def create_test_tarballs(self): # Create test tarball 1 (.tar.gz) test_tarball_path1 = os.path.join(dir1, "test1.tar.gz") - with open(os.path.join(dir1, "file1.txt"), 'w') as file: + with open(os.path.join(dir1, "file1.txt"), "w") as file: file.write("This is file 1") with tarfile.open(test_tarball_path1, "w:gz") as tar: @@ -457,7 +499,7 @@ def create_test_tarballs(self): # Create test tarball 2 (.tar.gz) test_tarball_path2 = os.path.join(dir2, "test2.tar.gz") - with open(os.path.join(dir2, "file2.txt"), 'w') as file: + with open(os.path.join(dir2, "file2.txt"), "w") as file: file.write("This is file 2") with tarfile.open(test_tarball_path2, "w:gz") as tar: @@ -469,14 +511,17 @@ def test_find_and_extract_files_from_tarballs_parallel(self): filenames = ["file1.txt", "file2.txt"] suffix = False - find_and_extract_files_from_tarballs_parallel(parent_dir, extension, filenames, suffix) + find_and_extract_files_from_tarballs_parallel( + parent_dir, extension, filenames, suffix + ) extracted_file_path1 = os.path.join(self.temp_dir, "dir1", "file1.txt") self.assertTrue(os.path.exists(extracted_file_path1)) extracted_file_path2 = os.path.join(self.temp_dir, "dir2", "file2.txt") self.assertTrue(os.path.exists(extracted_file_path2)) - + + class TestCompressDirectory(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -497,11 +542,11 @@ def create_test_directory(self): # Create test files in the directory file1_path = os.path.join(dir_path, "file1.txt") - with open(file1_path, 'w') as file: + with open(file1_path, "w") as file: file.write("This is file 1") file2_path = os.path.join(dir_path, "file2.txt") - with open(file2_path, 'w') as file: + with open(file2_path, "w") as file: file.write("This is file 2") def test_compress_directory(self): @@ -511,7 +556,13 @@ def test_compress_directory(self): print_message = False inside_dir = True - compress_directory(directory_path, exclude_files, exclude_file_patterns, print_message, inside_dir) + compress_directory( + directory_path, + exclude_files, + exclude_file_patterns, + print_message, + inside_dir, + ) compressed_file_path = os.path.join(self.temp_dir, "test_dir/test_dir.tar.gz") self.assertTrue(os.path.exists(compressed_file_path)) @@ -520,7 +571,8 @@ def test_compress_directory(self): file_names = tar.getnames() self.assertTrue(any(name.endswith("file1.txt") for name in file_names)) self.assertFalse(any(name.endswith("file2.txt") for name in file_names)) - + + class TestCompressDirectoryParallel(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -552,14 +604,20 @@ def create_test_directories(self): def test_compress_directory_parallel(self): directory_paths = [ os.path.join(self.temp_dir, "dir1"), - os.path.join(self.temp_dir, "dir2") + os.path.join(self.temp_dir, "dir2"), ] exclude_files = [["file1.txt"], ["file2.txt"]] exclude_file_patterns = [] print_message = [False] inside_dir = [True] - compress_directory_parallel(directory_paths, exclude_files, exclude_file_patterns, print_message, 
inside_dir) + compress_directory_parallel( + directory_paths, + exclude_files, + exclude_file_patterns, + print_message, + inside_dir, + ) compressed_file_path1 = os.path.join(self.temp_dir, "dir1/dir1.tar.gz") self.assertTrue(os.path.exists(compressed_file_path1)) @@ -577,6 +635,7 @@ def test_compress_directory_parallel(self): self.assertFalse(any(name.endswith("file1.txt") for name in file_names)) self.assertTrue(any(name.endswith("file2.txt") for name in file_names)) + class TestCleanupDir(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -608,7 +667,7 @@ def test_cleanup_dir_keep(self): keep = True files = ["file1.txt"] file_patterns = [] - + self.create_test_files() cleanup_dir(directory_path, keep, files, file_patterns) @@ -637,7 +696,8 @@ def test_cleanup_dir_remove(self): file3_path = os.path.join(self.temp_dir, "file3.txt") self.assertTrue(os.path.exists(file3_path)) - + + class TestCompressAndCleanup(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -674,8 +734,16 @@ def test_compress_and_cleanup_keep(self): print_msg = False inside_dir = True - compress_and_cleanup(directory_path, exclude_files_from_tarball, exclude_filepatterns_from_tarball, - keep_after, files, file_patterns, print_msg, inside_dir) + compress_and_cleanup( + directory_path, + exclude_files_from_tarball, + exclude_filepatterns_from_tarball, + keep_after, + files, + file_patterns, + print_msg, + inside_dir, + ) file1_path = os.path.join(self.temp_dir, "file1.txt") self.assertTrue(os.path.exists(file1_path)) @@ -686,7 +754,9 @@ def test_compress_and_cleanup_keep(self): file3_path = os.path.join(self.temp_dir, "file3.txt") self.assertFalse(os.path.exists(file3_path)) - compressed_file_path = os.path.join(self.temp_dir, os.path.basename(self.temp_dir) + ".tar.gz") + compressed_file_path = os.path.join( + self.temp_dir, os.path.basename(self.temp_dir) + ".tar.gz" + ) self.assertTrue(os.path.exists(compressed_file_path)) def test_compress_and_cleanup_remove(self): @@ -699,8 +769,16 @@ def test_compress_and_cleanup_remove(self): print_msg = False inside_dir = True - compress_and_cleanup(directory_path, exclude_files_from_tarball, exclude_filepatterns_from_tarball, - keep_after, files, file_patterns, print_msg, inside_dir) + compress_and_cleanup( + directory_path, + exclude_files_from_tarball, + exclude_filepatterns_from_tarball, + keep_after, + files, + file_patterns, + print_msg, + inside_dir, + ) file1_path = os.path.join(self.temp_dir, "file1.txt") self.assertFalse(os.path.exists(file1_path)) @@ -711,12 +789,15 @@ def test_compress_and_cleanup_remove(self): file3_path = os.path.join(self.temp_dir, "file3.txt") self.assertTrue(os.path.exists(file3_path)) - compressed_file_path = os.path.join(self.temp_dir, os.path.basename(self.temp_dir) + ".tar.gz") + compressed_file_path = os.path.join( + self.temp_dir, os.path.basename(self.temp_dir) + ".tar.gz" + ) self.assertTrue(os.path.exists(compressed_file_path)) # compressed_dir_path = os.path.join(self.temp_dir, "test_dir") # self.assertFalse(os.path.exists(compressed_dir_path)) + class TestCompressDirectoryParallel(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -746,13 +827,22 @@ def create_test_directories(self): file.write("This is file 2") def test_compress_directory_parallel(self): - directory_paths = [os.path.join(self.temp_dir, "dir1"), os.path.join(self.temp_dir, "dir2")] + directory_paths = [ + os.path.join(self.temp_dir, "dir1"), + os.path.join(self.temp_dir, "dir2"), + ] exclude_files 
= [] exclude_file_patterns = [] print_message = False inside_dir = True - compress_directory_parallel(directory_paths, exclude_files, exclude_file_patterns, print_message, inside_dir) + compress_directory_parallel( + directory_paths, + exclude_files, + exclude_file_patterns, + print_message, + inside_dir, + ) compressed_file_path1 = os.path.join(self.temp_dir, "dir1/dir1.tar.gz") self.assertTrue(os.path.exists(compressed_file_path1)) @@ -772,8 +862,17 @@ def test_compress_directory_parallel(self): extracted_file_path2 = os.path.join(self.temp_dir, "dir2", "file2.txt") self.assertTrue(os.path.exists(extracted_file_path2)) - self.assertTrue(filecmp.cmp(extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt"))) - self.assertTrue(filecmp.cmp(extracted_file_path2, os.path.join(self.temp_dir, "dir2", "file2.txt"))) + self.assertTrue( + filecmp.cmp( + extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt") + ) + ) + self.assertTrue( + filecmp.cmp( + extracted_file_path2, os.path.join(self.temp_dir, "dir2", "file2.txt") + ) + ) + class TestFindAndCompressDirectoriesParallel(unittest.TestCase): def setUp(self): @@ -822,10 +921,19 @@ def test_find_and_compress_directories_parallel(self): print_msg = False inside_dir = True all_present = False - - find_and_compress_directories_parallel(parent_dir, valid_dir_if_filenames, all_present, exclude_files_from_tarball, - exclude_filepatterns_from_tarball, keep_after, files, file_patterns, - print_msg, inside_dir) + + find_and_compress_directories_parallel( + parent_dir, + valid_dir_if_filenames, + all_present, + exclude_files_from_tarball, + exclude_filepatterns_from_tarball, + keep_after, + files, + file_patterns, + print_msg, + inside_dir, + ) compressed_file_path1 = os.path.join(self.temp_dir, "dir1/dir1.tar.gz") self.assertTrue(os.path.exists(compressed_file_path1)) @@ -847,17 +955,26 @@ def test_find_and_compress_directories_parallel(self): extracted_file_path2 = os.path.join(self.temp_dir, "dir2", "file2.txt") self.assertTrue(os.path.exists(extracted_file_path2)) - + self.assertFalse(os.path.exists(compressed_file_path3)) - self.assertTrue(filecmp.cmp(extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt"))) - self.assertTrue(filecmp.cmp(extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt"))) - + self.assertTrue( + filecmp.cmp( + extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt") + ) + ) + self.assertTrue( + filecmp.cmp( + extracted_file_path1, os.path.join(self.temp_dir, "dir1", "file1.txt") + ) + ) + + class TestIsLineInFile(unittest.TestCase): def test_exact_match_line_present(self): # Create a temporary file with some lines - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: temp_file.write("Line 1\n") temp_file.write("Line 2\n") temp_file.write("Line 3\n") @@ -880,7 +997,7 @@ def test_exact_match_line_not_present(self): def test_partial_match_line_present(self): # Create a temporary file with some lines - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: temp_file.write("Hello, world!\n") temp_file.write("Goodbye, world!\n") @@ -893,7 +1010,7 @@ def test_partial_match_line_present(self): def test_partial_match_line_not_present(self): # Create a temporary file with some lines - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: + with 
tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: temp_file.write("Hello, world!\n") temp_file.write("Goodbye, world!\n") @@ -903,6 +1020,7 @@ def test_partial_match_line_not_present(self): result = is_line_in_file(filepath, line_to_search, exact_match) self.assertFalse(result) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vasp.py b/tests/test_vasp.py index a1fdb76..07aff0c 100644 --- a/tests/test_vasp.py +++ b/tests/test_vasp.py @@ -3,7 +3,8 @@ import os import shutil -from utils.vasp.database import (find_vasp_directories) +from utils.vasp.database import find_vasp_directories + class TestFindVaspDirectories(unittest.TestCase): def setUp(self): @@ -57,14 +58,16 @@ def test_find_vasp_directories(self): all_present = False extract_tarballs = True - directories = find_vasp_directories(parent_dir, filenames, all_present, extract_tarballs) + directories = find_vasp_directories( + parent_dir, filenames, all_present, extract_tarballs + ) self.assertEqual(len(directories), 2) expected_dirs = ["dir2", "dir3"] for dir_name in expected_dirs: self.assertIn(dir_name, [os.path.basename(dir) for dir in directories]) - + def test_find_vasp_directories_negative(self): # Create a temporary empty directory to test the negative case empty_dir = tempfile.mkdtemp() @@ -75,10 +78,13 @@ def test_find_vasp_directories_negative(self): all_present = True extract_tarballs = True - directories = find_vasp_directories(parent_dir, filenames, all_present, extract_tarballs) + directories = find_vasp_directories( + parent_dir, filenames, all_present, extract_tarballs + ) # Assert that the function returns an empty list as there are no directories that meet the criteria self.assertEqual(len(directories), 0) + if __name__ == "__main__": unittest.main() diff --git a/utils/GNN_calculators/mace.py b/utils/GNN_calculators/mace.py index 6d1918a..f53b6fc 100644 --- a/utils/GNN_calculators/mace.py +++ b/utils/GNN_calculators/mace.py @@ -17,6 +17,7 @@ df = pd.read_pickle("unrel_df.pkl") + def calc_static_CHGNET(structure, chgnet=None): try: if chgnet is None: @@ -27,10 +28,13 @@ def calc_static_CHGNET(structure, chgnet=None): forces = chgnet_pred["f"] magmoms = chgnet_pred["m"] except Exception as e: - print(f"CHGNET evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset") + print( + f"CHGNET evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset" + ) return np.nan, np.nan, np.nan return toten, forces, magmoms + def calc_static_M3GNET(structure, m3gnet=None): try: if m3gnet is None: @@ -40,27 +44,38 @@ def calc_static_M3GNET(structure, m3gnet=None): toten = atoms.get_potential_energy() forces = atoms.get_forces() except Exception as e: - print(f"M3GNET evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset") + print( + f"M3GNET evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset" + ) return np.nan, np.nan, np.nan return toten, forces, np.nan -def calc_static_MACE(structure, MACE="/g/data/v43/Han/mace/mace/calculators/foundations_models/2023-08-14-mace-universal.model", device="cpu", default_dtype="float32"): + +def calc_static_MACE( + structure, + MACE="/g/data/v43/Han/mace/mace/calculators/foundations_models/2023-08-14-mace-universal.model", + device="cpu", + default_dtype="float32", +): try: 
MACE_calculator = MACECalculator( - model_paths=MACE, - device=device, - default_dtype=default_dtype, - ) + model_paths=MACE, + device=device, + default_dtype=default_dtype, + ) atoms = AseAtomsAdaptor().get_atoms(structure) atoms.set_calculator(MACE_calculator) toten = atoms.get_potential_energy() forces = atoms.get_forces() except Exception as e: - print(f"MACE evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset") + print( + f"MACE evaluation failed with exception: {e} \n Probably the element you are trying does not exist in their dataset" + ) return np.nan, np.nan, np.nan return toten, forces, np.nan -def calc_static_GNN(structure, model_type = None, model = None): + +def calc_static_GNN(structure, model_type=None, model=None): if model_type == "mace": toten, forces, magmoms = calc_static_MACE(structure) elif model_type == "m3gnet": @@ -68,11 +83,15 @@ def calc_static_GNN(structure, model_type = None, model = None): elif model_type == "chgnet": toten, forces, magmoms = calc_static_CHGNET(structure, chgnet=model) else: - warnings.warn(f"Specified model {model} is not a valid calculator, returning np.nan") + warnings.warn( + f"Specified model {model} is not a valid calculator, returning np.nan" + ) toten = np.nan forces = np.nan magmoms = np.nan return toten, forces, magmoms + + for model_type in ["chgnet"]: pureGB_toten_lst = [] pureslab_toten_lst = [] @@ -105,16 +124,24 @@ def calc_static_GNN(structure, model_type = None, model = None): # Wrap the outer loop with tqdm to add a progress bar for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Model: {model_type}"): - i+=1 - #if i > 5: + i += 1 + # if i > 5: # break pureGB_start = time.time() # Call the calc_static_GNN function - pureGB_toten, pureGB_f, pureGB_m = calc_static_GNN(row.struct_pureGB, model_type=model_type, model=model) - pureslab_toten, pureslab_f, pureslab_m = calc_static_GNN(row.struct_pureSLAB, model_type=model_type, model=model) - segGB_toten, segGB_f, segGB_m = calc_static_GNN(row.struct_segGB, model_type=model_type, model=model) - solslab_toten, solslab_f, solslab_m = calc_static_GNN(row.struct_solSLAB, model_type=model_type, model=model) + pureGB_toten, pureGB_f, pureGB_m = calc_static_GNN( + row.struct_pureGB, model_type=model_type, model=model + ) + pureslab_toten, pureslab_f, pureslab_m = calc_static_GNN( + row.struct_pureSLAB, model_type=model_type, model=model + ) + segGB_toten, segGB_f, segGB_m = calc_static_GNN( + row.struct_segGB, model_type=model_type, model=model + ) + solslab_toten, solslab_f, solslab_m = calc_static_GNN( + row.struct_solSLAB, model_type=model_type, model=model + ) # Append values to the corresponding lists pureGB_toten_lst.append(pureGB_toten) @@ -133,11 +160,11 @@ def calc_static_GNN(structure, model_type = None, model = None): solslab_m_lst.append(solslab_m) # Calculate the energy of segregation at each step and append to the list - eseg = ( - segGB_toten - pureGB_toten - (solslab_toten - pureslab_toten) - ) + eseg = segGB_toten - pureGB_toten - (solslab_toten - pureslab_toten) eseg_lst.append(eseg) - print(f"{row.job_name}: Eseg = {eseg_lst[-1]}, DFT = {row.E_seg_DFT}, error_Eseg = {eseg_lst[-1] - row.E_seg_DFT}") + print( + f"{row.job_name}: Eseg = {eseg_lst[-1]}, DFT = {row.E_seg_DFT}, error_Eseg = {eseg_lst[-1] - row.E_seg_DFT}" + ) print(f"Row processing time: {time.time() - pureGB_start:.4f} seconds") model_elapsed_time = time.time() - start_time @@ -160,7 +187,7 @@ def calc_static_GNN(structure, model_type = None, 
model = None): df[f"segGB_m_{model_type}"] = segGB_m_lst df[f"solslab_m_{model_type}"] = solslab_m_lst - #df.to_pickle(f"df_{model_type}.pkl") + # df.to_pickle(f"df_{model_type}.pkl") # Attach energy of segregation to the DataFrame with the corresponding suffix df[f"eseg_{model_type}"] = eseg_lst diff --git a/utils/StructureManipulator/cleave.py b/utils/StructureManipulator/cleave.py index 6cc46b5..bb75e5d 100644 --- a/utils/StructureManipulator/cleave.py +++ b/utils/StructureManipulator/cleave.py @@ -11,6 +11,7 @@ # RIPPED FROM MPINTERFACES + def center_slab(structure): """ Centers the atoms in a slab structure around 0.5 @@ -27,6 +28,7 @@ def center_slab(structure): structure.translate_sites(range(len(structure.sites)), translation) return structure + def get_rotation_matrix(axis, theta): """ Find the rotation matrix associated with counterclockwise rotation @@ -43,16 +45,21 @@ def get_rotation_matrix(axis, theta): axis = np.array(list(axis)) axis = axis / np.linalg.norm(axis) - axis *= -np.sin(theta/2.0) - a = np.cos(theta/2.0) + axis *= -np.sin(theta / 2.0) + a = np.cos(theta / 2.0) b, c, d = tuple(axis.tolist()) - aa, bb, cc, dd = a*a, b*b, c*c, d*d - bc, ad, ac, ab, bd, cd = b*c, a*d, a*c, a*b, b*d, c*d - return np.array([[aa+bb-cc-dd, 2*(bc+ad), 2*(bd-ac)], - [2*(bc-ad), aa+cc-bb-dd, 2*(cd+ab)], - [2*(bd+ac), 2*(cd-ab), aa+dd-bb-cc]]) - -def align_axis(structure, axis='c', direction=(0, 0, 1)): + aa, bb, cc, dd = a * a, b * b, c * c, d * d + bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d + return np.array( + [ + [aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], + [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], + [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc], + ] + ) + + +def align_axis(structure, axis="c", direction=(0, 0, 1)): """ Rotates a structure so that the specified axis is along the [001] direction. This is useful for adding vacuum, and @@ -66,23 +73,23 @@ def align_axis(structure, axis='c', direction=(0, 0, 1)): structure. Rotated to align axis along direction. """ - if axis == 'a': + if axis == "a": axis = structure.lattice._matrix[0] - elif axis == 'b': + elif axis == "b": axis = structure.lattice._matrix[1] - elif axis == 'c': + elif axis == "c": axis = structure.lattice._matrix[2] proj_axis = np.cross(axis, direction) - if not(proj_axis[0] == 0 and proj_axis[1] == 0): - theta = ( - np.arccos(np.dot(axis, direction) - / (np.linalg.norm(axis) * np.linalg.norm(direction))) + if not (proj_axis[0] == 0 and proj_axis[1] == 0): + theta = np.arccos( + np.dot(axis, direction) / (np.linalg.norm(axis) * np.linalg.norm(direction)) ) R = get_rotation_matrix(proj_axis, theta) rotation = SymmOp.from_rotation_and_translation(rotation_matrix=R) structure.apply_operation(rotation) return structure + def add_vacuum(structure, vacuum): """ Adds padding to a slab or 2D material. 
@@ -103,21 +110,28 @@ def add_vacuum(structure, vacuum): structure = Structure(lattice_C, species, coords, coords_are_cartesian=True) return center_slab(structure) + def cleave_sites(structure, cleave_line_coord, vacuum_size): - site_list = []; site_list2 = [] + site_list = [] + site_list2 = [] for idx, sites in enumerate(structure): if sites.frac_coords[-1] > cleave_line_coord: - #print(idx) + # print(idx) site_list.append(idx) else: - #print(idx) + # print(idx) site_list2.append(idx) - transformation_shift_up = transform.TranslateSitesTransformation(site_list,(0,0,vacuum_size/2),vector_in_frac_coords=False) - transformation_shift_down = transform.TranslateSitesTransformation(site_list2,(0,0,-vacuum_size/2),vector_in_frac_coords=False) + transformation_shift_up = transform.TranslateSitesTransformation( + site_list, (0, 0, vacuum_size / 2), vector_in_frac_coords=False + ) + transformation_shift_down = transform.TranslateSitesTransformation( + site_list2, (0, 0, -vacuum_size / 2), vector_in_frac_coords=False + ) cleaved_cell = transformation_shift_up.apply_transformation(structure) cleaved_cell = transformation_shift_down.apply_transformation(cleaved_cell) return cleaved_cell + def get_unique_values_in_nth_value(arr_list, n, tolerance): """ Returns unique values in the n-th element of sublists in arr_list within a specified tolerance. @@ -142,6 +156,7 @@ def get_unique_values_in_nth_value(arr_list, n, tolerance): unique_values.append(value) return np.sort(unique_values) + def compute_average_pairs(lst): """ Computes the average of consecutive pairs in the given list. @@ -158,6 +173,7 @@ def compute_average_pairs(lst): averages.append(average) return averages + def get_non_host_ele_idx(structure, host_elements): """ Returns the indices of non-host elements in the structure. @@ -169,10 +185,17 @@ def get_non_host_ele_idx(structure, host_elements): Returns: - list: Indices of non-host elements in the structure. """ - non_host_indices = [i for i, site in enumerate(structure) if site.species_string not in host_elements] + non_host_indices = [ + i + for i, site in enumerate(structure) + if site.species_string not in host_elements + ] return non_host_indices -def get_min_max_cp_coords_solute(structure, host_elements, axis, threshold=5, fractional=True): + +def get_min_max_cp_coords_solute( + structure, host_elements, axis, threshold=5, fractional=True +): """ Returns the minimum and maximum coordinates of solute elements along the specified axis. @@ -192,15 +215,18 @@ def get_min_max_cp_coords_solute(structure, host_elements, axis, threshold=5, fr for site_idx in non_host_indices: coord = structure[site_idx].frac_coords[axis] if max_coord is None or coord > max_coord: - max_coord = (coord + threshold/structure.lattice.abc[axis]) + max_coord = coord + threshold / structure.lattice.abc[axis] if min_coord is None or coord < min_coord: - min_coord = (coord - threshold/structure.lattice.abc[axis]) + min_coord = coord - threshold / structure.lattice.abc[axis] if not fractional: max_coord = max_coord * structure.lattice.abc[axis] min_coord = min_coord * structure.lattice.abc[axis] return [min_coord, max_coord] -def get_cp_coords_solute(structure, host_elements, axis, threshold=5, tolerance=0.01, fractional=True): + +def get_cp_coords_solute( + structure, host_elements, axis, threshold=5, tolerance=0.01, fractional=True +): """ Returns viable coordinates for solute elements within a specified range along the specified axis. 
@@ -215,11 +241,17 @@ def get_cp_coords_solute(structure, host_elements, axis, threshold=5, tolerance= Returns: - list: List of viable coordinates for solute elements. """ - min_max = get_min_max_cp_coords_solute(structure, host_elements, axis, fractional=fractional, threshold=threshold) + min_max = get_min_max_cp_coords_solute( + structure, host_elements, axis, fractional=fractional, threshold=threshold + ) if fractional: - atomic_layers = get_unique_values_in_nth_value(structure.frac_coords, -1, tolerance=tolerance/structure.lattice.abc[axis]) + atomic_layers = get_unique_values_in_nth_value( + structure.frac_coords, -1, tolerance=tolerance / structure.lattice.abc[axis] + ) else: - atomic_layers = get_unique_values_in_nth_value(structure.cart_coords, -1, tolerance=tolerance) + atomic_layers = get_unique_values_in_nth_value( + structure.cart_coords, -1, tolerance=tolerance + ) cp_list = compute_average_pairs(atomic_layers) min_cp_thres = min_max[0] max_cp_thres = min_max[1] @@ -227,14 +259,17 @@ def get_cp_coords_solute(structure, host_elements, axis, threshold=5, tolerance= cp_viable = [cp for cp in cp_list if min_cp_thres <= cp <= max_cp_thres] return cp_viable -def cleave_structure(structure, cleave_line_coord, cleave_vacuum_length, axis, fractional=True): + +def cleave_structure( + structure, cleave_line_coord, cleave_vacuum_length, axis, fractional=True +): """ - Cleaves the structure along a specified coordinate line. - Assumes vacuum is already present! + Cleaves the structure along a specified coordinate line. + Assumes vacuum is already present! If not, please: - - structure = add_vacuum(structure) - + + structure = add_vacuum(structure) + before this! Parameters: @@ -247,47 +282,76 @@ def cleave_structure(structure, cleave_line_coord, cleave_vacuum_length, axis, f Returns: - pymatgen.Structure: Cleaved structure. 
""" - site_list = []; site_list2 = [] + site_list = [] + site_list2 = [] for idx, sites in enumerate(structure): if fractional: - if sites.frac_coords[axis] > cleave_line_coord: + if sites.frac_coords[axis] > cleave_line_coord: site_list.append(idx) else: site_list2.append(idx) else: - if sites.coords[axis] > cleave_line_coord: + if sites.coords[axis] > cleave_line_coord: site_list.append(idx) else: site_list2.append(idx) shift = [0, 0, 0] - shift[axis] = cleave_vacuum_length/2 + shift[axis] = cleave_vacuum_length / 2 shift2 = shift.copy() - shift2[axis] = -cleave_vacuum_length/2 - transformation_shift_up = transform.TranslateSitesTransformation(site_list,tuple(shift),vector_in_frac_coords=False) - transformation_shift_down = transform.TranslateSitesTransformation(site_list2,tuple(shift2),vector_in_frac_coords=False) + shift2[axis] = -cleave_vacuum_length / 2 + transformation_shift_up = transform.TranslateSitesTransformation( + site_list, tuple(shift), vector_in_frac_coords=False + ) + transformation_shift_down = transform.TranslateSitesTransformation( + site_list2, tuple(shift2), vector_in_frac_coords=False + ) cleaved_struct = transformation_shift_up.apply_transformation(structure) cleaved_struct = transformation_shift_down.apply_transformation(cleaved_struct) return cleaved_struct -def cleave_structure_around_solutes(structure, - host_elements, - axis=2, - cleave_vacuum_length=6, - sol_dist_threshold=5, - tolerance=0.01, - add_vacuum_block_length=None): + +def cleave_structure_around_solutes( + structure, + host_elements, + axis=2, + cleave_vacuum_length=6, + sol_dist_threshold=5, + tolerance=0.01, + add_vacuum_block_length=None, +): if add_vacuum_block_length is not None: - structure = add_vacuum(structure,vacuum=add_vacuum_block_length) - cp_coords = get_cp_coords_solute(structure, host_elements=host_elements, axis=axis, threshold=sol_dist_threshold, tolerance=tolerance) + structure = add_vacuum(structure, vacuum=add_vacuum_block_length) + cp_coords = get_cp_coords_solute( + structure, + host_elements=host_elements, + axis=axis, + threshold=sol_dist_threshold, + tolerance=tolerance, + ) cleaved_struct_list = [] for cp in cp_coords: - cleaved_struct = cleave_structure(structure,cleave_line_coord=cp,cleave_vacuum_length=cleave_vacuum_length, axis=axis) + cleaved_struct = cleave_structure( + structure, + cleave_line_coord=cp, + cleave_vacuum_length=cleave_vacuum_length, + axis=axis, + ) cleaved_struct_list.append(cleaved_struct) return cleaved_struct_list -def cleave_structures_around_site(structure, site_index, axis=2, cleave_vacuum_length=6, site_dist_threshold=5, tolerance=0.01, add_vacuum_block_length=None, fractional=True): + +def cleave_structures_around_site( + structure, + site_index, + axis=2, + cleave_vacuum_length=6, + site_dist_threshold=5, + tolerance=0.01, + add_vacuum_block_length=None, + fractional=True, +): """ - Cleaves a structure around a specified site. Assumes vacuum is already present! + Cleaves a structure around a specified site. Assumes vacuum is already present! If not, add vacuum before this. 
Parameters: @@ -306,11 +370,15 @@ def cleave_structures_around_site(structure, site_index, axis=2, cleave_vacuum_l if add_vacuum_block_length is not None: structure = add_vacuum(structure, vacuum=add_vacuum_block_length) - site_coord = structure[site_index].frac_coords[axis] if fractional else structure[site_index].coords[axis] + site_coord = ( + structure[site_index].frac_coords[axis] + if fractional + else structure[site_index].coords[axis] + ) # Determine the range of coordinates around the specified site - min_coord = site_coord - site_dist_threshold/structure.lattice.abc[axis] - max_coord = site_coord + site_dist_threshold/structure.lattice.abc[axis] + min_coord = site_coord - site_dist_threshold / structure.lattice.abc[axis] + max_coord = site_coord + site_dist_threshold / structure.lattice.abc[axis] # Get unique values in the specified axis within the tolerance coords = structure.frac_coords if fractional else structure.cart_coords @@ -322,7 +390,13 @@ def cleave_structures_around_site(structure, site_index, axis=2, cleave_vacuum_l cleaved_struct_list = [] for cp in cp_viable: - cleaved_struct = cleave_structure(structure, cleave_line_coord=cp, cleave_vacuum_length=cleave_vacuum_length, axis=axis, fractional=fractional) + cleaved_struct = cleave_structure( + structure, + cleave_line_coord=cp, + cleave_vacuum_length=cleave_vacuum_length, + axis=axis, + fractional=fractional, + ) cleaved_struct_list.append(cleaved_struct) return cleaved_struct_list diff --git a/utils/StructureManipulator/interstitial.py b/utils/StructureManipulator/interstitial.py index 4b3d6ba..4a42fd0 100644 --- a/utils/StructureManipulator/interstitial.py +++ b/utils/StructureManipulator/interstitial.py @@ -23,17 +23,20 @@ import scipy.optimize as optimization import json + # Reload packages when they change (mostly for custom modules) -from IPython.lib.deepreload import reload -%load_ext autoreload -%autoreload 2 +# from IPython.lib.deepreload import reload +# %load_ext autoreload +# %autoreload 2 import warnings + warnings.filterwarnings("ignore") + # pyscal version <= 2.10.15 def get_all_vertices(sys): - ''' + """ Calculate all Voronoi vertices Parameters @@ -44,16 +47,17 @@ def get_all_vertices(sys): ------- all_vertices_raw: list of floats list of all Voronoi vertices - ''' - sys.find_neighbors(method='voronoi') + """ + sys.find_neighbors(method="voronoi") all_vertices_raw = [] for atom in sys.iter_atoms(): for v in atom.vertex_positions: all_vertices_raw.append(v) return all_vertices_raw + def get_octahedral_positions(sys_in, alat): - ''' + """ Get all octahedral vertex positions Parameters @@ -67,33 +71,34 @@ def get_octahedral_positions(sys_in, alat): ------- octahedral_at: list of floats position of octahedral voids - ''' + """ octahedral_at = [] real_pos = np.array([np.array(atox.pos) for atox in sys_in.iter_atoms()]) atoms = sys_in.get_all_atoms() box = sys_in.box count = 0 for i in range(len(atoms)): - for j in range(i+1, len(atoms)): + for j in range(i + 1, len(atoms)): dist = sys_in.get_distance(atoms[i], atoms[j]) - if np.abs(dist-alat) < 1E-2: + if np.abs(dist - alat) < 1e-2: count += 1 - npos = (np.array(atoms[i].pos)+np.array(atoms[j].pos))/2 + npos = (np.array(atoms[i].pos) + np.array(atoms[j].pos)) / 2 if 0 <= npos[0] <= box[0][0]: if 0 <= npos[1] <= box[1][1]: if 0 <= npos[2] <= box[2][2]: - #print(np.abs(np.sum(npos-real_pos))) - #print(npos) + # print(np.abs(np.sum(npos-real_pos))) + # print(npos) found = False for rpos in real_pos: - if np.sum(np.abs(npos-rpos)) < 1E-5: + if 
np.sum(np.abs(npos - rpos)) < 1e-5: found = True if not found: octahedral_at.append(npos) return octahedral_at + def add_sphereatoms(sys, all_vertices, max_type): - ''' + """ Add ghost atoms at vertex positions Parameters @@ -111,16 +116,17 @@ def add_sphereatoms(sys, all_vertices, max_type): ------- sys: pyscal System object - ''' + """ new_atoms = [] for vertex in all_vertices: - atom=pc.Atom(pos=vertex, type=max_type+1) + atom = pc.Atom(pos=vertex, type=max_type + 1) new_atoms.append(atom) sys.add_atoms(new_atoms) return sys + def get_ra(sys, natoms, pf): - ''' + """ Calculate radius ra Parameters @@ -137,15 +143,16 @@ def get_ra(sys, natoms, pf): ------- ra: float Calculated ra - ''' + """ box = sys.box vol = np.dot(np.cross(box[0], box[1]), box[2]) - volatom = vol/natoms - ra = ((pf*volatom)/((4/3)*np.pi))**(1/3) + volatom = vol / natoms + ra = ((pf * volatom) / ((4 / 3) * np.pi)) ** (1 / 3) return ra + def get_rvv(sys, max_type, ra): - ''' + """ Calculate rvv for each atom Parameters @@ -157,42 +164,47 @@ def get_rvv(sys, max_type, ra): ra: float calculated ra value - ''' + """ rlist = [] atoms = sys.atoms for atom in atoms: - if atom.type == max_type+1: - #collect ”real” neighbors - nns = [x for x in atom.neighbors if atoms[x].type<=max_type] - #get the distances + if atom.type == max_type + 1: + # collect ”real” neighbors + nns = [x for x in atom.neighbors if atoms[x].type <= max_type] + # get the distances dists = [sys.get_distance(atom, atoms[n]) for n in nns] - #get minimum distance + # get minimum distance Rvv = min(dists) - rvv = (Rvv-ra)/ra + rvv = (Rvv - ra) / ra atom.cutoff = rvv rlist.append(rvv) - return rlist,atoms + return rlist, atoms + -def get_interstitial_structure(input_file, output_file = "poscar.vasp", alat = 2.84, pf = 0.68): +def get_interstitial_structure( + input_file, output_file="poscar.vasp", alat=2.84, pf=0.68 +): # Input parameters - ''' + """ pf = 0.68 # Packing factor of the input crystal lattice alat = 2.84 # Lattice constant in Angstroms example usage: iGB_struct = get_interstitial_structure("tempGB.vasp", output_file = "GB.vasp", alat = 2.84, pf = 0.68) struct_list, struct_all_studied_sites = get_int_struct_list(GB_struct_list[i], midpoint=midpoints[i]) - ''' + """ # Read input from CONTCAR file sys_in = pc.System() sys_out = pc.System() - sys_in.read_inputfile(input_file, format='poscar') - sys_out.read_inputfile(input_file, format='poscar') + sys_in.read_inputfile(input_file, format="poscar") + sys_out.read_inputfile(input_file, format="poscar") # Find all Voronoi vertices and obtain unique ones with a precision of 2 decimal points all_vertices_raw = get_all_vertices(sys_in) - all_vertices = np.unique((np.array(all_vertices_raw)*100).astype(int)/100, axis=0) + all_vertices = np.unique( + (np.array(all_vertices_raw) * 100).astype(int) / 100, axis=0 + ) # Get all octahedral positions octahedral_at = get_octahedral_positions(sys_in, alat) @@ -202,24 +214,24 @@ def get_interstitial_structure(input_file, output_file = "poscar.vasp", alat = 2 natoms = sys_in.natoms max_type = len(conc.keys()) - #Combine vertices and octahedral sites + # Combine vertices and octahedral sites combined_list = np.concatenate((all_vertices, octahedral_at)) - #add ghost atoms at vertex positions - sys_out = add_sphereatoms(sys_out,combined_list,max_type) + # add ghost atoms at vertex positions + sys_out = add_sphereatoms(sys_out, combined_list, max_type) - #calculate ra + # calculate ra ra = get_ra(sys_out, natoms, pf) - #Ghost atoms are used in pyscal to compensate for the 
small number of total real atoms - #The remap_atoms method removes these ghost atoms, including: - #(i) remapping atoms back to the simulation box, - #(ii) remove the pyscal inbuilt ghost atoms, given by atom.id > total atoms, - #(iii) remove atoms that are too close to each other - the distance tolerance can be set using ‘dtol‘ + # Ghost atoms are used in pyscal to compensate for the small number of total real atoms + # The remap_atoms method removes these ghost atoms, including: + # (i) remapping atoms back to the simulation box, + # (ii) remove the pyscal inbuilt ghost atoms, given by atom.id > total atoms, + # (iii) remove atoms that are too close to each other - the distance tolerance can be set using ‘dtol‘ nx = sys_out.remap_atoms(dtol=0.4, remove_images=False) - sys_out.to_file(output_file, format = 'poscar') - + sys_out.to_file(output_file, format="poscar") + struct = Structure.from_file(filename=output_file) # Define a mapping from the current species to the desired species species_mapping = {Element("H"): Element("Fe"), Element("He"): Element("H")} @@ -230,42 +242,57 @@ def get_interstitial_structure(input_file, output_file = "poscar.vasp", alat = 2 return struct # Get radius of VV sphere, rvv # Calculate neighbors again + + # sys_out.find_neighbors(method="cutoff",cutoff=alat) # # Output void ratios (rvv/ra) and count # rlist,atoms = get_rvv(sys_out,max_type,ra) # void_ratios, void_count = np.unique(np.round(rlist, decimals=1), return_counts=True) -def get_int_struct_list(structure, zlims = [], host_elements=["Fe"], within_GB_distance=3, midpoint = 0.50945): - - int_id = [i for i, site in enumerate(structure) if site.species_string not in host_elements] - GB_id = [i for i, site in enumerate(structure) if site.species_string in host_elements] - + +def get_int_struct_list( + structure, zlims=[], host_elements=["Fe"], within_GB_distance=3, midpoint=0.50945 +): + + int_id = [ + i + for i, site in enumerate(structure) + if site.species_string not in host_elements + ] + GB_id = [ + i for i, site in enumerate(structure) if site.species_string in host_elements + ] + GB_struct = structure.copy() GB_struct.remove_sites(int_id) - + z_frac_coords = [site.frac_coords[-1] for site in GB_struct] zlims = [min(z_frac_coords), max(z_frac_coords)] - + only_intsites_struct = structure.copy() only_intsites_struct.remove_sites(GB_id) - only_intsites_struct.merge_sites(tol = 0.35, mode = "a") + only_intsites_struct.merge_sites(tol=0.35, mode="a") only_intsites_struct.sort(lambda x: x.frac_coords[-1]) - - int_fcoords = [site.frac_coords for site in only_intsites_struct if site.species_string == "H"] - # Get the ones we are interested in computing + int_fcoords = [ + site.frac_coords for site in only_intsites_struct if site.species_string == "H" + ] + + # Get the ones we are interested in computing struct_list = [] struct_all_studied_sites = GB_struct.copy() - + for int_sites in int_fcoords: compute_struct = GB_struct.copy() - if int_sites[-1] > min(zlims)\ - and int_sites[-1] < max(zlims)\ - and abs(int_sites[-1] - midpoint) * structure.lattice.c < within_GB_distance: - # and int_sites[-1] <= midpoint+0.01: + if ( + int_sites[-1] > min(zlims) + and int_sites[-1] < max(zlims) + and abs(int_sites[-1] - midpoint) * structure.lattice.c < within_GB_distance + ): + # and int_sites[-1] <= midpoint+0.01: compute_struct.append("H", int_sites) struct_list.append(compute_struct) struct_all_studied_sites.append("H", int_sites) - + return struct_list, struct_all_studied_sites diff --git 
a/utils/ace_descriptor_utils.py b/utils/ace_descriptor_utils.py index 08ea53a..b7061cc 100644 --- a/utils/ace_descriptor_utils.py +++ b/utils/ace_descriptor_utils.py @@ -1,10 +1,7 @@ import string from random import sample import math -from itertools import ( - starmap, - combinations -) +from itertools import starmap, combinations import numpy as np import scipy.stats as ss @@ -24,46 +21,54 @@ from pyace.atomicenvironment import aseatoms_to_atomicenvironment from tqdm.auto import tqdm -def make_ace(rmax, number_of_functions=250, element='H', **kwargs): + +def make_ace(rmax, number_of_functions=250, element="H", **kwargs): pot_conf = { - 'deltaSplineBins': 0.001, - 'elements': [element], - 'embeddings': {'ALL': {'drho_core_cut': 250, - 'fs_parameters': [1, 1], - 'ndensity': 1, - 'npot': 'FinnisSinclair', - 'rho_core_cut': 200000}, - }, - 'bonds': { - 'ALL': {'NameOfCutoffFunction': 'cos', - 'core-repulsion': [10000.0, 5.0], - 'dcut': 0.01, - 'radbase': 'ChebPow', - # 'nradbase': 10, - 'radparameters': [2.0], - 'rcut': 1.1 * rmax}, + "deltaSplineBins": 0.001, + "elements": [element], + "embeddings": { + "ALL": { + "drho_core_cut": 250, + "fs_parameters": [1, 1], + "ndensity": 1, + "npot": "FinnisSinclair", + "rho_core_cut": 200000, + }, + }, + "bonds": { + "ALL": { + "NameOfCutoffFunction": "cos", + "core-repulsion": [10000.0, 5.0], + "dcut": 0.01, + "radbase": "ChebPow", + # 'nradbase': 10, + "radparameters": [2.0], + "rcut": 1.1 * rmax, + }, + }, + "functions": { + "number_of_functions_per_element": number_of_functions, + "UNARY": + # simple default from Yury + { + "nradmax_by_orders": [15, 6, 4, 3, 2, 2], + "lmax_by_orders": [0, 3, 3, 2, 2, 1], + }, + # {'nradmax_by_orders': [ 10, 5, 3, 2, ], + # 'lmax_by_orders': [ 0 , 3, 3, 1, ]} }, - 'functions': { - 'number_of_functions_per_element': number_of_functions, - 'UNARY': - # simple default from Yury - { 'nradmax_by_orders': [ 15, 6, 4, 3, 2, 2 ], - 'lmax_by_orders': [ 0 , 3, 3, 2, 2, 1 ]} - # {'nradmax_by_orders': [ 10, 5, 3, 2, ], - # 'lmax_by_orders': [ 0 , 3, 3, 1, ]} - } } calc = pyace.PyACECalculator( - pyace.create_multispecies_basis_config(pot_conf), - **kwargs + pyace.create_multispecies_basis_config(pot_conf), **kwargs ) return calc + def get_ace_descr(calc, structure, max_params=None, copy=True, overwrite_type=True): if copy: structure = structure.copy() if overwrite_type: - structure[:] = 'H' + structure[:] = "H" structure.calc = calc structure.get_potential_energy() @@ -79,12 +84,10 @@ def get_ace_descr(calc, structure, max_params=None, copy=True, overwrite_type=Tr # axis=1 # ) if max_params is not None and max_params < min(descr.shape): - descr = PCA( - whiten=True, - n_components=max_params - ).fit_transform(descr) + descr = PCA(whiten=True, n_components=max_params).fit_transform(descr) return descr + def suggest_sites(structure, num_sites, mask=None): if mask is None: mask = np.ones(len(structure), dtype=bool) @@ -107,7 +110,7 @@ def suggest_sites(structure, num_sites, mask=None): # pick the largest peak as the mode mode = x[p[si.peak_prominences(k.pdf(x), p)[0].argmax()]] # sort all atoms by their deviation from the mode (ie. 
bulk atoms) - SA = np.argsort(abs(pca-mode)) + SA = np.argsort(abs(pca - mode)) # the mask needs to be sorted in the same way, then we pick num_sites # atoms that are furthest from the mode sites = SA[mask[SA]][-num_sites:] @@ -116,6 +119,7 @@ def suggest_sites(structure, num_sites, mask=None): else: raise ValueError("Lazy developer error!") + def plot_sites(structure, sites): I = np.zeros(len(structure)) for i in sites: @@ -123,8 +127,10 @@ def plot_sites(structure, sites): return structure.plot3d(scalar_field=I) + ### SPACE routines + def space(calc, structure, pure_descr, indices, per_atom=False): """ Calculate the SPACE descriptors from a given unary ACE model. @@ -143,8 +149,12 @@ def space(calc, structure, pure_descr, indices, per_atom=False): """ single = pure_descr[indices].reshape(len(indices), -1) - solutes = ASEAtoms(['H']*len(indices), structure.positions[indices], - cell=structure.cell, pbc=structure.pbc) + solutes = ASEAtoms( + ["H"] * len(indices), + structure.positions[indices], + cell=structure.cell, + pbc=structure.pbc, + ) calc.ace.compute(aseatoms_to_atomicenvironment(solutes)) projections = np.asarray(calc.ace.projections) inter = projections @@ -154,98 +164,102 @@ def space(calc, structure, pure_descr, indices, per_atom=False): else: return full + def calc_space_descriptors( - structure, segregations, calc, pure_descr=None, per_atom=False, - tqdm_enabled=True + structure, segregations, calc, pure_descr=None, per_atom=False, tqdm_enabled=True ): if pure_descr is None: - pure_descr = get_ace_descr( - calc, - structure, - max_params=None - ) + pure_descr = get_ace_descr(calc, structure, max_params=None) descr_shape = (pure_descr.shape[1] + len(calc.basis.basis_coeffs),) # Check For structure descriptor array - info = segregations.has_array('descriptors') - if info and info['shape'] != descr_shape: - del segregations._per_chunk_arrays['descriptors'] - info = segregations.has_array('descriptors') + info = segregations.has_array("descriptors") + if info and info["shape"] != descr_shape: + del segregations._per_chunk_arrays["descriptors"] + info = segregations.has_array("descriptors") if not info: - segregations.add_array('descriptors', - shape=descr_shape, - per='chunk', - fill=np.nan + segregations.add_array( + "descriptors", shape=descr_shape, per="chunk", fill=np.nan ) # Check For atom descriptor array if per_atom: - info = segregations.has_array('atomic_descriptors') - if info and info['shape'] != descr_shape: - del segregations._per_chunk_arrays['atomic_descriptors'] - info = segregations.has_array('atomic_descriptors') + info = segregations.has_array("atomic_descriptors") + if info and info["shape"] != descr_shape: + del segregations._per_chunk_arrays["atomic_descriptors"] + info = segregations.has_array("atomic_descriptors") if not info: - segregations.add_array('atomic_descriptors', - shape=descr_shape, - per='element', - fill=np.nan + segregations.add_array( + "atomic_descriptors", shape=descr_shape, per="element", fill=np.nan ) - for i in tqdm(range(len(segregations)), - desc='SPACE', disable=not tqdm_enabled): - if np.isnan(segregations['descriptors', i]).any(): - descr = space(calc, structure, pure_descr, segregations['indices', i], - per_atom=per_atom) + for i in tqdm(range(len(segregations)), desc="SPACE", disable=not tqdm_enabled): + if np.isnan(segregations["descriptors", i]).any(): + descr = space( + calc, + structure, + pure_descr, + segregations["indices", i], + per_atom=per_atom, + ) if not per_atom: - segregations['descriptors', i] = descr + 
segregations["descriptors", i] = descr else: - segregations['atomic_descriptors', i] = descr - segregations['descriptors', i] = descr.sum(axis=0) + segregations["atomic_descriptors", i] = descr + segregations["descriptors", i] = descr.sum(axis=0) + def reduce_sites( - structure, - segregations, - ace, - cluster_threshold=1e-4, cluster=True, check_cluster=True + structure, + segregations, + ace, + cluster_threshold=1e-4, + cluster=True, + check_cluster=True, ): """ Find and filter equivalent segregation patterns with ACE. """ calc_space_descriptors(structure, segregations, ace) - descr = segregations['descriptors'] + descr = segregations["descriptors"] if cluster: _, unique, inverse, counts = np.unique( - DBSCAN(min_samples=1, eps=cluster_threshold).fit_predict( - StandardScaler().fit_transform(descr) - ), - return_index=True, return_inverse=True, return_counts=True + DBSCAN(min_samples=1, eps=cluster_threshold).fit_predict( + StandardScaler().fit_transform(descr) + ), + return_index=True, + return_inverse=True, + return_counts=True, ) inverse = unique[inverse] if check_cluster: for R in unique: - D = descr[inverse==R] - assert np.abs(D-D[0]).mean() < 1e-4 + D = descr[inverse == R] + assert np.abs(D - D[0]).mean() < 1e-4 # assert np.allclose(D, D[0], atol=1e-5) else: _, unique, inverse, counts = np.unique( - descr.round(7), # get rid of floating point noise - # StandardScaler().fit_transform(descr).round( - # -int(np.ceil(np.log10(cluster_threshold))) - # ), - axis=0, - return_index=True, return_inverse=True, return_counts=True + descr.round(7), # get rid of floating point noise + # StandardScaler().fit_transform(descr).round( + # -int(np.ceil(np.log10(cluster_threshold))) + # ), + axis=0, + return_index=True, + return_inverse=True, + return_counts=True, ) inverse = unique[inverse] return unique, inverse, counts -def fit_space(df, D, E='excess', LM=Ridge, plot=True): - df = df.query('n_sites>0') - df['index'] = df['index'].astype(int) - SI, I = np.unique( df['index'], return_index=True ) + +def fit_space(df, D, E="excess", LM=Ridge, plot=True): + df = df.query("n_sites>0") + df["index"] = df["index"].astype(int) + SI, I = np.unique(df["index"], return_index=True) Dr = D[SI] Er = df[E].iloc[I] lm = LM(fit_intercept=False) @@ -255,19 +269,22 @@ def fit_space(df, D, E='excess', LM=Ridge, plot=True): if len(Ep) < 500: plt.scatter(Er, Ep) else: - plt.hexbin(Er, Ep, bins='log') + plt.hexbin(Er, Ep, bins="log") plt.gca().set_aspect(1) - plt.plot([Er.min()]*2, [Er.max()]*2, 'r-') - rmse = np.sqrt( np.mean((Er - Ep)**2) ) + plt.plot([Er.min()] * 2, [Er.max()] * 2, "r-") + rmse = np.sqrt(np.mean((Er - Ep) ** 2)) return lm, rmse, np.abs(Er - Ep).max() + ### Sampling routines + def random_combination(pool, r): n = len(pool) indices = sorted(sample(range(n), r)) return tuple(pool[i] for i in indices) + def n_random_combinations(iterable, r, n): pool = tuple(iterable) if n >= math.comb(len(pool), r): @@ -276,104 +293,101 @@ def n_random_combinations(iterable, r, n): for _ in range(n): yield random_combination(pool, r) + def make_individual_segregation(seg, name, indices, **kwargs): seg.add_chunk( - len(indices), - identifier=name, - indices=indices, - n_sites=len(indices), - **kwargs + len(indices), identifier=name, indices=indices, n_sites=len(indices), **kwargs ) -def add_segregations(seg, all_sites, max_sites, cache=None, tqdm_enabled=True, **kwargs): + +def add_segregations( + seg, all_sites, max_sites, cache=None, tqdm_enabled=True, **kwargs +): num_sites = len(all_sites) # distribute n_sites 
evenly, but take into account that we have added # n=1 & n=full already by default - max_per_n_sites = { - i: max_sites // (num_sites - 2) - for i in range(2, num_sites) - } - for i in range(2, num_sites//2 + 1): + max_per_n_sites = {i: max_sites // (num_sites - 2) for i in range(2, num_sites)} + for i in range(2, num_sites // 2 + 1): # can't add more structures than permutationally possible nmax = math.comb(num_sites, i) navg = max_per_n_sites[i] if navg > nmax: max_per_n_sites[i] = nmax max_per_n_sites[num_sites - i] = nmax - for j in range(i+1, num_sites): - max_per_n_sites[j] += 2*(navg - nmax)//(num_sites-i) + for j in range(i + 1, num_sites): + max_per_n_sites[j] += 2 * (navg - nmax) // (num_sites - i) if cache is None: if len(seg) > 0: - cache = set(seg['identifier']) + cache = set(seg["identifier"]) else: cache = set() - for o in tqdm(range(2, num_sites), desc='Order', disable=not tqdm_enabled): - for names, indices in starmap(zip, - n_random_combinations(all_sites.items(), o, max_per_n_sites[i]) + for o in tqdm(range(2, num_sites), desc="Order", disable=not tqdm_enabled): + for names, indices in starmap( + zip, n_random_combinations(all_sites.items(), o, max_per_n_sites[i]) ): - sites = '|'.join(names) + sites = "|".join(names) if sites not in cache: - make_individual_segregation( - seg, sites, indices, **kwargs - ) + make_individual_segregation(seg, sites, indices, **kwargs) cache.add(sites) return cache + ### Analysis routines -def get_excess_energies(df, E='[E]N', cname='coverage'): +def get_excess_energies(df, E="[E]N", cname="coverage"): c = df[cname] / df[cname].max() cmin = df[cname].min() cmax = df[cname].max() - e0 = df.query(f'{cname}==@cmin')[E].min() - e1 = df.query(f'{cname}==@cmax')[E].min() - df['excess'] = df[E] - (1-c)*e0 - e1 * c + e0 = df.query(f"{cname}==@cmin")[E].min() + e1 = df.query(f"{cname}==@cmax")[E].min() + df["excess"] = df[E] - (1 - c) * e0 - e1 * c - ch = ConvexHull(df[[cname, 'excess']].to_numpy()) - df['stable'] = False - df['stable'].iloc[ - df.iloc[ np.unique(ch.simplices) - ].query('excess<=0').index] = True + ch = ConvexHull(df[[cname, "excess"]].to_numpy()) + df["stable"] = False + df["stable"].iloc[df.iloc[np.unique(ch.simplices)].query("excess<=0").index] = True # S = df.query('stable').sites # makes sure that degenerate sites of the ones found by CH are also # marked stable, not needed after we move this to analyze # df.stable.iloc[df.query('original.isin(@S)').index] = True - chex = sint.interp1d(*df.query('stable')[[cname, 'excess']].to_numpy().T) - df['energy_above_hull'] = df.excess - df[cname].map(chex) + chex = sint.interp1d(*df.query("stable")[[cname, "excess"]].to_numpy().T) + df["energy_above_hull"] = df.excess - df[cname].map(chex) # better version of the paragraph above - df.loc[df.energy_above_hull==0].stable = True + df.loc[df.energy_above_hull == 0].stable = True return df -def plot_excess_energies(df, cname='n_sites'): - sns.violinplot( - data=df, - x=cname, y='excess', - cut=0 - ) +def plot_excess_energies(df, cname="n_sites"): + + sns.violinplot(data=df, x=cname, y="excess", cut=0) sns.lineplot( - data=df.query('stable'), marker='o', color='k', - x=cname, y='excess', zorder=1, + data=df.query("stable"), + marker="o", + color="k", + x=cname, + y="excess", + zorder=1, ) - return df -def plot_energies_above_hull(df, temperature_units=False, cname='n_sites'): + +def plot_energies_above_hull(df, temperature_units=False, cname="n_sites"): E = df.energy_above_hull.to_numpy() if temperature_units: E /= 8.6e-5 sns.scatterplot( - 
data=df, alpha=.5, - x=cname, y=E, - hue='stable', #size='degeneracy' + data=df, + alpha=0.5, + x=cname, + y=E, + hue="stable", # size='degeneracy' ) - return df \ No newline at end of file + return df diff --git a/utils/analysis_functions.py b/utils/analysis_functions.py index a8dc244..ccedcb7 100644 --- a/utils/analysis_functions.py +++ b/utils/analysis_functions.py @@ -52,47 +52,70 @@ job_path_standard_Gadi = "%s\\jobfile-conv-Gadi" % job_script_folder job_path_standard_Magnus = "%s\\jobfile-conv-Magnus" % job_script_folder job_path_standard_Setonix = "%s/jobfile-conv-Setonix" % job_script_folder -job_path_DoubleRelaxation_DDEC6_Setonix = os.path.join(job_script_folder, "jobfile-Setonix-DoubleRelaxation-DDEC6") -job_path_DoubleRelaxation_DDEC6_Gadi = os.path.join(job_script_folder, "jobfile-Gadi-DoubleRelaxation-DDEC6") -job_path_DoubleRelaxation_DDEC6_Gadi_GPU = os.path.join(job_script_folder, "jobfile-Gadi-DoubleRelaxation-DDEC6-GPU") -job_path_StaticImage_DDEC6_Setonix = os.path.join(job_script_folder, "jobfile-Setonix-StaticImage-DDEC6") -job_path_StaticImage_DDEC6_Gadi = os.path.join(job_script_folder, "jobfile-Gadi-StaticImage-DDEC6") +job_path_DoubleRelaxation_DDEC6_Setonix = os.path.join( + job_script_folder, "jobfile-Setonix-DoubleRelaxation-DDEC6" +) +job_path_DoubleRelaxation_DDEC6_Gadi = os.path.join( + job_script_folder, "jobfile-Gadi-DoubleRelaxation-DDEC6" +) +job_path_DoubleRelaxation_DDEC6_Gadi_GPU = os.path.join( + job_script_folder, "jobfile-Gadi-DoubleRelaxation-DDEC6-GPU" +) +job_path_StaticImage_DDEC6_Setonix = os.path.join( + job_script_folder, "jobfile-Setonix-StaticImage-DDEC6" +) +job_path_StaticImage_DDEC6_Gadi = os.path.join( + job_script_folder, "jobfile-Gadi-StaticImage-DDEC6" +) VASP_job_INCAR_path = "%s\\INCAR" % job_script_folder VASP_job_INCAR_DDEC6_path = "%s\\INCAR-DDEC6" % job_script_folder -GB_Kpoint_rough_dict = {"S11-RA110-S3-32": func.KPOINTS([3, 3, 1], [0, 0, 0]), - "S3-RA110-S1-11": func.KPOINTS([4, 2, 1], [0, 0, 0]), - "S3-RA110-S1-12": func.KPOINTS([4, 3, 1], [0, 0, 0]), - "S5-RA001-S210": func.KPOINTS([3, 3, 1], [0, 0, 0]), - "S5-RA001-S310": func.KPOINTS([3, 2, 1], [0, 0, 0]), - "S9-RA110-S2-21": func.KPOINTS([3, 4, 1], [0, 0, 0])} +GB_Kpoint_rough_dict = { + "S11-RA110-S3-32": func.KPOINTS([3, 3, 1], [0, 0, 0]), + "S3-RA110-S1-11": func.KPOINTS([4, 2, 1], [0, 0, 0]), + "S3-RA110-S1-12": func.KPOINTS([4, 3, 1], [0, 0, 0]), + "S5-RA001-S210": func.KPOINTS([3, 3, 1], [0, 0, 0]), + "S5-RA001-S310": func.KPOINTS([3, 2, 1], [0, 0, 0]), + "S9-RA110-S2-21": func.KPOINTS([3, 4, 1], [0, 0, 0]), +} # KPOINT NUMBER CALCULATED: S3-1 6, S3-2 7, S9 7, S5-2 5, S5-3 4, S11 7 -GB_Kpoint_fine_dict = {"S11-RA110-S3-32": func.KPOINTS([6, 6, 1], [0, 0, 0]), - "S3-RA110-S1-11": func.KPOINTS([6, 3, 1], [0, 0, 0]), - "S3-RA110-S1-12": func.KPOINTS([6, 6, 1], [0, 0, 0]), - "S5-RA001-S210": func.KPOINTS([4, 4, 1], [0, 0, 0]), - "S5-RA001-S310": func.KPOINTS([4, 3, 1], [0, 0, 0]), - "S9-RA110-S2-21": func.KPOINTS([4, 6, 1], [0, 0, 0])} - -list_GB = ["S11-RA110-S3-32","S3-RA110-S1-11", "S3-RA110-S1-12", "S9-RA110-S2-21"] +GB_Kpoint_fine_dict = { + "S11-RA110-S3-32": func.KPOINTS([6, 6, 1], [0, 0, 0]), + "S3-RA110-S1-11": func.KPOINTS([6, 3, 1], [0, 0, 0]), + "S3-RA110-S1-12": func.KPOINTS([6, 6, 1], [0, 0, 0]), + "S5-RA001-S210": func.KPOINTS([4, 4, 1], [0, 0, 0]), + "S5-RA001-S310": func.KPOINTS([4, 3, 1], [0, 0, 0]), + "S9-RA110-S2-21": func.KPOINTS([4, 6, 1], [0, 0, 0]), +} + +list_GB = ["S11-RA110-S3-32", "S3-RA110-S1-11", "S3-RA110-S1-12", "S9-RA110-S2-21"] 
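# Example lookups for the k-point settings defined above (illustrative comments only):
#   GB_Kpoint_rough_dict["S3-RA110-S1-11"] -> func.KPOINTS([4, 2, 1], [0, 0, 0])
#   GB_Kpoint_fine_dict["S3-RA110-S1-11"]  -> func.KPOINTS([6, 3, 1], [0, 0, 0])
# list_GB (above) collects the four GBs analysed in this module; list_element (below)
# holds the solute elements considered.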
list_element = ["P", "Ti", "V", "Cr", "Mn", "Co", "Ni", "Cu", "Nb", "Mo", "W"] + + def get_immediate_subdirectories(a_dir): - return [f.path for f in os.scandir(a_dir) if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints"] + return [ + f.path + for f in os.scandir(a_dir) + if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints" + ] + class DataPaths: - def __init__(self, - DataPath = "C:\\Users\\liger\\OneDrive - The University of Sydney (Staff)\\FeGBProject-Data", - Fpath_Krough = "%s\\P-X-Krough" % DataPath, - Fpath_Kfine = "%s\\P-X-Krefined" % DataPath, - Seg1_path = "%s\\Segregation_1sol" % DataPath, - Seg2_path = "%s\\Segregation_2sol" % DataPath, - Wsep1_rigid_path = "%s\\RGS-1sol" % DataPath, - Wsep1_rel_path = "%s\\Wsep_relaxed-1sol" % DataPath, - Wsep2_rigid_path = "%s\\RGS" % DataPath, - Wsep2_rel_path = "%s\\Wsep_rel" % DataPath, - BO1_path = "%s\\BondOrder-1solute" % DataPath, - BO2_path = "%s\\BondOrder-2solute" % DataPath): + def __init__( + self, + DataPath="C:\\Users\\liger\\OneDrive - The University of Sydney (Staff)\\FeGBProject-Data", + Fpath_Krough="%s\\P-X-Krough" % DataPath, + Fpath_Kfine="%s\\P-X-Krefined" % DataPath, + Seg1_path="%s\\Segregation_1sol" % DataPath, + Seg2_path="%s\\Segregation_2sol" % DataPath, + Wsep1_rigid_path="%s\\RGS-1sol" % DataPath, + Wsep1_rel_path="%s\\Wsep_relaxed-1sol" % DataPath, + Wsep2_rigid_path="%s\\RGS" % DataPath, + Wsep2_rel_path="%s\\Wsep_rel" % DataPath, + BO1_path="%s\\BondOrder-1solute" % DataPath, + BO2_path="%s\\BondOrder-2solute" % DataPath, + ): self.DataPath = DataPath self.Fpath_Krough = "%s\\P-X-Krough" % DataPath @@ -106,39 +129,98 @@ def __init__(self, self.BO1_path = "%s\\BondOrder-1solute" % DataPath self.BO2_path = "%s\\BondOrder-2solute" % DataPath + class PlotParameters: """ PlotParameters class contains object-based convenience functionality for plotting parameters """ - def __init__(self, - output_path = "C:\\Users\\liger\\OneDrive - The University of Sydney (Staff)\\FeGB-P-TM-Project\\Manuscript\\Figures-P-TM"): - self.marker_dict = dict(zip(["S3-RA110-S1-11", "S3-RA110-S1-12", "S9-RA110-S2-21", "S11-RA110-S3-32"], - ['o','s','d','^'])) - self.GB_labels = dict(zip(["S3-RA110-S1-11", "S3-RA110-S1-12", "S9-RA110-S2-21", "S11-RA110-S3-32"], - [r"$\Sigma3\ [110](1\bar{1}1)$", r"$\Sigma3\ [110](1\bar{1}2)$", r"$\Sigma9\ [110](2\bar{2}1)$", r"$\Sigma11\ [110](3\bar{3}2)$"])) - self.GB_labels_short = dict(zip(["S3-RA110-S1-11", "S3-RA110-S1-12", "S9-RA110-S2-21", "S11-RA110-S3-32"], - [r"$\Sigma3(1\bar{1}1)$", r"$\Sigma3(1\bar{1}2)$", r"$\Sigma9(2\bar{2}1)$", r"$\Sigma11(3\bar{3}2)$"])) + + def __init__( + self, + output_path="C:\\Users\\liger\\OneDrive - The University of Sydney (Staff)\\FeGB-P-TM-Project\\Manuscript\\Figures-P-TM", + ): + self.marker_dict = dict( + zip( + [ + "S3-RA110-S1-11", + "S3-RA110-S1-12", + "S9-RA110-S2-21", + "S11-RA110-S3-32", + ], + ["o", "s", "d", "^"], + ) + ) + self.GB_labels = dict( + zip( + [ + "S3-RA110-S1-11", + "S3-RA110-S1-12", + "S9-RA110-S2-21", + "S11-RA110-S3-32", + ], + [ + r"$\Sigma3\ [110](1\bar{1}1)$", + r"$\Sigma3\ [110](1\bar{1}2)$", + r"$\Sigma9\ [110](2\bar{2}1)$", + r"$\Sigma11\ [110](3\bar{3}2)$", + ], + ) + ) + self.GB_labels_short = dict( + zip( + [ + "S3-RA110-S1-11", + "S3-RA110-S1-12", + "S9-RA110-S2-21", + "S11-RA110-S3-32", + ], + [ + r"$\Sigma3(1\bar{1}1)$", + r"$\Sigma3(1\bar{1}2)$", + r"$\Sigma9(2\bar{2}1)$", + r"$\Sigma11(3\bar{3}2)$", + ], + ) + ) self.output_path = "C:\\Users\\liger\\OneDrive - The University of Sydney 
(Staff)\\FeGB-P-TM-Project\\Manuscript\\Figures-P-TM" self.label_string_S11_RA110_S3_32 = r"$\Sigma11\ [110](3\bar{3}2)$" self.label_string_S3_RA110_S1_11 = r"$\Sigma3\ [110](1\bar{1}1)$" self.label_string_S3_RA110_S1_12 = r"$\Sigma3\ [110](1\bar{1}2)$" self.label_string_S9_RA110_S2_21 = r"$\Sigma9\ [110](2\bar{2}1)$" - self.color_ele_dict = dict(zip(["P", "Ti", "V", "Cr", "Mn", "Co", "Ni", "Cu", "Nb", "Mo", "W"], - ["olive", "blue", "orange", "green", "red", "black", "brown", "pink", "darkviolet", "lime", "cyan"])) - -class SegregationEnergyData_2sol(): - def __init__(self, savefile = True): + self.color_ele_dict = dict( + zip( + ["P", "Ti", "V", "Cr", "Mn", "Co", "Ni", "Cu", "Nb", "Mo", "W"], + [ + "olive", + "blue", + "orange", + "green", + "red", + "black", + "brown", + "pink", + "darkviolet", + "lime", + "cyan", + ], + ) + ) + + +class SegregationEnergyData_2sol: + def __init__(self, savefile=True): Segregation_1sol = SegregationEnergyData_1sol() - def get_1sol_cohesion_effect(GB, element, property, df = None): - ''' + + def get_1sol_cohesion_effect(GB, element, property, df=None): + """ GB = GB string (e.g. S11-RA110-S3-32) element = element string (e.g. V) property = one of string: eta_RGS, eta_rel, eta_ANSBO df = the output from get_1sol_cohesion_summary() method not strictly necessary, but is required for the purposes of speedup - ''' + """ if df is None: df = get_1sol_cohesion_summary(GB) ele_df = df[df["element"] == element] @@ -153,10 +235,18 @@ def get_2sol_cohesion_RGS(GB, case): area = get_area("%s\\%s\\Co\\GB\\CONTCAR" % (fp_Seg1_path, GB)) case_df = pd.read_csv(csv_path) # total energy of non-cleaved GB structure - total_energy = GB_energetics_df[GB_energetics_df["system"] == case].energy.values[0] - Wsep_RGS_list = [(row.energy - total_energy) * 16.02176565 / area for _, row in case_df.iterrows()] + total_energy = GB_energetics_df[ + GB_energetics_df["system"] == case + ].energy.values[0] + Wsep_RGS_list = [ + (row.energy - total_energy) * 16.02176565 / area + for _, row in case_df.iterrows() + ] Wsep_RGS = min(np.array(Wsep_RGS_list)) - cp_list = [float(row.system.split(sep="-")[-1]) for _, row in case_df.iterrows()] + cp_list = [ + float(row.system.split(sep="-")[-1]) + for _, row in case_df.iterrows() + ] else: Wsep_RGS = np.nan Wsep_RGS_list = np.nan @@ -169,13 +259,21 @@ def get_2sol_cohesion_Wseprel(GB, case): GB_energetics_df = self.GB_treated_dfs_dict[GB] csv_path = "%s\\%s\\info.csv" % (fp_Wsep2_rel, GB) df = pd.read_csv(csv_path) - df["base_system"] = ["-".join(row.system.split(sep = "-")[:-2]) for _, row in df.iterrows()] + df["base_system"] = [ + "-".join(row.system.split(sep="-")[:-2]) for _, row in df.iterrows() + ] if case in df.base_system.values: area = get_area("%s\\%s\\Co\\GB\\CONTCAR" % (fp_Seg1_path, GB)) # total energy of non-cleaved GB structure - total_energy = GB_energetics_df[GB_energetics_df["system"] == case].energy.values[0] - Wsep_rel = (df[df["base_system"] == case].energy.values[0] - total_energy) * 16.02176565 / area + total_energy = GB_energetics_df[ + GB_energetics_df["system"] == case + ].energy.values[0] + Wsep_rel = ( + (df[df["base_system"] == case].energy.values[0] - total_energy) + * 16.02176565 + / area + ) else: Wsep_rel = np.nan @@ -183,13 +281,20 @@ def get_2sol_cohesion_Wseprel(GB, case): def get_2sol_cohesion_BO(GB, case): csv_RGS_path = "%s\\%s\\%s\\info.csv" % (fp_Wsep2_rigid, GB, case) - if os.path.isfile(csv_RGS_path) and os.path.isfile("%s\\%s\\%s\\CONTCAR" % (fp_BO2, GB, case)): + if os.path.isfile(csv_RGS_path) and 
os.path.isfile( + "%s\\%s\\%s\\CONTCAR" % (fp_BO2, GB, case) + ): case_df = pd.read_csv(csv_RGS_path) - cp_list = [float(row.system.split(sep="-")[-1]) for _, row in case_df.iterrows()] - min_bo_list, _ = cp_bondorder(structure_path = "%s\\%s\\%s\\CONTCAR" % (fp_BO2, GB, case),\ - DDEC_output_path = "%s\\%s\\%s" % (fp_BO2, GB, case),\ - cleavage_plane_array = cp_list,\ - bo_threshold = 0) + cp_list = [ + float(row.system.split(sep="-")[-1]) + for _, row in case_df.iterrows() + ] + min_bo_list, _ = cp_bondorder( + structure_path="%s\\%s\\%s\\CONTCAR" % (fp_BO2, GB, case), + DDEC_output_path="%s\\%s\\%s" % (fp_BO2, GB, case), + cleavage_plane_array=cp_list, + bo_threshold=0, + ) min_bo_CP = cp_list[np.argmin(min_bo_list)] min_bo = min(min_bo_list) else: @@ -204,31 +309,55 @@ def get_2sol_cohesion_BO(GB, case): df_Krough_list = [] for element in get_immediate_subdirectories(GB): df_Krough = pd.read_csv("%s\\info.csv" % element) - df_Krough["distance"] = [get_dist_solutes("%s\\%s" % (element, row.system)) for i, row in df_Krough.iterrows()] - df_Krough["element"] = [row.system.split(sep="-")[0] - if row.system.split(sep="-")[0] != "P" - else row.system.split(sep="-")[-2] - for i, row in df_Krough.iterrows()] + df_Krough["distance"] = [ + get_dist_solutes("%s\\%s" % (element, row.system)) + for i, row in df_Krough.iterrows() + ] + df_Krough["element"] = [ + ( + row.system.split(sep="-")[0] + if row.system.split(sep="-")[0] != "P" + else row.system.split(sep="-")[-2] + ) + for i, row in df_Krough.iterrows() + ] df_Krough_list.append(df_Krough) df_Krough_all = pd.concat(df_Krough_list) # Re-organise into ordering agnostic blocks (e.g. P-X and X-P both fall under "X" calls to dict) df_Krough_list = [] for element in df_Krough_all.element.unique(): - df_Krough_list.append(df_Krough_all[df_Krough_all["element"] == element]) + df_Krough_list.append( + df_Krough_all[df_Krough_all["element"] == element] + ) df_Krough_dict = dict(zip(df_Krough_all.element.unique(), df_Krough_list)) dict_Krough_list.append(df_Krough_dict) - self.GB_Krough_df_dict = dict(zip([os.path.basename(GB) for GB in get_immediate_subdirectories(fp_Fpath_Krough)], dict_Krough_list)) + self.GB_Krough_df_dict = dict( + zip( + [ + os.path.basename(GB) + for GB in get_immediate_subdirectories(fp_Fpath_Krough) + ], + dict_Krough_list, + ) + ) dict_Kfine_list = [] for GB in get_immediate_subdirectories(fp_Fpath_Kfine): df_Kfine_list = [] for element in get_immediate_subdirectories(GB): df_Kfine = pd.read_csv("%s\\info.csv" % element) - df_Kfine["distance"] = [get_dist_solutes("%s\\%s" % (element, row.system)) for i, row in df_Kfine.iterrows()] - df_Kfine["element"] = [row.system.split(sep="-")[0] - if row.system.split(sep="-")[0] != "P" - else row.system.split(sep="-")[-2] - for i, row in df_Kfine.iterrows()] + df_Kfine["distance"] = [ + get_dist_solutes("%s\\%s" % (element, row.system)) + for i, row in df_Kfine.iterrows() + ] + df_Kfine["element"] = [ + ( + row.system.split(sep="-")[0] + if row.system.split(sep="-")[0] != "P" + else row.system.split(sep="-")[-2] + ) + for i, row in df_Kfine.iterrows() + ] df_Kfine_list.append(df_Kfine) df_Kfine_all = pd.concat(df_Kfine_list) @@ -237,7 +366,15 @@ def get_2sol_cohesion_BO(GB, case): df_Kfine_list.append(df_Kfine_all[df_Kfine_all["element"] == element]) df_Kfine_dict = dict(zip(df_Kfine_all.element.unique(), df_Kfine_list)) dict_Kfine_list.append(df_Kfine_dict) - self.GB_Kfine_df_dict = dict(zip([os.path.basename(GB) for GB in get_immediate_subdirectories(fp_Fpath_Kfine)], dict_Kfine_list)) 
+ self.GB_Kfine_df_dict = dict( + zip( + [ + os.path.basename(GB) + for GB in get_immediate_subdirectories(fp_Fpath_Kfine) + ], + dict_Kfine_list, + ) + ) df_all_list = [] for GB in self.GB_Kfine_df_dict: @@ -249,18 +386,24 @@ def get_2sol_cohesion_BO(GB, case): corr_list = [] for i, case in df_Kf.iterrows(): if len(df_Kr[df_Kr["system"] == case.system]): - energy_Kf = df_Kf[df_Kf["system"] == case.system].energy.values[0] - energy_Kr = df_Kr[df_Kr["system"] == case.system].energy.values[0] + energy_Kf = df_Kf[ + df_Kf["system"] == case.system + ].energy.values[0] + energy_Kr = df_Kr[ + df_Kr["system"] == case.system + ].energy.values[0] # E_kr + corr = E_kf correction = energy_Kf - energy_Kr corr_list.append(correction) else: corr_list.append(np.nan) - df_Kr['energy'] = [row.energy + np.mean(corr_list) for i, row in df_Kr.iterrows()] + df_Kr["energy"] = [ + row.energy + np.mean(corr_list) for i, row in df_Kr.iterrows() + ] df = df_Kf.append(df_Kr) - #print(corr_list) - #print("GB: %s, element: %s, corr_value: %.2f eV" % (GB, element, np.round(np.mean(corr_list),4))) - df.drop_duplicates(subset=['system'], keep='first', inplace=True) + # print(corr_list) + # print("GB: %s, element: %s, corr_value: %.2f eV" % (GB, element, np.round(np.mean(corr_list),4))) + df.drop_duplicates(subset=["system"], keep="first", inplace=True) df.sort_values("system", inplace=True) else: df = self.GB_Kfine_df_dict[GB][element] @@ -289,27 +432,41 @@ def get_2sol_cohesion_BO(GB, case): # Get the site number of solute 2 df["site_2"] = [int(x.split(sep="-")[-1]) for x in df.system.values] # Get the Eseg in isolation of solute 1 - df["E_seg_s1"] = [Segregation_1sol.get_Eseg(GB=GB, element=case.element_1, site=case.site_1) for i, case in df.iterrows()] + df["E_seg_s1"] = [ + Segregation_1sol.get_Eseg( + GB=GB, element=case.element_1, site=case.site_1 + ) + for i, case in df.iterrows() + ] # Get the Eseg in isolation of solute 2 - df["E_seg_s2"] = [Segregation_1sol.get_Eseg(GB=GB, element=case.element_2, site=case.site_2) for i, case in df.iterrows()] + df["E_seg_s2"] = [ + Segregation_1sol.get_Eseg( + GB=GB, element=case.element_2, site=case.site_2 + ) + for i, case in df.iterrows() + ] if GB == "S11-RA110-S3-32": GB_use = "S11-RA110-S3-32-2x2" else: GB_use = GB # Get the energy of solution - df["solnE_s1"] = [get_solution_energy(GB_use, case.element_1) for i, case in df.iterrows()] - df["solnE_s2"] = [get_solution_energy(GB_use, case.element_2) for i, case in df.iterrows()] + df["solnE_s1"] = [ + get_solution_energy(GB_use, case.element_1) for i, case in df.iterrows() + ] + df["solnE_s2"] = [ + get_solution_energy(GB_use, case.element_2) for i, case in df.iterrows() + ] ## Get the energy of base solute structure in isolation of solute 1 #### THIS NEEDS TO BE FIXED - TOTEN NEEDS TO BE THAT OF THE ORIGINAL SIZED GB (1x1 in S11, S9) - #conditional_timer = time.time() + # conditional_timer = time.time() E_base_list = [] for i, case in df.iterrows(): if GB == "S9-RA110-S2-21" and case.system.split(sep="-")[0] == "P": # -549.38781758 is the value for P-36-d-0.0 structure in the 1x1 S9 cell E_base_list.append(-549.38781758) else: - E_base_list.append(get_totalenergy(GB, structure_name = case.system)) - #print("%.2fs elapsed for checking conditional"% (time.time() - conditional_timer)) + E_base_list.append(get_totalenergy(GB, structure_name=case.system)) + # print("%.2fs elapsed for checking conditional"% (time.time() - conditional_timer)) df["E_base"] = E_base_list df["E_GB"] = get_totalenergy(GB) # Incremental energy of 
segregation @@ -321,139 +478,215 @@ def get_2sol_cohesion_BO(GB, case): # Eseg_inc = 0 df["E_seg_inc_c"] = [x if x < 0 else 0 for x in df.E_seg_inc.values] # Total energy of segregation - df["E_seg_total"] = df["energy"] - df["E_GB"] - df["solnE_s2"] - df["solnE_s1"] + df["E_seg_total"] = ( + df["energy"] - df["E_GB"] - df["solnE_s2"] - df["solnE_s1"] + ) # This only works if the E_int is NOT POSITIVE df["E_correction"] = [0 if x < 0 else -x for x in df.E_seg_inc.values] # total energy with correction if Eseg_inc > 0 df["toten_c"] = df["energy"] + df["E_correction"] # E_correction for applying to total energy. - df["E_seg_total_c"] = df["energy"] - df["E_GB"] - df["solnE_s2"] - df["solnE_s1"] + df["E_correction"] + df["E_seg_total_c"] = ( + df["energy"] + - df["E_GB"] + - df["solnE_s2"] + - df["solnE_s1"] + + df["E_correction"] + ) # Interaction energy (If the first solute wasn't there, difference in energy) df["E_int"] = df["E_seg_inc"] - df["E_seg_s2"] - # # corrected energy of interaction - # df["E_int_c"] = [-float(get_1sol_EsegWithoutSite(os.path.basename(GB),x.element_2,x.site_1))\ - # if x.E_seg_inc_c == 0\ - # else x.E_seg_inc - x.E_seg_s2 for _, x in df.iterrows()] - - #df_out = df[["system","E_seg_inc", "E_int", "distance", "convergence"]]\ - - df["Wsep_RGS"] = [get_2sol_cohesion_RGS(GB, row.system)[0] for _, row in df.iterrows()] - df["Wsep_RGS_list"] = [get_2sol_cohesion_RGS(GB, row.system)[1] for _, row in df.iterrows()] - df["cp_list"] = [get_2sol_cohesion_RGS(GB, row.system)[2] for _, row in df.iterrows()] + # # corrected energy of interaction + # df["E_int_c"] = [-float(get_1sol_EsegWithoutSite(os.path.basename(GB),x.element_2,x.site_1))\ + # if x.E_seg_inc_c == 0\ + # else x.E_seg_inc - x.E_seg_s2 for _, x in df.iterrows()] + + # df_out = df[["system","E_seg_inc", "E_int", "distance", "convergence"]]\ + + df["Wsep_RGS"] = [ + get_2sol_cohesion_RGS(GB, row.system)[0] for _, row in df.iterrows() + ] + df["Wsep_RGS_list"] = [ + get_2sol_cohesion_RGS(GB, row.system)[1] for _, row in df.iterrows() + ] + df["cp_list"] = [ + get_2sol_cohesion_RGS(GB, row.system)[2] for _, row in df.iterrows() + ] df["eta_RGS"] = df["Wsep_RGS"] - GB_pure_WsepRGS - df["eta_RGS_ele1"] = [get_1sol_cohesion_effect(GB, row.element_1, "eta_RGS", df = df_1sol) for _, row in df.iterrows()] - df["eta_RGS_ele2"] = [get_1sol_cohesion_effect(GB, row.element_2, "eta_RGS", df = df_1sol) for _, row in df.iterrows()] + df["eta_RGS_ele1"] = [ + get_1sol_cohesion_effect(GB, row.element_1, "eta_RGS", df=df_1sol) + for _, row in df.iterrows() + ] + df["eta_RGS_ele2"] = [ + get_1sol_cohesion_effect(GB, row.element_2, "eta_RGS", df=df_1sol) + for _, row in df.iterrows() + ] df["heur_eta_RGS"] = df["eta_RGS_ele1"] + df["eta_RGS_ele2"] - df["Wsep_rel"] = [get_2sol_cohesion_Wseprel(GB, row.system) for _, row in df.iterrows()] + df["Wsep_rel"] = [ + get_2sol_cohesion_Wseprel(GB, row.system) for _, row in df.iterrows() + ] df["eta_rel"] = df["Wsep_rel"] - GB_pure_Wseprel - df["eta_rel_ele1"] = [get_1sol_cohesion_effect(GB, row.element_1, "eta_rel", df = df_1sol) for _, row in df.iterrows()] - df["eta_rel_ele2"] = [get_1sol_cohesion_effect(GB, row.element_2, "eta_rel", df = df_1sol) for _, row in df.iterrows()] + df["eta_rel_ele1"] = [ + get_1sol_cohesion_effect(GB, row.element_1, "eta_rel", df=df_1sol) + for _, row in df.iterrows() + ] + df["eta_rel_ele2"] = [ + get_1sol_cohesion_effect(GB, row.element_2, "eta_rel", df=df_1sol) + for _, row in df.iterrows() + ] df["heur_eta_rel"] = df["eta_rel_ele1"] + df["eta_rel_ele2"] - 
df["ANSBO"] = [get_2sol_cohesion_BO(GB, row.system)[0] if row.E_seg_inc < 0 else np.nan for _, row in df.iterrows()] - df["ANSBO_list"] = [get_2sol_cohesion_BO(GB, row.system)[1] if row.E_seg_inc < 0 else np.nan for _, row in df.iterrows() ] - df["ANSBO_min_cp"] = [get_2sol_cohesion_BO(GB, row.system)[2] if row.E_seg_inc < 0 else np.nan for _, row in df.iterrows()] + df["ANSBO"] = [ + get_2sol_cohesion_BO(GB, row.system)[0] if row.E_seg_inc < 0 else np.nan + for _, row in df.iterrows() + ] + df["ANSBO_list"] = [ + get_2sol_cohesion_BO(GB, row.system)[1] if row.E_seg_inc < 0 else np.nan + for _, row in df.iterrows() + ] + df["ANSBO_min_cp"] = [ + get_2sol_cohesion_BO(GB, row.system)[2] if row.E_seg_inc < 0 else np.nan + for _, row in df.iterrows() + ] df["eta_ANSBO"] = df["ANSBO"] - GB_pure_ANSBO - df["eta_ANSBO_ele1"] = [get_1sol_cohesion_effect(GB, row.element_1, "eta_ANSBO", df = df_1sol) for _, row in df.iterrows()] - df["eta_ANSBO_ele2"] = [get_1sol_cohesion_effect(GB, row.element_2, "eta_ANSBO", df = df_1sol) for _, row in df.iterrows()] + df["eta_ANSBO_ele1"] = [ + get_1sol_cohesion_effect(GB, row.element_1, "eta_ANSBO", df=df_1sol) + for _, row in df.iterrows() + ] + df["eta_ANSBO_ele2"] = [ + get_1sol_cohesion_effect(GB, row.element_2, "eta_ANSBO", df=df_1sol) + for _, row in df.iterrows() + ] df["heur_eta_ANSBO"] = df["eta_ANSBO_ele1"] + df["eta_ANSBO_ele2"] - df["element"] = [row.system.split(sep="-")[0] - if row.system.split(sep="-")[0] != "P" - else row.system.split(sep="-")[-2] - for i, row in df.iterrows()] - df_output = df.copy()[["system", "element_1", "site_1", "element_2", "site_2", "energy",\ - "solnE_s1", "solnE_s2", "E_base", "E_GB",\ - "E_seg_s1", "E_seg_s2", "E_seg_inc", "E_seg_total_c", "E_int", "distance", "element",\ - "Wsep_RGS", "Wsep_rel", "ANSBO", "cp_list", "Wsep_RGS_list", "ANSBO_list"]] - #df = np.round(df, 3) + df["element"] = [ + ( + row.system.split(sep="-")[0] + if row.system.split(sep="-")[0] != "P" + else row.system.split(sep="-")[-2] + ) + for i, row in df.iterrows() + ] + df_output = df.copy()[ + [ + "system", + "element_1", + "site_1", + "element_2", + "site_2", + "energy", + "solnE_s1", + "solnE_s2", + "E_base", + "E_GB", + "E_seg_s1", + "E_seg_s2", + "E_seg_inc", + "E_seg_total_c", + "E_int", + "distance", + "element", + "Wsep_RGS", + "Wsep_rel", + "ANSBO", + "cp_list", + "Wsep_RGS_list", + "ANSBO_list", + ] + ] + # df = np.round(df, 3) GB_df_list.append(df) - print("%.2fs elapsed for GB step"% (time.time() - GB_step_time)) + print("%.2fs elapsed for GB step" % (time.time() - GB_step_time)) if i == 0 and savefile: - df_output.to_excel("%s\\energetics_analysis.xlsx" % (os.getcwd()),\ - sheet_name = "%s" % (os.path.basename(GB))) + df_output.to_excel( + "%s\\energetics_analysis.xlsx" % (os.getcwd()), + sheet_name="%s" % (os.path.basename(GB)), + ) else: - with pd.ExcelWriter("%s\\energetics_analysis.xlsx" % (os.getcwd()), mode="a", engine="openpyxl") as writer: - df_output.to_excel(writer, sheet_name = "%s" % (os.path.basename(GB))) + with pd.ExcelWriter( + "%s\\energetics_analysis.xlsx" % (os.getcwd()), + mode="a", + engine="openpyxl", + ) as writer: + df_output.to_excel(writer, sheet_name="%s" % (os.path.basename(GB))) self.GB_energetics_dict = dict(zip(list_GB, GB_df_list)) - -class SegregationEnergyData_1sol(): +class SegregationEnergyData_1sol: def __init__(self): #################################################################################################### # S3 S111 studied_list = [20, 22, 24, 26, 28, 30, 32, 34, 36] # 0.5-1ML available 
- symmetry = [[21, 52, 53],\ - [23, 50, 51],\ - [25, 48, 49],\ - [27, 46, 47],\ - [29, 44, 45],\ - [31, 42, 43],\ - [33, 40, 41],\ - [35, 38, 39],\ - [37]] + symmetry = [ + [21, 52, 53], + [23, 50, 51], + [25, 48, 49], + [27, 46, 47], + [29, 44, 45], + [31, 42, 43], + [33, 40, 41], + [35, 38, 39], + [37], + ] # When the site is on the GB plane, we don't need to calculate values on both sides - self.S3_RA110_S1_11_symmetrydict = dict(zip(studied_list,symmetry)) + self.S3_RA110_S1_11_symmetrydict = dict(zip(studied_list, symmetry)) #################################################################################################### # S3 S112 studied_list = [12, 14, 16, 18, 20, 22, 24] # 0.5-1ML available - symmetry = [[13, 36, 37],\ - [15, 34, 35],\ - [17, 32, 33],\ - [19, 30, 31],\ - [21, 28, 29],\ - [23, 26, 27],\ - [25]] + symmetry = [ + [13, 36, 37], + [15, 34, 35], + [17, 32, 33], + [19, 30, 31], + [21, 28, 29], + [23, 26, 27], + [25], + ] # When the site is on the GB plane, we don't need to calculate values on both sides - self.S3_RA110_S1_12_symmetrydict = dict(zip(studied_list,symmetry)) + self.S3_RA110_S1_12_symmetrydict = dict(zip(studied_list, symmetry)) #################################################################################################### # S9 studied_list = [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36] # only 0-1 ML available - symmetry = [[47],\ - [46],\ - [45],\ - [44],\ - [43],\ - [42],\ - [41],\ - [40],\ - [39],\ - [38],\ - [37],\ - [],\ - [],\ - []] + symmetry = [ + [47], + [46], + [45], + [44], + [43], + [42], + [41], + [40], + [39], + [38], + [37], + [], + [], + [], + ] # When the site is on the GB plane, we don't need to calculate values on both sides - self.S9_RA110_S2_21_symmetrydict = dict(zip(studied_list,symmetry)) + self.S9_RA110_S2_21_symmetrydict = dict(zip(studied_list, symmetry)) #################################################################################################### # S11 studied_list = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22] # only 0-1 ML available - symmetry = [[32],\ - [31],\ - [30],\ - [29],\ - [28],\ - [27],\ - [26],\ - [25],\ - [24],\ - [23],\ - [],\ - []] + symmetry = [[32], [31], [30], [29], [28], [27], [26], [25], [24], [23], [], []] # Full dictionary of solutes and sites - self.S11_RA110_S3_32_symmetrydict = dict(zip(studied_list,symmetry)) - self.GB_sym_dict = dict(zip(list_GB, - [self.S11_RA110_S3_32_symmetrydict, - self.S3_RA110_S1_11_symmetrydict, - self.S3_RA110_S1_12_symmetrydict, - self.S9_RA110_S2_21_symmetrydict])) + self.S11_RA110_S3_32_symmetrydict = dict(zip(studied_list, symmetry)) + self.GB_sym_dict = dict( + zip( + list_GB, + [ + self.S11_RA110_S3_32_symmetrydict, + self.S3_RA110_S1_11_symmetrydict, + self.S3_RA110_S1_12_symmetrydict, + self.S9_RA110_S2_21_symmetrydict, + ], + ) + ) #################################################################################################### # Create a dictionary that may be accessed # dict[GB][element][site] that contains information on all sites @@ -477,19 +710,19 @@ def __init__(self): df = get_1sol_df("%s\\%s" % (GB_path, os.path.basename(element))) sym_df = df.copy() - sym_df["base_site"] = [int(x) for x in sym_df['site']] - sym_df['site'] = [int(x.site) for i, x in sym_df.iterrows()] + sym_df["base_site"] = [int(x) for x in sym_df["site"]] + sym_df["site"] = [int(x.site) for i, x in sym_df.iterrows()] for _, sites_calculated in sym_df.iterrows(): - for x in sym_dict[sites_calculated['base_site']]: + for x in 
sym_dict[sites_calculated["base_site"]]: sym_site = sites_calculated.copy() - sym_sys = sym_site['system'].split(sep="-") + sym_sys = sym_site["system"].split(sep="-") sym_sys[1] = str(x) sym_sys = "-".join(sym_sys) - sym_site['system'] = sym_sys + sym_site["system"] = sym_sys - sym_site['site'] = int(x) + sym_site["site"] = int(x) sym_df = sym_df.append(sym_site) ele_site_eseg_dict = dict(zip(sym_df.site, sym_df.E_seg.values)) @@ -509,87 +742,103 @@ def __init__(self): self.Eseg_dict = dict(zip(list_GB, Eseg_data_list)) self.toten_dict = dict(zip(list_GB, toten_data_list)) - def get_Eseg(self, GB, element, site, warning = True): - ''' + def get_Eseg(self, GB, element, site, warning=True): + """ Convenience value extractor for single solute segregation energy at specified GB, element, site Args: GB = string for GB (e.g. S11-RA110-S3-32) element = string for element (e.g. "W" for tungsten) site = integer for site (0-indexed) - ''' + """ if GB in self.Eseg_dict: if element in self.Eseg_dict[GB]: if site in self.Eseg_dict[GB][element]: E_seg = self.Eseg_dict[GB][element][site] - #print(E_seg) + # print(E_seg) else: E_seg = np.nan if warning: - print("No such site \"%s\" in element \"%s\" in %s dict: check site" % (site, element, GB)) + print( + 'No such site "%s" in element "%s" in %s dict: check site' + % (site, element, GB) + ) else: E_seg = np.nan if warning: - print("No such element \"%s\" in %s dict: check element" % (element, GB)) + print( + 'No such element "%s" in %s dict: check element' % (element, GB) + ) else: E_seg = np.nan if warning: - print("No such GB \"%s\" in dict: check GB string" % GB) + print('No such GB "%s" in dict: check GB string' % GB) return E_seg - def get_toten(self, GB, element, site, warning = True): - ''' + def get_toten(self, GB, element, site, warning=True): + """ Convenience value extractor for single solute total energy at specified GB, element, site Args: GB = string for GB (e.g. S11-RA110-S3-32) element = string for element (e.g. "W" for tungsten) site = integer for site (0-indexed) - ''' + """ if GB in self.toten_dict: if element in self.toten_dict[GB]: if site in self.toten_dict[GB][element]: toten = self.toten_dict[GB][element][site] - #print(E_seg) + # print(E_seg) else: toten = np.nan if warning: - print("No such site \"%s\" in element \"%s\" in %s dict: check site" % (site, element, GB)) + print( + 'No such site "%s" in element "%s" in %s dict: check site' + % (site, element, GB) + ) else: toten = np.nan if warning: - print("No such element \"%s\" in %s dict: check element" % (element, GB)) + print( + 'No such element "%s" in %s dict: check element' % (element, GB) + ) else: toten = np.nan if warning: - print("No such GB \"%s\" in dict: check GB string" % GB) + print('No such GB "%s" in dict: check GB string' % GB) return toten - def get_min_Eseg_without_site(self, GB, element, site, warning = True): - ''' + def get_min_Eseg_without_site(self, GB, element, site, warning=True): + """ Convenience value extractor for minimum single solute segregation energy at specified GB, element, without specified site Args: GB = string for GB (e.g. S11-RA110-S3-32) element = string for element (e.g. 
"W" for tungsten) site = integer for site to exclude (0-indexed) - ''' + """ if GB in self.Eseg_dict: if element in self.Eseg_dict[GB]: if site in self.Eseg_dict[GB][element]: min_Eseg = self.Eseg_dict[GB][element][site] - #print(E_seg) + # print(E_seg) else: min_Eseg = np.nan if warning: - print("No such site \"%s\" in element \"%s\" in %s dict: check site" % (site, element, GB)) + print( + 'No such site "%s" in element "%s" in %s dict: check site' + % (site, element, GB) + ) else: min_Eseg = np.nan if warning: - print("No such element \"%s\" in %s dict: check element" % (element, GB)) + print( + 'No such element "%s" in %s dict: check element' % (element, GB) + ) else: min_Eseg = np.nan if warning: - print("No such GB \"%s\" in dict: check GB string" % GB) + print('No such GB "%s" in dict: check GB string' % GB) return toten + def get_dist_solutes(fp_2solutes): """ Returns distance between pair of non-Fe solutes for a specified structure @@ -599,12 +848,15 @@ def get_dist_solutes(fp_2solutes): """ structure = Structure.from_file("%s\\CONTCAR" % (fp_2solutes)) # get pair of non-Fe site idx - distance_pair = [i for i, site in enumerate(structure) if site.species_string != "Fe"] + distance_pair = [ + i for i, site in enumerate(structure) if site.species_string != "Fe" + ] distance = structure[distance_pair[1]].distance(structure[distance_pair[0]]) return distance -def get_totalenergy(GB, structure_name = "GB"): + +def get_totalenergy(GB, structure_name="GB"): """ Returns a total energy (eV) value for a specified GB (1 sol case) @@ -615,30 +867,34 @@ def get_totalenergy(GB, structure_name = "GB"): df = pd.read_csv("%s\\%s\\Co\\info.csv" % (fp_Seg1_path, os.path.basename(GB))) E_GB = df[df["system"] == "GB"].energy.values[0] else: - structure_name = '-'.join(structure_name.split(sep="-")[0:-2]) - element = structure_name.split(sep = "-")[0] - df = pd.read_csv("%s\\%s\\%s\\info.csv" % (fp_Seg1_path, os.path.basename(GB), element)) + structure_name = "-".join(structure_name.split(sep="-")[0:-2]) + element = structure_name.split(sep="-")[0] + df = pd.read_csv( + "%s\\%s\\%s\\info.csv" % (fp_Seg1_path, os.path.basename(GB), element) + ) E_GB = df[df["system"] == structure_name].energy.values[0] return E_GB -def get_1sol_df(folder_path, midpoint = 0.5000): +def get_1sol_df(folder_path, midpoint=0.5000): """ Returns a pandas dataframe containing segregation energy, voronoi volume, and magnetic moment profiles for all 1 solute cases in a specified GB folder: "folder_path" """ results = pd.read_csv("%s\\info.csv" % folder_path) # Energy of the pure slab structure - E_slab = results.loc[results['system'] == "SLAB"]['energy'].values[0] + E_slab = results.loc[results["system"] == "SLAB"]["energy"].values[0] # Energy of the slab + 1 solute structure - E_slab_imp = results.loc[results['system'].str.contains('-SLAB-')]['energy'].values[0] + E_slab_imp = results.loc[results["system"].str.contains("-SLAB-")]["energy"].values[ + 0 + ] # Energy of the pure GB - E_GB = results.loc[results['system'] == "GB"]['energy'].values[0] + E_GB = results.loc[results["system"] == "GB"]["energy"].values[0] # Drop any results related to pure GB and any slab structures - df = results.copy().loc[~results['system'].str.contains("SLAB|GB")] + df = results.copy().loc[~results["system"].str.contains("SLAB|GB")] # Calculate energy of segregation - df['E_seg'] = df['energy'] - E_GB - (E_slab_imp - E_slab) + df["E_seg"] = df["energy"] - E_GB - (E_slab_imp - E_slab) ## This section assigns a distance from GB for the solute 
distance_compiled = [] magmom_list_compiled = [] @@ -647,15 +903,25 @@ def get_1sol_df(folder_path, midpoint = 0.5000): vvol_compiled = [] for _, system in df.iterrows(): # Read the CONTCAR structure in the folder - structure = Structure.from_file('%s\\%s\\CONTCAR' % (folder_path, system['system'])) + structure = Structure.from_file( + "%s\\%s\\CONTCAR" % (folder_path, system["system"]) + ) # Get solute number in structure - solute_no = int(system['system'].split(sep = '-')[1:2][0]) - solute_no = [i for i, site in enumerate(structure) if site.species_string != "Fe"][0] + solute_no = int(system["system"].split(sep="-")[1:2][0]) + solute_no = [ + i for i, site in enumerate(structure) if site.species_string != "Fe" + ][0] # Distance to GB plane (center plane frac z) - distance = abs(midpoint - structure[solute_no].frac_coords[-1])*structure.lattice.c + distance = ( + abs(midpoint - structure[solute_no].frac_coords[-1]) * structure.lattice.c + ) distance_compiled.append(distance) # Magnetic moment - magmom_df = pd.read_csv("%s\\%s\\magnet.csv" % (folder_path, system['system']), delim_whitespace=True, header=None)[[1,2,3,4]] + magmom_df = pd.read_csv( + "%s\\%s\\magnet.csv" % (folder_path, system["system"]), + delim_whitespace=True, + header=None, + )[[1, 2, 3, 4]] ## magnetic moment list magmom_list = list(magmom_df[4].values) magmom_list_compiled.append(magmom_list) @@ -663,29 +929,30 @@ def get_1sol_df(folder_path, midpoint = 0.5000): magmom = magmom_list[solute_no] magmom_compiled.append(magmom) # Voronoi volume - vvol_df = pd.read_excel("%s\\%s\\vvol.xlsx" % (folder_path, system['system'])) + vvol_df = pd.read_excel("%s\\%s\\vvol.xlsx" % (folder_path, system["system"])) ## Voronoi volume list vvol_list = json.loads(vvol_df.iloc[1].values[1]) vvol_list_compiled.append(vvol_list) ## Voronoi volume of non Fe solute vvol = vvol_list[solute_no] vvol_compiled.append(vvol) - df['distance_GB'] = distance_compiled - df['magmom'] = magmom_compiled - df['magmom_list'] = magmom_list_compiled - df['vvol'] = vvol_compiled - df['vvol_list'] = vvol_list_compiled - df['d'] = [row.system.split(sep='-')[-1] for i, row in df.iterrows()] - df['site'] = [row.system.split(sep='-')[1] for i, row in df.iterrows()] + df["distance_GB"] = distance_compiled + df["magmom"] = magmom_compiled + df["magmom_list"] = magmom_list_compiled + df["vvol"] = vvol_compiled + df["vvol_list"] = vvol_list_compiled + df["d"] = [row.system.split(sep="-")[-1] for i, row in df.iterrows()] + df["site"] = [row.system.split(sep="-")[1] for i, row in df.iterrows()] return df def get_area(path): structure = Structure.from_file(path) - area = structure.volume/structure.lattice.c + area = structure.volume / structure.lattice.c return area + def get_1sol_etarel_cohesion_df(folder_path): """ Returns Wsep_rel df for all elements in a GB folder @@ -704,11 +971,26 @@ def getEnergyFromData_1sol(GB_name, element, site): # To fix the segregation profile. # Obviously, you can't compute a work of separation comparing 2x1 GB to 1x1 energies. # So, I am adding this exception. 
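        # Summary sketch of this helper (symbols E_cleaved / E_reference are shorthand here):
        # it returns the total energy of the matching un-cleaved GB(+solute) structure, which
        # is used further below as the reference in
        #   Wsep_rel = (E_cleaved - E_reference) * 16.02176565 / area,
        # where 16.02176565 converts eV/Angstrom^2 to J/m^2
        # (1 eV = 1.602176565e-19 J, 1 Angstrom^2 = 1e-20 m^2).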
- data_specified_path = "%s\\%s\\%s\\info-old.csv" % (fp_Seg1_path, GB_name, element) + data_specified_path = "%s\\%s\\%s\\info-old.csv" % ( + fp_Seg1_path, + GB_name, + element, + ) else: - data_specified_path = "%s\\%s\\%s\\info.csv" % (fp_Seg1_path, GB_name, element) + data_specified_path = "%s\\%s\\%s\\info.csv" % ( + fp_Seg1_path, + GB_name, + element, + ) df = pd.read_csv(data_specified_path) - df['solute_no'] = [int(row.system.split(sep="-")[1]) if not any(slabgb in row.system for slabgb in ["SLAB", "GB"]) else "NaN" for _, row in df.iterrows()] + df["solute_no"] = [ + ( + int(row.system.split(sep="-")[1]) + if not any(slabgb in row.system for slabgb in ["SLAB", "GB"]) + else "NaN" + ) + for _, row in df.iterrows() + ] if element == "GB": # listen here you little shit energy = df.loc[df["system"] == "GB"].energy.values[0] @@ -717,16 +999,30 @@ def getEnergyFromData_1sol(GB_name, element, site): return energy - df = pd.read_csv('%s\\info.csv' % folder_path) - df["area"] = [get_area("%s\\%s\\CONTCAR" % (folder_path, cleave_case.system)) for _, cleave_case in df.iterrows()] - df['element'] = [row.system.split(sep="-")[0] for _, row in df.iterrows()] - df["site"] = [int(row.system.split(sep="-")[1]) if "GB" not in row.system else "GB" for _, row in df.iterrows()] - df["GB_energy"] = [getEnergyFromData_1sol(os.path.basename(folder_path), row.element, row.site) for _, row in df.iterrows()] - df["Wsep_rel"] = np.round((df['energy'] - df["GB_energy"]) * 16.02176565 / (df['area']), 3) - df["system_base"] = ["-".join(row.system.split(sep="-")[0:-2]) for _, row in df.iterrows()] + df = pd.read_csv("%s\\info.csv" % folder_path) + df["area"] = [ + get_area("%s\\%s\\CONTCAR" % (folder_path, cleave_case.system)) + for _, cleave_case in df.iterrows() + ] + df["element"] = [row.system.split(sep="-")[0] for _, row in df.iterrows()] + df["site"] = [ + int(row.system.split(sep="-")[1]) if "GB" not in row.system else "GB" + for _, row in df.iterrows() + ] + df["GB_energy"] = [ + getEnergyFromData_1sol(os.path.basename(folder_path), row.element, row.site) + for _, row in df.iterrows() + ] + df["Wsep_rel"] = np.round( + (df["energy"] - df["GB_energy"]) * 16.02176565 / (df["area"]), 3 + ) + df["system_base"] = [ + "-".join(row.system.split(sep="-")[0:-2]) for _, row in df.iterrows() + ] return df + def get_1sol_etarigid_cohesion_df(folder_path): """ Returns Wsep_RGS values for all elements in a GB folder @@ -736,71 +1032,124 @@ def get_1sol_etarigid_cohesion_df(folder_path): """ case_df_list = [] for cases in get_immediate_subdirectories(folder_path): - #print(cases) - results = pd.read_csv('%s\\info.csv' % cases) - GB_energy = results[results['system'] == os.path.basename(cases)]['energy'].values[0] - results["area"] = [get_area("%s\\%s\\CONTCAR" % (cases, cleave_case.system)) for _, cleave_case in results.iterrows()] - results["Wsep"] = (results['energy'] - GB_energy) * 16.02176565 / (results['area']) - #print(results.loc[0::, ['system','Wsep']]) + # print(cases) + results = pd.read_csv("%s\\info.csv" % cases) + GB_energy = results[results["system"] == os.path.basename(cases)][ + "energy" + ].values[0] + results["area"] = [ + get_area("%s\\%s\\CONTCAR" % (cases, cleave_case.system)) + for _, cleave_case in results.iterrows() + ] + results["Wsep"] = ( + (results["energy"] - GB_energy) * 16.02176565 / (results["area"]) + ) + # print(results.loc[0::, ['system','Wsep']]) df = results[results["Wsep"] > 0.0001] new_df = df.copy() - new_df["cleavage_plane_name"] = [x.split(sep="-")[-2] for x in 
df['system'].values] - new_df["cleavage_plane"] = [float(x.split(sep="-")[-1]) for x in df['system'].values] + new_df["cleavage_plane_name"] = [ + x.split(sep="-")[-2] for x in df["system"].values + ] + new_df["cleavage_plane"] = [ + float(x.split(sep="-")[-1]) for x in df["system"].values + ] # Create a single row DF with column names assigned cleavage plane values case_df = pd.DataFrame(new_df["Wsep"].values).transpose() case_df.columns = new_df["cleavage_plane_name"].values - case_df['cleavage_planes'] = [new_df["cleavage_plane"].values] - case_df['cp_names'] = [new_df["cleavage_plane_name"].values] - case_df['system'] = os.path.basename(cases) - case_df['Wsep_RGS_list'] = [np.round(new_df["Wsep"].values,3)] + case_df["cleavage_planes"] = [new_df["cleavage_plane"].values] + case_df["cp_names"] = [new_df["cleavage_plane_name"].values] + case_df["system"] = os.path.basename(cases) + case_df["Wsep_RGS_list"] = [np.round(new_df["Wsep"].values, 3)] case_df_list.append(case_df) - GB_df = np.round(pd.concat(case_df_list),2) - GB_df['Wsep_RGS'] = [np.round(min(wsep_lists),3) for wsep_lists in GB_df['Wsep_RGS_list']] - GB_df['min_cp_name'] = [row.cp_names[np.argmin(row['Wsep_RGS_list'])] for _, row in GB_df.iterrows()] - GB_df['min_cp'] = [row.cleavage_planes[np.argmin(row['Wsep_RGS_list'])] for _, row in GB_df.iterrows()] + GB_df = np.round(pd.concat(case_df_list), 2) + GB_df["Wsep_RGS"] = [ + np.round(min(wsep_lists), 3) for wsep_lists in GB_df["Wsep_RGS_list"] + ] + GB_df["min_cp_name"] = [ + row.cp_names[np.argmin(row["Wsep_RGS_list"])] for _, row in GB_df.iterrows() + ] + GB_df["min_cp"] = [ + row.cleavage_planes[np.argmin(row["Wsep_RGS_list"])] + for _, row in GB_df.iterrows() + ] return GB_df -def cp_bondorder(structure_path = "%s\\CONTCAR" % os.getcwd(),\ - DDEC_output_path = "%s" % os.getcwd(),\ - cleavage_plane_array = [0.5],\ - bo_threshold = 0.0): + +def cp_bondorder( + structure_path="%s\\CONTCAR" % os.getcwd(), + DDEC_output_path="%s" % os.getcwd(), + cleavage_plane_array=[0.5], + bo_threshold=0.0, +): # This function calculates the bond order sum and returns a value, given a structure and chargemol output path # Read the DDEC Output and convert it into a csv temp file structure = Structure.from_file(structure_path) - VASPDDEC_2_CSV("%s\\VASP_DDEC_analysis.output" % DDEC_output_path,"%s\\chargemol.csv" % os.getcwd()) - chargemol_data = pd.read_csv("%s\\chargemol.csv" % os.getcwd(), delim_whitespace=True) + VASPDDEC_2_CSV( + "%s\\VASP_DDEC_analysis.output" % DDEC_output_path, + "%s\\chargemol.csv" % os.getcwd(), + ) + chargemol_data = pd.read_csv( + "%s\\chargemol.csv" % os.getcwd(), delim_whitespace=True + ) chargemol_data = chargemol_data[chargemol_data["final_bond_order"] > bo_threshold] - bond_data = chargemol_data.copy()[['atom1','atom2','repeata','repeatb','final_bond_order']] + bond_data = chargemol_data.copy()[ + ["atom1", "atom2", "repeata", "repeatb", "final_bond_order"] + ] # -1 because chargemol begins indexing at 1, equivalent to structure[0] - bond_data['atom1pos'] = [structure[x-1].frac_coords for x in bond_data['atom1'].values] - bond_data['atom2pos'] = [structure[x-1].frac_coords for x in bond_data['atom2'].values] + bond_data["atom1pos"] = [ + structure[x - 1].frac_coords for x in bond_data["atom1"].values + ] + bond_data["atom2pos"] = [ + structure[x - 1].frac_coords for x in bond_data["atom2"].values + ] # zpos fractional - bond_data['atom1zpos'] = [structure[x-1].frac_coords[-1] for x in bond_data['atom1'].values] - bond_data['atom2zpos'] = 
[structure[x-1].frac_coords[-1] for x in bond_data['atom2'].values] + bond_data["atom1zpos"] = [ + structure[x - 1].frac_coords[-1] for x in bond_data["atom1"].values + ] + bond_data["atom2zpos"] = [ + structure[x - 1].frac_coords[-1] for x in bond_data["atom2"].values + ] # zpos fractional - bond_data['atom1_ele'] = [structure[x-1].species_string for x in bond_data['atom1'].values] - bond_data['atom2_ele'] = [structure[x-1].species_string for x in bond_data['atom2'].values] - cp_bo = []; clp_df_list = [] + bond_data["atom1_ele"] = [ + structure[x - 1].species_string for x in bond_data["atom1"].values + ] + bond_data["atom2_ele"] = [ + structure[x - 1].species_string for x in bond_data["atom2"].values + ] + cp_bo = [] + clp_df_list = [] for cp in cleavage_plane_array: # cleavage plane and find the sum of bond orders passing through it - clp_df = bond_data[(bond_data[['atom2zpos','atom1zpos']].max(axis=1) > cp) - & (bond_data[['atom2zpos','atom1zpos']].min(axis=1) < cp) ] + clp_df = bond_data[ + (bond_data[["atom2zpos", "atom1zpos"]].max(axis=1) > cp) + & (bond_data[["atom2zpos", "atom1zpos"]].min(axis=1) < cp) + ] clp_df = clp_df.copy()[(clp_df["repeata"] == 0) | (clp_df["repeatb"] == 0)] # We only want to calculate for atoms that exist cell. This is important for bond order/area normalisation - clp_df_countonce = clp_df.copy()[(clp_df["repeata"] == 0) & (clp_df["repeatb"] == 0)] - clp_df_counthalf = clp_df.copy()[(clp_df["repeata"] != 0) | (clp_df["repeatb"] != 0)] + clp_df_countonce = clp_df.copy()[ + (clp_df["repeata"] == 0) & (clp_df["repeatb"] == 0) + ] + clp_df_counthalf = clp_df.copy()[ + (clp_df["repeata"] != 0) | (clp_df["repeatb"] != 0) + ] # Basic summed bond order over CP - final_bond_order = clp_df_countonce.final_bond_order.sum() + 0.5*clp_df_counthalf.final_bond_order.sum() + final_bond_order = ( + clp_df_countonce.final_bond_order.sum() + + 0.5 * clp_df_counthalf.final_bond_order.sum() + ) # N largest - #final_bond_order = clp_df.nlargest(15, ['final_bond_order'])["final_bond_order"].sum() + # final_bond_order = clp_df.nlargest(15, ['final_bond_order'])["final_bond_order"].sum() # IMPORTANT: This assumes that the cross sectional area can be calculated this way - a_fbo = final_bond_order/(float(structure.lattice.volume)/float(structure.lattice.c)) - #print("area of this is %s" % (float(structure.lattice.volume)/float(structure.lattice.c))) + a_fbo = final_bond_order / ( + float(structure.lattice.volume) / float(structure.lattice.c) + ) + # print("area of this is %s" % (float(structure.lattice.volume)/float(structure.lattice.c))) cp_bo.append(a_fbo) clp_df_list.append(clp_df) return cp_bo, clp_df_list -def VASPDDEC_2_CSV( filename, output_filename ): + +def VASPDDEC_2_CSV(filename, output_filename): flist = open(filename).readlines() parsing = False matrix = [] @@ -809,18 +1158,21 @@ def VASPDDEC_2_CSV( filename, output_filename ): parsing = False if parsing: matrix.append(line) - #print(line) + # print(line) if "The final bond pair matrix is" in line: parsing = True - f=open(output_filename,'w') - f.write("atom1 atom2 repeata repeatb repeatc " + \ - "min-na max-na min-nb max-nb min-nc max-nc contact-exchange avg-spin-pol-bonding-term overlap-population " + \ - "isoaepfcbo coord-term-tanh pairwise-term exp-term-comb-coord-pairwise " + \ - "bond-idx-before-self-exch final_bond_order \n") + f = open(output_filename, "w") + f.write( + "atom1 atom2 repeata repeatb repeatc " + + "min-na max-na min-nb max-nb min-nc max-nc contact-exchange avg-spin-pol-bonding-term 
overlap-population " + + "isoaepfcbo coord-term-tanh pairwise-term exp-term-comb-coord-pairwise " + + "bond-idx-before-self-exch final_bond_order \n" + ) for bond in matrix: f.write(bond) f.close() + def get_1sol_site_SBO(GB_path): """ Returns summed bond order (DDEC6) for a single-solute case @@ -830,14 +1182,17 @@ def get_1sol_site_SBO(GB_path): GB_path = directory path to GB """ structure = Structure.from_file("%s\\CONTCAR" % GB_path) - solute_no = [i for i, site in enumerate(structure) if site.species_string != "Fe"][0] + solute_no = [i for i, site in enumerate(structure) if site.species_string != "Fe"][ + 0 + ] BO_dict = get_BondOrderInfo(GB_path) - SBO = BO_dict[solute_no]['bond_order_sum'] + SBO = BO_dict[solute_no]["bond_order_sum"] atoms_bond_array = [] - for i in BO_dict[solute_no]['bonded_to']: - atoms_bond_array.append(i['index']) + for i in BO_dict[solute_no]["bonded_to"]: + atoms_bond_array.append(i["index"]) return SBO, atoms_bond_array, solute_no + def get_BondOrderInfo(filename): """ Internal command to process pairwise bond order information @@ -875,6 +1230,7 @@ def get_BondOrderInfo(filename): return bond_order_info + def get_site_SBO(filename, site): """ Internal command to process pairwise bond order information @@ -909,12 +1265,13 @@ def get_site_SBO(filename, site): ) elif "The sum of bond orders for this atom" in line: bond_order_info[start_idx]["bond_order_sum"] = float(l[-1]) -# site_sbo = 0 -# for j in bond_order_info[site]['bonded_to']: -# site_sbo += j['bond_order'] - site_sbo = bond_order_info[site]['bond_order_sum'] + # site_sbo = 0 + # for j in bond_order_info[site]['bonded_to']: + # site_sbo += j['bond_order'] + site_sbo = bond_order_info[site]["bond_order_sum"] return site_sbo + def get_solution_energy(GB, element): """ Returns solution energy for an element of a single-solute case in eV @@ -925,14 +1282,17 @@ def get_solution_energy(GB, element): """ results = pd.read_csv("%s\\%s\\%s\\info.csv" % (fp_Seg1_path, GB, element)) # Energy of the pure slab structure - E_slab = results.loc[results['system'] == "SLAB"]['energy'].values[0] + E_slab = results.loc[results["system"] == "SLAB"]["energy"].values[0] # Energy of the slab + 1 solute structure - E_slab_imp = results.loc[results['system'].str.contains('-SLAB-')]['energy'].values[0] + E_slab_imp = results.loc[results["system"].str.contains("-SLAB-")]["energy"].values[ + 0 + ] # Energy of solution solution_energy = E_slab_imp - E_slab return solution_energy + def get_1sol_cohesion_summary(GB_string): """ Returns a cohesion-energy of segregation summary df that is used for generating @@ -940,42 +1300,56 @@ def get_1sol_cohesion_summary(GB_string): """ RGS_1sol_df = get_1sol_etarigid_cohesion_df("%s\\%s" % (fp_Wsep1_rigid, GB_string)) Wsep_rel_1sol_df = get_1sol_etarel_cohesion_df("%s\\%s" % (fp_Wsep1_rel, GB_string)) - rig_df_merge = RGS_1sol_df.copy()[["system", "Wsep_RGS", "cleavage_planes", "Wsep_RGS_list"]] + rig_df_merge = RGS_1sol_df.copy()[ + ["system", "Wsep_RGS", "cleavage_planes", "Wsep_RGS_list"] + ] rel_df_merge = Wsep_rel_1sol_df.copy()[["system_base", "Wsep_rel"]] - rel_df_merge = rel_df_merge.rename(columns = {"system_base" : "system"}) + rel_df_merge = rel_df_merge.rename(columns={"system_base": "system"}) df = pd.merge(rig_df_merge, rel_df_merge) - df["eta_RGS"] = [np.round(row.Wsep_RGS - df.loc[df['system'] == "GB"].Wsep_RGS.values[0],2) \ - for _, row in df.iterrows()] - df["eta_rel"] = [np.round(row.Wsep_rel - df.loc[df['system'] == "GB"].Wsep_rel.values[0],2) \ - for _, row in 
df.iterrows()] - df['d_eta'] = df["eta_rel"] - df["eta_RGS"] - df["eta_rel_pct"] = (( df["eta_rel"] * 100) \ - / df.loc[df['system'] == "GB"].Wsep_rel.values[0]) - df["eta_RGS_pct"] = (( df["eta_RGS"] * 100 ) \ - / df.loc[df['system'] == "GB"].Wsep_RGS.values[0]) + df["eta_RGS"] = [ + np.round(row.Wsep_RGS - df.loc[df["system"] == "GB"].Wsep_RGS.values[0], 2) + for _, row in df.iterrows() + ] + df["eta_rel"] = [ + np.round(row.Wsep_rel - df.loc[df["system"] == "GB"].Wsep_rel.values[0], 2) + for _, row in df.iterrows() + ] + df["d_eta"] = df["eta_rel"] - df["eta_RGS"] + df["eta_rel_pct"] = (df["eta_rel"] * 100) / df.loc[ + df["system"] == "GB" + ].Wsep_rel.values[0] + df["eta_RGS_pct"] = (df["eta_RGS"] * 100) / df.loc[ + df["system"] == "GB" + ].Wsep_RGS.values[0] df["element"] = [x.system.split(sep="-")[0] for _, x in df.iterrows()] - min_bo = []; bo_df_list = []; bo_array_list = [] + min_bo = [] + bo_df_list = [] + bo_array_list = [] for idx, row in RGS_1sol_df.iterrows(): - cp_array = row['cleavage_planes'] - system = row['system'] - bo_array, bodf_list = cp_bondorder(structure_path = "%s\\%s\\%s\\CONTCAR" % - (fp_BO1, GB_string, system),\ - DDEC_output_path = "%s\\%s\\%s" - % (fp_BO1, GB_string, system),\ - cleavage_plane_array = cp_array,\ - bo_threshold = 0) + cp_array = row["cleavage_planes"] + system = row["system"] + bo_array, bodf_list = cp_bondorder( + structure_path="%s\\%s\\%s\\CONTCAR" % (fp_BO1, GB_string, system), + DDEC_output_path="%s\\%s\\%s" % (fp_BO1, GB_string, system), + cleavage_plane_array=cp_array, + bo_threshold=0, + ) bo_array_list.append(bo_array) min_bo.append(min(bo_array)) - bo_df = bodf_list[np.argmin(bo_array)][["atom1_ele", "atom2_ele", "atom1", "atom2", "final_bond_order"]] + bo_df = bodf_list[np.argmin(bo_array)][ + ["atom1_ele", "atom2_ele", "atom1", "atom2", "final_bond_order"] + ] bo_df = bo_df[bo_df["final_bond_order"] > 0.01] bo_df_list.append(bo_df) - df['ANSBO'] = min_bo - df["eta_ANSBO"] = [row.ANSBO - df.loc[df['system'] == "GB"].ANSBO.values[0] \ - for _, row in df.iterrows()] - df['bond_df'] = bo_df_list + df["ANSBO"] = min_bo + df["eta_ANSBO"] = [ + row.ANSBO - df.loc[df["system"] == "GB"].ANSBO.values[0] + for _, row in df.iterrows() + ] + df["bond_df"] = bo_df_list eseg_list = [] # Section that extracts information about summed bond orders SBO_list = [] @@ -986,14 +1360,16 @@ def get_1sol_cohesion_summary(GB_string): pGB_SBO = np.nan else: SBO = get_1sol_site_SBO("%s\\%s\\%s" % (fp_BO1, GB_string, row.system))[0] - solute_no = get_1sol_site_SBO("%s\\%s\\%s" % (fp_BO1, GB_string, row.system))[2] + solute_no = get_1sol_site_SBO( + "%s\\%s\\%s" % (fp_BO1, GB_string, row.system) + )[2] pGB_SBO = get_site_SBO("%s\\%s\\%s" % (fp_BO1, GB_string, "GB"), solute_no) - #print("%s\\%s\\%s" % (fp_BO1, GB_string, row.system), solute_no) + # print("%s\\%s\\%s" % (fp_BO1, GB_string, row.system), solute_no) SBO_list.append(SBO) pGB_SBO_list.append(pGB_SBO) - df['site_SBO'] = SBO_list - df['site_pGB_SBO'] = pGB_SBO_list - df['site_SBO_delta'] = df['site_SBO'] - df['site_pGB_SBO'] + df["site_SBO"] = SBO_list + df["site_pGB_SBO"] = pGB_SBO_list + df["site_SBO_delta"] = df["site_SBO"] - df["site_pGB_SBO"] # Exception in the case of the S11-RA110-S3-32 case: # Take the segregation energies from the 2x2 cell instead of the 1x1 cell # This was done since interface reconstruction occurs heavily in the 1x1 cell @@ -1001,11 +1377,14 @@ def get_1sol_cohesion_summary(GB_string): GB_string = "S11-RA110-S3-32-2x2" for _, row in df.iterrows(): if 
row.system.split(sep="-")[0] != "GB": - #print(row.system.split(sep="-")[0]) - eseg_df = get_1sol_df("%s\\%s\\%s" % - (fp_Seg1_path, GB_string, row.system.split(sep="-")[0]), - midpoint = 0.5094) - eseg = np.round(eseg_df.loc[eseg_df["system"] == row.system].E_seg.values[0],3) + # print(row.system.split(sep="-")[0]) + eseg_df = get_1sol_df( + "%s\\%s\\%s" % (fp_Seg1_path, GB_string, row.system.split(sep="-")[0]), + midpoint=0.5094, + ) + eseg = np.round( + eseg_df.loc[eseg_df["system"] == row.system].E_seg.values[0], 3 + ) else: eseg = np.nan eseg_list.append(eseg) diff --git a/utils/chargemol.py b/utils/chargemol.py index 5da8ba1..c4e25c6 100644 --- a/utils/chargemol.py +++ b/utils/chargemol.py @@ -1,8 +1,9 @@ - import os from pymatgen.core import Structure, Element -from pymatgen.command_line.chargemol_caller import ChargemolAnalysis as PMGChargemolAnalysis +from pymatgen.command_line.chargemol_caller import ( + ChargemolAnalysis as PMGChargemolAnalysis, +) import pandas as pd import numpy as np @@ -13,7 +14,8 @@ import matplotlib.pyplot as plt import matplotlib.ticker as ticker -import time +import time + def get_stats(property_list, property_str): """ @@ -40,27 +42,51 @@ def get_stats(property_list, property_str): f"{property_str}_max": np.max(property_list), } -def check_chargemol_output_present(directory,\ - required_files = ["DDEC6_even_tempered_atomic_spin_moments.xyz",\ - "DDEC6_even_tempered_net_atomic_charges.xyz",\ - "DDEC_atomic_Rfourth_moments.xyz",\ - "overlap_populations.xyz",\ - "DDEC6_even_tempered_bond_orders.xyz",\ - "DDEC_atomic_Rcubed_moments.xyz",\ - "DDEC_atomic_Rsquared_moments.xyz",\ - "POTCAR"]): - missing_files = [file for file in required_files if not os.path.exists(os.path.join(directory, file))] + +def check_chargemol_output_present( + directory, + required_files=[ + "DDEC6_even_tempered_atomic_spin_moments.xyz", + "DDEC6_even_tempered_net_atomic_charges.xyz", + "DDEC_atomic_Rfourth_moments.xyz", + "overlap_populations.xyz", + "DDEC6_even_tempered_bond_orders.xyz", + "DDEC_atomic_Rcubed_moments.xyz", + "DDEC_atomic_Rsquared_moments.xyz", + "POTCAR", + ], +): + missing_files = [ + file + for file in required_files + if not os.path.exists(os.path.join(directory, file)) + ] if missing_files: return False else: return True # All required files are present - + + def summarise_DDEC_data(directory, bond_order_threshold=0.05): if not check_chargemol_output_present(directory): # Some files are missing, return a DataFrame with NaN values and the filepath - columns = ["bond_order_std", "bond_order_mean", "bond_order_min", "bond_order_max", "n_bonds", - "element", "bond_order_sums", "ddec_charges", "cm5_charges", "ddec_rcubed_moments", - "ddec_rfourth_moments", "ddec_spin_moments", "dipoles", "charge_transfer", "partial_charge"] + columns = [ + "bond_order_std", + "bond_order_mean", + "bond_order_min", + "bond_order_max", + "n_bonds", + "element", + "bond_order_sums", + "ddec_charges", + "cm5_charges", + "ddec_rcubed_moments", + "ddec_rfourth_moments", + "ddec_spin_moments", + "dipoles", + "charge_transfer", + "partial_charge", + ] empty_data = [[np.nan] * len(columns)] ddec_df = pd.DataFrame(empty_data, columns=columns) ddec_df["filepath"] = directory @@ -74,18 +100,22 @@ def summarise_DDEC_data(directory, bond_order_threshold=0.05): df_thres = df[df["bond_order"] > bond_order_threshold] # This is a failsafe because certain atoms just don't bond (e.g. 
He/Ar) if len(df_thres) == 0: - df_thres = df + df_thres = df bo_stats_df = get_stats(df_thres.bond_order.tolist(), "bond_order") - bo_stats_df = pd.DataFrame.from_dict(bo_stats_df, orient='index', columns=[str(entries)]).T + bo_stats_df = pd.DataFrame.from_dict( + bo_stats_df, orient="index", columns=[str(entries)] + ).T bo_stats_df["n_bonds"] = 0 else: bo_stats_df = get_stats(df_thres.bond_order.tolist(), "bond_order") - bo_stats_df = pd.DataFrame.from_dict(bo_stats_df, orient='index', columns=[str(entries)]).T + bo_stats_df = pd.DataFrame.from_dict( + bo_stats_df, orient="index", columns=[str(entries)] + ).T bo_stats_df["n_bonds"] = len(df_thres) bo_df.append(bo_stats_df) element_symbol = ca.bond_order_dict[entries]["element"].symbol element_list.append(element_symbol) - + ddec_df = pd.concat(bo_df) ddec_df["filepath"] = directory ddec_df["element"] = element_list @@ -94,45 +124,63 @@ def summarise_DDEC_data(directory, bond_order_threshold=0.05): try: ddec_df["cm5_charges"] = ca.cm5_charges except Exception as e: - print(f"{directory}: FAILED DUE TO EXCEPTION {e}") + print(f"{directory}: FAILED DUE TO EXCEPTION {e}") ddec_df["cm5_charges"] = np.nan ddec_df["ddec_rcubed_moments"] = ca.ddec_rcubed_moments ddec_df["ddec_rfourth_moments"] = ca.ddec_rfourth_moments ddec_df["ddec_spin_moments"] = ca.ddec_spin_moments ddec_df["dipoles"] = ca.dipoles - ddec_df["charge_transfer"] = [ca.get_charge_transfer(i) for i in ca.bond_order_dict] - ddec_df["partial_charge"] = [ca.get_partial_charge(i) for i in ca.bond_order_dict] + ddec_df["charge_transfer"] = [ + ca.get_charge_transfer(i) for i in ca.bond_order_dict + ] + ddec_df["partial_charge"] = [ + ca.get_partial_charge(i) for i in ca.bond_order_dict + ] return ddec_df -def get_solute_summary_DDEC_data(directory, bond_order_threshold=0.05, base_solute="Fe"): - df = summarise_DDEC_data(directory=directory, bond_order_threshold=bond_order_threshold) - df = df[df["element"]==base_solute] + +def get_solute_summary_DDEC_data( + directory, bond_order_threshold=0.05, base_solute="Fe" +): + df = summarise_DDEC_data( + directory=directory, bond_order_threshold=bond_order_threshold + ) + df = df[df["element"] == base_solute] return df -class DatabaseGenerator(): - + +class DatabaseGenerator: + def __init__(self, parent_dir): self.parent_dir = parent_dir - - def build_database(self, - target_directory = None, - extract_directories = False, - cleanup=False, - keep_filenames_after_cleanup = [], - keep_filename_patterns_after_cleanup = [], - max_dir_count = None, - df_filename = None): - + + def build_database( + self, + target_directory=None, + extract_directories=False, + cleanup=False, + keep_filenames_after_cleanup=[], + keep_filename_patterns_after_cleanup=[], + max_dir_count=None, + df_filename=None, + ): + start_time = time.time() - + if target_directory: - dirs = find_chargemol_directories(parent_dir=target_directory, extract_tarballs=extract_directories) + dirs = find_chargemol_directories( + parent_dir=target_directory, extract_tarballs=extract_directories + ) else: - dirs = find_chargemol_directories(parent_dir=self.parent_dir, extract_tarballs=extract_directories) - - print(f"The total number of vasp directories that we are building the database out of is {len(dirs)}") - + dirs = find_chargemol_directories( + parent_dir=self.parent_dir, extract_tarballs=extract_directories + ) + + print( + f"The total number of vasp directories that we are building the database out of is {len(dirs)}" + ) + if max_dir_count: pkl_filenames = [] for i, chunks in 
enumerate(gen_tools.chunk_list(dirs, max_dir_count)): @@ -144,9 +192,11 @@ def build_database(self, db_filename = f"{i}.pkl" pkl_filenames.append(os.path.join(self.parent_dir, db_filename)) df.to_pickle(os.path.join(self.parent_dir, db_filename)) - step_taken_time = np.round(step_time - time.time(),3) - print(f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps") - + step_taken_time = np.round(step_time - time.time(), 3) + print( + f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps" + ) + df = pd.concat([pd.read_pickle(partial_df) for partial_df in pkl_filenames]) else: @@ -157,34 +207,49 @@ def build_database(self, df.to_pickle(os.path.join(self.parent_dir, f"vasp_database.pkl")) end_time = time.time() elapsed_time = end_time - start_time - + # not optional - keep the tarballs/zips.. keep_filename_patterns_after_cleanup += ".tar.gz" keep_filename_patterns_after_cleanup += ".tar.bz2" keep_filename_patterns_after_cleanup += ".zip" if cleanup: - gen_tools.cleanup_dir(directory_path=dirs, keep=True, files=[], file_patterns=[]) - parallelise(gen_tools.cleanup_dir, dirs, [True] * len(dirs), keep_filenames_after_cleanup*len(dirs), keep_filename_patterns_after_cleanup*len(dirs)) - + gen_tools.cleanup_dir( + directory_path=dirs, keep=True, files=[], file_patterns=[] + ) + parallelise( + gen_tools.cleanup_dir, + dirs, + [True] * len(dirs), + keep_filenames_after_cleanup * len(dirs), + keep_filename_patterns_after_cleanup * len(dirs), + ) + print("Elapsed time:", np.round(elapsed_time, 3), "seconds") return df - -class ChargemolAnalysis(): - def __init__(self, directory, extract_dir = False): + + +class ChargemolAnalysis: + def __init__(self, directory, extract_dir=False): self.directory = directory self._struct = None self._bond_matrix = None if extract_dir: directory = find_chargemol_directories(directory)[0] - if check_valid_chargemol_output(os.path.join(directory, "VASP_DDEC_analysis.output")): + if check_valid_chargemol_output( + os.path.join(directory, "VASP_DDEC_analysis.output") + ): self.parse_DDEC6_analysis_output() else: - print("No valid output available! Try extracting any tarballs? Set extract_dir=True") - + print( + "No valid output available! Try extracting any tarballs? 
Set extract_dir=True" + ) + def parse_DDEC6_analysis_output(self): - struct, bond_matrix = parse_DDEC6_analysis_output(os.path.join(self.directory, "VASP_DDEC_analysis.output")) + struct, bond_matrix = parse_DDEC6_analysis_output( + os.path.join(self.directory, "VASP_DDEC_analysis.output") + ) self.struct = struct self.bond_matrix = bond_matrix return struct, bond_matrix @@ -207,53 +272,73 @@ def set_bond_matrix(self, bond_matrix): def plot_ANSBO_profile(self): plot_ANSBO_profile_and_structure(self.struct, self.bond_matrix) - + def get_ANSBO_profile(self, axis=2, tolerance=0.1): - return get_ANSBO_all_cleavage_planes(self.struct, self.bond_matrix, axis=axis, tolerance=tolerance) + return get_ANSBO_all_cleavage_planes( + self.struct, self.bond_matrix, axis=axis, tolerance=tolerance + ) def get_min_ANSBO(self, axis=2, tolerance=0.1): - return min(get_ANSBO_all_cleavage_planes(self.struct, self.bond_matrix, axis=axis, tolerance=tolerance)) - + return min( + get_ANSBO_all_cleavage_planes( + self.struct, self.bond_matrix, axis=axis, tolerance=tolerance + ) + ) + def analyse_ANSBO(self, axis=2, tolerance=0.1): return analyse_ANSBO(self.directory, axis=axis, tolerance=tolerance) + def analyse_ANSBO(directory, axis=2, tolerance=0.1): - """ - - """ - struct, bond_matrix = parse_DDEC6_analysis_output(os.path.join(directory, "VASP_DDEC_analysis.output")) - atomic_layers = get_unique_values_in_nth_value(struct.cart_coords, axis, tolerance = tolerance) + """ """ + struct, bond_matrix = parse_DDEC6_analysis_output( + os.path.join(directory, "VASP_DDEC_analysis.output") + ) + atomic_layers = get_unique_values_in_nth_value( + struct.cart_coords, axis, tolerance=tolerance + ) cp_list = compute_average_pairs(atomic_layers) - ANSBO_profile = get_ANSBO_all_cleavage_planes(struct, bond_matrix, axis=axis, tolerance=tolerance) - - results_dict = {"layer_boundaries": atomic_layers, - "cleavage_coord": cp_list, - "ANSBO_profile": ANSBO_profile} + ANSBO_profile = get_ANSBO_all_cleavage_planes( + struct, bond_matrix, axis=axis, tolerance=tolerance + ) + + results_dict = { + "layer_boundaries": atomic_layers, + "cleavage_coord": cp_list, + "ANSBO_profile": ANSBO_profile, + } return results_dict - -def find_chargemol_directories(parent_dir, - filenames=["DDEC6_even_tempered_atomic_spin_moments.xyz", - "DDEC6_even_tempered_net_atomic_charges.xyz", - "DDEC_atomic_Rfourth_moments.xyz", - "overlap_populations.xyz", - "DDEC6_even_tempered_bond_orders.xyz", - "DDEC_atomic_Rcubed_moments.xyz", - "DDEC_atomic_Rsquared_moments.xyz", - "POTCAR"], - all_present=True, - extract_tarballs=True, - only_valid_output=True): + + +def find_chargemol_directories( + parent_dir, + filenames=[ + "DDEC6_even_tempered_atomic_spin_moments.xyz", + "DDEC6_even_tempered_net_atomic_charges.xyz", + "DDEC_atomic_Rfourth_moments.xyz", + "overlap_populations.xyz", + "DDEC6_even_tempered_bond_orders.xyz", + "DDEC_atomic_Rcubed_moments.xyz", + "DDEC_atomic_Rsquared_moments.xyz", + "POTCAR", + ], + all_present=True, + extract_tarballs=True, + only_valid_output=True, +): if extract_tarballs: - gen_tools.find_and_extract_files_from_tarballs_parallel(parent_dir=parent_dir, - extension=".tar.gz", - filenames=filenames, - suffix=None, - prefix=None) - - directories = gen_tools.find_directories_with_files(parent_dir=parent_dir, - filenames=filenames, - all_present=all_present) - + gen_tools.find_and_extract_files_from_tarballs_parallel( + parent_dir=parent_dir, + extension=".tar.gz", + filenames=filenames, + suffix=None, + prefix=None, + ) + + directories = 
gen_tools.find_directories_with_files( + parent_dir=parent_dir, filenames=filenames, all_present=all_present + ) + if only_valid_output: converged_list = [] non_converged_list = [] @@ -266,6 +351,7 @@ def find_chargemol_directories(parent_dir, directories = converged_list return directories + def parse_DDEC6_analysis_output(filename): """ Parses VASP_DDEC_analysis.output files and returns a Structure object and bond matrix. @@ -316,46 +402,69 @@ def parse_DDEC6_analysis_output(filename): flist = open(filename).readlines() bohr_to_angstrom_conversion_factor = 0.529177 - structure_lattice = gen_tools.parse_lines(flist, trigger_start="vectors", trigger_end="direct_coords")[0] - structure_lattice = np.array([list(map(float, line.split())) for line in structure_lattice]) + structure_lattice = gen_tools.parse_lines( + flist, trigger_start="vectors", trigger_end="direct_coords" + )[0] + structure_lattice = np.array( + [list(map(float, line.split())) for line in structure_lattice] + ) structure_lattice = structure_lattice * bohr_to_angstrom_conversion_factor - structure_frac_coords = gen_tools.parse_lines(flist, trigger_start="direct_coords", trigger_end="totnumA")[0] - structure_frac_coords = [np.array([float(coord) for coord in entry.split()]) for entry in structure_frac_coords] + structure_frac_coords = gen_tools.parse_lines( + flist, trigger_start="direct_coords", trigger_end="totnumA" + )[0] + structure_frac_coords = [ + np.array([float(coord) for coord in entry.split()]) + for entry in structure_frac_coords + ] # Convert atomic numbers to element symbols - structure_atomic_no = gen_tools.parse_lines(flist, trigger_start="(Missing core electrons will be inserted using stored core electron reference densities.)", trigger_end=" Finished the check for missing core electrons.") - structure_atomic_no = [Element.from_Z(int(atomic_number.split()[1])).symbol for atomic_number in structure_atomic_no[0]] + structure_atomic_no = gen_tools.parse_lines( + flist, + trigger_start="(Missing core electrons will be inserted using stored core electron reference densities.)", + trigger_end=" Finished the check for missing core electrons.", + ) + structure_atomic_no = [ + Element.from_Z(int(atomic_number.split()[1])).symbol + for atomic_number in structure_atomic_no[0] + ] structure = Structure(structure_lattice, structure_atomic_no, structure_frac_coords) - data_column_names = ['atom1',\ - 'atom2',\ - 'repeata',\ - 'repeatb',\ - 'repeatc',\ - 'min-na',\ - 'max-na',\ - 'min-nb',\ - 'max-nb',\ - 'min-nc',\ - 'max-nc',\ - 'contact-exchange',\ - 'avg-spin-pol-bonding-term',\ - 'overlap-population',\ - 'isoaepfcbo',\ - 'coord-term-tanh',\ - 'pairwise-term',\ - 'exp-term-comb-coord-pairwise',\ - 'bond-idx-before-self-exch',\ - 'final_bond_order'] - - bond_matrix = gen_tools.parse_lines(flist, trigger_start="The final bond pair matrix is", trigger_end="The legend for the bond pair matrix follows:")[0] + data_column_names = [ + "atom1", + "atom2", + "repeata", + "repeatb", + "repeatc", + "min-na", + "max-na", + "min-nb", + "max-nb", + "min-nc", + "max-nc", + "contact-exchange", + "avg-spin-pol-bonding-term", + "overlap-population", + "isoaepfcbo", + "coord-term-tanh", + "pairwise-term", + "exp-term-comb-coord-pairwise", + "bond-idx-before-self-exch", + "final_bond_order", + ] + + bond_matrix = gen_tools.parse_lines( + flist, + trigger_start="The final bond pair matrix is", + trigger_end="The legend for the bond pair matrix follows:", + )[0] bond_matrix = np.array([list(map(float, line.split())) for line in bond_matrix]) 
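    # Illustrative aside: each parsed row follows the data_column_names layout above, and the
    # DataFrame built next is what the ANSBO helpers further down consume, e.g. for a
    # hypothetical run directory:
    #
    #     struct, bm = parse_DDEC6_analysis_output("run_dir/VASP_DDEC_analysis.output")
    #     cp_list, ansbo_profile = get_ANSBO_all_cleavage_planes(struct, bm, axis=2, tolerance=0.1)
    #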
bond_matrix = pd.DataFrame(bond_matrix, columns=data_column_names) return structure, bond_matrix + def check_valid_chargemol_output(vasp_ddec_analysis_output_filepath): """ Checks if a VASP DDEC analysis output file indicates successful completion of Chargemol. @@ -384,18 +493,23 @@ def check_valid_chargemol_output(vasp_ddec_analysis_output_filepath): contains the necessary information. """ - convergence = gen_tools.search_line_in_file(vasp_ddec_analysis_output_filepath, "Finished chargemol in") + convergence = gen_tools.search_line_in_file( + vasp_ddec_analysis_output_filepath, "Finished chargemol in" + ) return convergence -def plot_structure_projection(structure, - projection_axis = [1, 2], - bond_matrix = None, - atom_size=250, - figsize=(8, 6), - cell_border_colour = "r", - atom_colour_dict = {}, - fontsize=16): + +def plot_structure_projection( + structure, + projection_axis=[1, 2], + bond_matrix=None, + atom_size=250, + figsize=(8, 6), + cell_border_colour="r", + atom_colour_dict={}, + fontsize=16, +): """ Plots the projection of a pymatgen structure on a 2D plane based on the specified projection axis. @@ -414,52 +528,73 @@ def plot_structure_projection(structure, # plt.figure(figsize=figsize) for site in structure: species = site.species_string - color = atom_colour_dict.get(species, 'b') # Default to blue if species not in atom_colour_dict - plt.scatter(site.coords[projection_axis[0]], site.coords[projection_axis[1]], color=color, s=atom_size, edgecolors='black') + color = atom_colour_dict.get( + species, "b" + ) # Default to blue if species not in atom_colour_dict + plt.scatter( + site.coords[projection_axis[0]], + site.coords[projection_axis[1]], + color=color, + s=atom_size, + edgecolors="black", + ) # Set plot title and labels - plt.title('Projection of the Cell', fontsize=16) - plt.xlabel(f'Axis {projection_axis[0]} Coordinate', fontsize=12) - plt.ylabel(f'Axis {projection_axis[1]} Coordinate', fontsize=12) + plt.title("Projection of the Cell", fontsize=16) + plt.xlabel(f"Axis {projection_axis[0]} Coordinate", fontsize=12) + plt.ylabel(f"Axis {projection_axis[1]} Coordinate", fontsize=12) # Set plot limits based on the atomic coordinates x_min, x_max = min(x_coords), max(x_coords) y_min, y_max = min(y_coords), max(y_coords) plt.xlim(x_min - 1, x_max + 1) plt.ylim(y_min - 1, y_max + 1) - + if bond_matrix is not None: - relevant_plot_bonds = bond_matrix[(bond_matrix['repeata'] == 0) & (bond_matrix['repeatb'] == 0)] - for idx, bonds in relevant_plot_bonds.iterrows(): - atom1 = int(bonds["atom1"])-1 - atom2 = int(bonds["atom2"])-1 - bondstrength = np.round(bonds["final_bond_order"],2) + relevant_plot_bonds = bond_matrix[ + (bond_matrix["repeata"] == 0) & (bond_matrix["repeatb"] == 0) + ] + for idx, bonds in relevant_plot_bonds.iterrows(): + atom1 = int(bonds["atom1"]) - 1 + atom2 = int(bonds["atom2"]) - 1 + bondstrength = np.round(bonds["final_bond_order"], 2) if bondstrength < 0.28: - c = 'r' + c = "r" else: - c = 'k' + c = "k" c = "k" - plt.plot([structure[atom1].coords[projection_axis[0]],structure[atom2].coords[projection_axis[0]]], - [structure[atom1].coords[projection_axis[1]],structure[atom2].coords[projection_axis[1]]], - '-', - color=c, - linewidth=bondstrength/0.56*5) - + plt.plot( + [ + structure[atom1].coords[projection_axis[0]], + structure[atom2].coords[projection_axis[0]], + ], + [ + structure[atom1].coords[projection_axis[1]], + structure[atom2].coords[projection_axis[1]], + ], + "-", + color=c, + linewidth=bondstrength / 0.56 * 5, + ) + # Draw the cell with a 
black border based on the projection_axis lattice_vectors = structure.lattice.matrix[projection_axis] # Draw the cell with a border based on the projection_axis - rect = plt.Rectangle((0,0), - structure.lattice.abc[projection_axis[0]], - structure.lattice.abc[projection_axis[1]], - edgecolor=cell_border_colour, - linewidth=3, - fill=False, - linestyle = '--') + rect = plt.Rectangle( + (0, 0), + structure.lattice.abc[projection_axis[0]], + structure.lattice.abc[projection_axis[1]], + edgecolor=cell_border_colour, + linewidth=3, + fill=False, + linestyle="--", + ) plt.gca().add_patch(rect) - plt.gca().set_aspect('equal') + plt.gca().set_aspect("equal") plt.grid() - + + def get_unique_values_in_nth_value(arr_list, n, tolerance): unique_values = [] for sublist in arr_list: @@ -473,6 +608,7 @@ def get_unique_values_in_nth_value(arr_list, n, tolerance): unique_values.append(value) return np.sort(unique_values) + def compute_average_pairs(lst): averages = [] for i in range(len(lst) - 1): @@ -480,11 +616,18 @@ def compute_average_pairs(lst): averages.append(average) return averages -def get_ANSBO(structure, bond_matrix, cleavage_plane, axis = 2): - bond_matrix['atom1pos'] = [structure[int(x)-1].coords[axis] for x in bond_matrix['atom1'].values] - bond_matrix['atom2pos'] = [structure[int(x)-1].coords[axis] for x in bond_matrix['atom2'].values] - clp_df = bond_matrix[(bond_matrix[['atom1pos','atom2pos']].max(axis=1) > cleavage_plane) - & (bond_matrix[['atom1pos','atom2pos']].min(axis=1) < cleavage_plane) ] + +def get_ANSBO(structure, bond_matrix, cleavage_plane, axis=2): + bond_matrix["atom1pos"] = [ + structure[int(x) - 1].coords[axis] for x in bond_matrix["atom1"].values + ] + bond_matrix["atom2pos"] = [ + structure[int(x) - 1].coords[axis] for x in bond_matrix["atom2"].values + ] + clp_df = bond_matrix[ + (bond_matrix[["atom1pos", "atom2pos"]].max(axis=1) > cleavage_plane) + & (bond_matrix[["atom1pos", "atom2pos"]].min(axis=1) < cleavage_plane) + ] if axis == 0: repeat1 = "repeatb" repeat2 = "repeatc" @@ -494,22 +637,30 @@ def get_ANSBO(structure, bond_matrix, cleavage_plane, axis = 2): elif axis == 2: repeat1 = "repeata" repeat2 = "repeatb" - + clp_df = clp_df.copy()[(clp_df[repeat1] == 0) | (clp_df[repeat2] == 0)] # We only want to calculate for atoms that exist in cell. 
This is important for bond order/area normalisation clp_df_countonce = clp_df.copy()[(clp_df[repeat1] == 0) & (clp_df[repeat2] == 0)] clp_df_counthalf = clp_df.copy()[(clp_df[repeat1] != 0) | (clp_df[repeat2] != 0)] # Basic summed bond order over CP - final_bond_order = clp_df_countonce.final_bond_order.sum() + 0.5*clp_df_counthalf.final_bond_order.sum() + final_bond_order = ( + clp_df_countonce.final_bond_order.sum() + + 0.5 * clp_df_counthalf.final_bond_order.sum() + ) # N largest - #final_bond_order = clp_df.nlargest(15, ['final_bond_order'])["final_bond_order"].sum() + # final_bond_order = clp_df.nlargest(15, ['final_bond_order'])["final_bond_order"].sum() # IMPORTANT: This assumes that the cross sectional area can be calculated this way - a_fbo = final_bond_order/(float(structure.lattice.volume)/float(structure.lattice.abc[axis])) - #print("area of this is %s" % (float(structure.lattice.volume)/float(structure.lattice.c))) + a_fbo = final_bond_order / ( + float(structure.lattice.volume) / float(structure.lattice.abc[axis]) + ) + # print("area of this is %s" % (float(structure.lattice.volume)/float(structure.lattice.c))) return a_fbo -def get_ANSBO_all_cleavage_planes(structure, bond_matrix, axis = 2, tolerance = 0.1): - atomic_layers = get_unique_values_in_nth_value(structure.cart_coords, axis, tolerance = tolerance) + +def get_ANSBO_all_cleavage_planes(structure, bond_matrix, axis=2, tolerance=0.1): + atomic_layers = get_unique_values_in_nth_value( + structure.cart_coords, axis, tolerance=tolerance + ) cp_list = compute_average_pairs(atomic_layers) ANSBO_profile = [] @@ -517,45 +668,47 @@ def get_ANSBO_all_cleavage_planes(structure, bond_matrix, axis = 2, tolerance = ANSBO_profile.append(get_ANSBO(structure, bond_matrix, cp)) return cp_list, ANSBO_profile -def plot_ANSBO_profile(structure, - bond_matrix, - projection_axis = [1, 2]): - ANSBO_values = get_ANSBO_all_cleavage_planes(structure, bond_matrix, projection_axis[-1]) - atomic_layer_coords = get_unique_values_in_nth_value(structure.cart_coords, projection_axis[-1], tolerance= 0.1) + +def plot_ANSBO_profile(structure, bond_matrix, projection_axis=[1, 2]): + ANSBO_values = get_ANSBO_all_cleavage_planes( + structure, bond_matrix, projection_axis[-1] + ) + atomic_layer_coords = get_unique_values_in_nth_value( + structure.cart_coords, projection_axis[-1], tolerance=0.1 + ) if len(atomic_layer_coords) != len(ANSBO_values) + 1: print("Error: Lengths of the lists are not compatible.") return - + # plt.figure(figsize=(3,10)) - + # Create lists for the x and y coordinates of the lines x_lines = [] y_lines = [] - + # Iterate over the elements of ANSBO_profile for i, value in enumerate(ANSBO_values): # Append x-coordinates for the horizontal lines x_lines.extend([value, value]) # Append y-coordinates for the horizontal lines - y_lines.extend([atomic_layer_coords[i], atomic_layer_coords[i+1]]) + y_lines.extend([atomic_layer_coords[i], atomic_layer_coords[i + 1]]) # Append x-coordinates for the vertical lines x_lines.append(value) # Append y-coordinates for the vertical lines - y_lines.append(atomic_layer_coords[i+1]) - + y_lines.append(atomic_layer_coords[i + 1]) + # Plotting the lines plt.plot(x_lines, y_lines) plt.grid() # Labeling the axes - plt.xlabel('ANSBO Profile') - plt.ylabel('Coordinates (Angstrom)') - -def plot_ANSBO_profile_and_structure(structure, - bond_matrix, - write=False, - filename="ANSBO.jpg", - fontsize=16): + plt.xlabel("ANSBO Profile") + plt.ylabel("Coordinates (Angstrom)") + + +def plot_ANSBO_profile_and_structure( + 
structure, bond_matrix, write=False, filename="ANSBO.jpg", fontsize=16 +): """ Plot the structure bond projection and the ANSBO profile side by side. @@ -570,20 +723,29 @@ def plot_ANSBO_profile_and_structure(structure, """ # Create a new figure with two subplots side by side - fig, axs = plt.subplots(1, 2, figsize=(10, 20), gridspec_kw={'width_ratios': [2, 1]}) - + fig, axs = plt.subplots( + 1, 2, figsize=(10, 20), gridspec_kw={"width_ratios": [2, 1]} + ) + # Activate the first subplot and call plot_structure_projection plt.sca(axs[0]) - plot_structure_projection(structure, bond_matrix=bond_matrix, figsize=(8, 6), atom_colour_dict={"Fe": "b", "Ac": "r"}) - plt.grid(True, which='major', linestyle='-') - plt.grid(True, which='minor', linestyle='--') + plot_structure_projection( + structure, + bond_matrix=bond_matrix, + figsize=(8, 6), + atom_colour_dict={"Fe": "b", "Ac": "r"}, + ) + plt.grid(True, which="major", linestyle="-") + plt.grid(True, which="minor", linestyle="--") axs[0].xaxis.set_minor_locator(ticker.MultipleLocator(1)) axs[0].yaxis.set_minor_locator(ticker.MultipleLocator(1)) # Activate the second subplot and call plot_ANSBO_profile plt.sca(axs[1]) - plot_ANSBO_profile(structure, bond_matrix) # Assuming you have defined the plot_ANSBO_profile function - plt.grid(True, which='major', linestyle='-') - plt.grid(True, which='minor', linestyle='--') + plot_ANSBO_profile( + structure, bond_matrix + ) # Assuming you have defined the plot_ANSBO_profile function + plt.grid(True, which="major", linestyle="-") + plt.grid(True, which="minor", linestyle="--") axs[1].xaxis.set_minor_locator(ticker.MultipleLocator(1)) axs[1].yaxis.set_minor_locator(ticker.MultipleLocator(1)) # Set the same y-axis limits for both subplots @@ -593,8 +755,8 @@ def plot_ANSBO_profile_and_structure(structure, plt.subplots_adjust(wspace=0.01) # Set the desired spacing between the subplots # Set titles for the subplots - axs[0].set_title('Structure Bond Projection', fontsize=fontsize) - axs[1].set_title('ANSBO Profile', fontsize=fontsize) + axs[0].set_title("Structure Bond Projection", fontsize=fontsize) + axs[1].set_title("ANSBO Profile", fontsize=fontsize) # Optionally, save the plot to a file if write: @@ -602,7 +764,10 @@ def plot_ANSBO_profile_and_structure(structure, # Display the plot plt.show() - + + def plot_ANSBO_profile_and_structure_from_dir(directory, extract_from_tarball=True): - structure, bond_matrix = parse_DDEC6_analysis_output(os.path.join(directory, "VASP_DDEC_analysis.output")) - plot_ANSBO_profile_and_structure(structure, bond_matrix) \ No newline at end of file + structure, bond_matrix = parse_DDEC6_analysis_output( + os.path.join(directory, "VASP_DDEC_analysis.output") + ) + plot_ANSBO_profile_and_structure(structure, bond_matrix) diff --git a/utils/custom_custodian_handlers.py b/utils/custom_custodian_handlers.py index 8f15da6..60e8f9f 100644 --- a/utils/custom_custodian_handlers.py +++ b/utils/custom_custodian_handlers.py @@ -30,9 +30,7 @@ from custodian.utils import backup from custodian.vasp.interpreter import VaspModder -__author__ = ( - "Han Lin Mai" -) +__author__ = "Han Lin Mai" __version__ = "0.1" __maintainer__ = "Han Mai" __email__ = "h.mai@mpie.de" @@ -51,6 +49,7 @@ "std_err.txt", } + class Han_CustomVaspErrorHandler(ErrorHandler): """Check if a run is converged.""" @@ -89,11 +88,20 @@ def correct(self): # expensive algorithms. 
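        # Illustrative summary of the ladder applied below: when the electronic loop has not
        # converged, ALGO is stepped VeryFast -> Fast -> Normal -> All, and only if none of
        # those switches apply is a mixing-parameter fallback (e.g. BMIX_MAG = 0.001)
        # attempted as a last resort.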
if len(actions) == 0: if algo == "veryfast": - actions.append({"dict": "INCAR", "action": {"_set": {"ALGO": "Fast"}}}) + actions.append( + {"dict": "INCAR", "action": {"_set": {"ALGO": "Fast"}}} + ) elif algo == "fast": - actions.append({"dict": "INCAR", "action": {"_set": {"ALGO": "Normal"}}}) - elif algo == "normal" and (v.incar.get("ISMEAR", -1) >= 0 or not 50 <= v.incar.get("IALGO", 38) <= 59): - actions.append({"dict": "INCAR", "action": {"_set": {"ALGO": "All"}}}) + actions.append( + {"dict": "INCAR", "action": {"_set": {"ALGO": "Normal"}}} + ) + elif algo == "normal" and ( + v.incar.get("ISMEAR", -1) >= 0 + or not 50 <= v.incar.get("IALGO", 38) <= 59 + ): + actions.append( + {"dict": "INCAR", "action": {"_set": {"ALGO": "All"}}} + ) else: # Try mixing as last resort new_settings = { @@ -105,8 +113,12 @@ def correct(self): "BMIX_MAG": 0.001, } - if not all(v.incar.get(k, "") == val for k, val in new_settings.items()): - actions.append({"dict": "INCAR", "action": {"_set": new_settings}}) + if not all( + v.incar.get(k, "") == val for k, val in new_settings.items() + ): + actions.append( + {"dict": "INCAR", "action": {"_set": new_settings}} + ) elif not v.converged_ionic: # Just continue optimizing and let other handlers fix ionic @@ -123,4 +135,4 @@ def correct(self): return {"errors": ["Unconverged"], "actions": actions} # Unfixable error. Just return None for actions. - return {"errors": ["Unconverged"], "actions": None} \ No newline at end of file + return {"errors": ["Unconverged"], "actions": None} diff --git a/utils/functions.py b/utils/functions.py index 522e7fa..6a6777a 100644 --- a/utils/functions.py +++ b/utils/functions.py @@ -6,15 +6,19 @@ from pymatgen.io.vasp.inputs import Potcar, Incar, Kpoints from utils.jobfile import jobfile + potcar_library_path = "/root/POTCAR_Library/GGA" -#potcar_library_path = "/u/hmai/pyiron-resources-cmmc/vasp/potentials/potpaw_PBE" +# potcar_library_path = "/u/hmai/pyiron-resources-cmmc/vasp/potentials/potpaw_PBE" + +sites_to_study = { + "S11-RA110-S3-32": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], + "S3-RA110-S1-11": [20, 22, 24, 26, 28, 30, 32, 34, 36], + "S3-RA110-S1-12": [12, 14, 16, 18, 20, 22, 24], + "S5-RA001-S210": [24, 27, 29, 31, 33, 35, 37], + "S5-RA001-S310": [23, 27, 33, 37, 40], + "S9-RA110-S2-21": list(range(23, 37)), +} -sites_to_study = {"S11-RA110-S3-32": [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], - "S3-RA110-S1-11": [20, 22, 24, 26, 28, 30, 32, 34, 36], - "S3-RA110-S1-12": [12, 14, 16, 18, 20, 22, 24], - "S5-RA001-S210": [24, 27, 29, 31, 33, 35, 37], - "S5-RA001-S310": [23, 27, 33, 37, 40], - "S9-RA110-S2-21": list(range(23, 37))} def structures_from_vasp_folder(folder_path): # Initialize an empty dictionary to store the structures @@ -23,24 +27,27 @@ def structures_from_vasp_folder(folder_path): # Loop through all files in the folder for filename in os.listdir(folder_path): # Check if the file is a .vasp file - if filename.endswith('.vasp'): + if filename.endswith(".vasp"): # Read in the Structure from the VASP file using pymatgen's Structure class structure = Structure.from_file(os.path.join(folder_path, filename)) # Strip the .vasp extension from the filename and use it as the dictionary key key = os.path.splitext(filename)[0] # Assign the Structure object to the dictionary with the key structures_dict[key] = structure - + # Return the dictionary containing the structures return structures_dict -def createFolder(directory, delete_folder='no'): - import os; import shutil + +def createFolder(directory, 
delete_folder="no"): + import os + import shutil + if not os.path.exists(directory): os.makedirs(directory) else: - if delete_folder == 'no': - #print('no replacement/deletion created due to folder existing') + if delete_folder == "no": + # print('no replacement/deletion created due to folder existing') x = 1 else: print("removing directory...") @@ -51,22 +58,30 @@ def createFolder(directory, delete_folder='no'): else: print("given path is a special file - manually remove") + def get_immediate_subdirectories(a_dir): - return [f.path for f in os.scandir(a_dir) if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints"] - -def generateINCAR(structure, - path = os.path.join(os.getcwd(), "INCAR"), - ISIF = 2, - ISPIN = 1, - ENCUT = 350, - EDIFF = 1E-4, - EDIFFG = -0.02, - NCORE = 4, - KPAR = 1, - SYSTEM = "filltext", - functional = 'PBE', - reverse_magmom = False, - base_element = "Fe"): + return [ + f.path + for f in os.scandir(a_dir) + if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints" + ] + + +def generateINCAR( + structure, + path=os.path.join(os.getcwd(), "INCAR"), + ISIF=2, + ISPIN=1, + ENCUT=350, + EDIFF=1e-4, + EDIFFG=-0.02, + NCORE=4, + KPAR=1, + SYSTEM="filltext", + functional="PBE", + reverse_magmom=False, + base_element="Fe", +): INCAR_file = Incar() INCAR_file = INCAR_file.from_file(path) @@ -78,157 +93,170 @@ def generateINCAR(structure, INCAR_file["NCORE"] = NCORE INCAR_file["KPAR"] = KPAR - dictionary_of_functionals = {"PW91" : '91', - "PBE" : 'PE', - "AM05" : 'AM', - "PBEsol": 'PS', - "Hendin-Lundquist" : "HL", - "Ceperley-Alder" : "CA", - "Perdew-Zunger" : "PZ", - "Wigner" : 'WI', - "Revised-PBE-Pade" : "RP", - "revPBE" : "RE", - "Vosko-Wilk-Nusair" : "VW", - "B3LYP-LDA-VWN3" : "B3", - "B3LYP-LDA-BWN5" : "B5", - "BEEF" : "BF", - "no-xc" : "CO"} + dictionary_of_functionals = { + "PW91": "91", + "PBE": "PE", + "AM05": "AM", + "PBEsol": "PS", + "Hendin-Lundquist": "HL", + "Ceperley-Alder": "CA", + "Perdew-Zunger": "PZ", + "Wigner": "WI", + "Revised-PBE-Pade": "RP", + "revPBE": "RE", + "Vosko-Wilk-Nusair": "VW", + "B3LYP-LDA-VWN3": "B3", + "B3LYP-LDA-BWN5": "B5", + "BEEF": "BF", + "no-xc": "CO", + } # These magmoms are from projects past and present... 
Feel free to alter them # Ni-H from Ni-GB manuscript # dictionary_of_magmom = {"Ni" : 2.0, # "H" : 0.0} # rest from Fe-bulk manuscript - dictionary_of_magmom = {'Ac': -0.196, - 'Ag': 0.114, - 'Al': -0.17, - 'Ar': 0.354, - 'As': -0.136, - 'At': -0.084, - 'Au': 0.308, - 'Ba': -0.25, - 'Bi': -0.302, - 'Br': 0.158, - 'Ca': -0.494, - 'Cd': -0.158, - 'Ce': -0.928, - 'Cl': 0.286, - 'Co': 3.37, - 'Cr': -3.71, - 'Cs': 0.06, - 'Cu': 0.238, - 'Dy': 9.11, - 'Er': 5.048, - 'Eu': -13.498, - 'Fe': 3.0, - 'Fr': -0.046, - 'Ga': -0.4, - 'Gd': -14.248, - 'Ge': -0.258, - 'Hf': -1.17, - 'Hg': -0.1, - 'Ho': 6.942, - 'I': -0.024, - 'In': -0.51, - 'Ir': 0.756, - 'K': 0.152, - 'Kr': 0.384, - 'La': -0.416, - 'Lu': -0.544, - 'Mg': -0.128, - 'Mn': -4.128, - 'Mo': -1.662, - 'Na': -0.09, - 'Nb': -1.518, - 'Nd': -6.142, - 'Ne': 0.02, - #'Ne': -3.0, - 'Ni': 1.774, - 'Os': -0.224, - 'P': -0.112, - 'Pa': -1.184, - 'Pb': -0.41, - 'Pd': 0.73, - 'Pm': -8.76, - 'Po': -0.188, - 'Pr': -3.256, - 'Pt': 0.74, - 'Ra': -0.096, - 'Rb': 0.11, - 'Re': -1.27, - 'Rh': 1.194, - 'Rn': 0.032, - 'Ru': 0.454, - 'S': 0.082, - 'Sb': -0.186, - 'Sc': -1.12, - 'Se': -0.008, - 'Si': -0.194, - 'Sm': -10.964, - 'Sn': -0.426, - 'Sr': -0.128, - 'Ta': -1.588, - 'Tb': -12.568, - 'Tc': -1.208, - 'Te': -0.13, - 'Th': -0.508, - 'Ti': -1.93, - 'Tl': -0.45, - 'Tm': 2.776, - 'U': -2.76, - 'V': -2.86, - 'W': -1.606, - 'Xe': 0.288, - 'Y': -0.668, - 'Yb': 0.414, - 'Zn': -0.196, - 'Zr': -0.888, - 'H' : -0.018, - 'He': -0.010, - 'Li': -0.168, - 'Be': -0.302, - 'B' : -0.314, - 'C' : -0.204, - 'N' : 0.094, - 'O' : 0.454, - 'F' : 0.348} + dictionary_of_magmom = { + "Ac": -0.196, + "Ag": 0.114, + "Al": -0.17, + "Ar": 0.354, + "As": -0.136, + "At": -0.084, + "Au": 0.308, + "Ba": -0.25, + "Bi": -0.302, + "Br": 0.158, + "Ca": -0.494, + "Cd": -0.158, + "Ce": -0.928, + "Cl": 0.286, + "Co": 3.37, + "Cr": -3.71, + "Cs": 0.06, + "Cu": 0.238, + "Dy": 9.11, + "Er": 5.048, + "Eu": -13.498, + "Fe": 3.0, + "Fr": -0.046, + "Ga": -0.4, + "Gd": -14.248, + "Ge": -0.258, + "Hf": -1.17, + "Hg": -0.1, + "Ho": 6.942, + "I": -0.024, + "In": -0.51, + "Ir": 0.756, + "K": 0.152, + "Kr": 0.384, + "La": -0.416, + "Lu": -0.544, + "Mg": -0.128, + "Mn": -4.128, + "Mo": -1.662, + "Na": -0.09, + "Nb": -1.518, + "Nd": -6.142, + "Ne": 0.02, + #'Ne': -3.0, + "Ni": 1.774, + "Os": -0.224, + "P": -0.112, + "Pa": -1.184, + "Pb": -0.41, + "Pd": 0.73, + "Pm": -8.76, + "Po": -0.188, + "Pr": -3.256, + "Pt": 0.74, + "Ra": -0.096, + "Rb": 0.11, + "Re": -1.27, + "Rh": 1.194, + "Rn": 0.032, + "Ru": 0.454, + "S": 0.082, + "Sb": -0.186, + "Sc": -1.12, + "Se": -0.008, + "Si": -0.194, + "Sm": -10.964, + "Sn": -0.426, + "Sr": -0.128, + "Ta": -1.588, + "Tb": -12.568, + "Tc": -1.208, + "Te": -0.13, + "Th": -0.508, + "Ti": -1.93, + "Tl": -0.45, + "Tm": 2.776, + "U": -2.76, + "V": -2.86, + "W": -1.606, + "Xe": 0.288, + "Y": -0.668, + "Yb": 0.414, + "Zn": -0.196, + "Zr": -0.888, + "H": -0.018, + "He": -0.010, + "Li": -0.168, + "Be": -0.302, + "B": -0.314, + "C": -0.204, + "N": 0.094, + "O": 0.454, + "F": 0.348, + } ele_list, ele_count = stackElementString(structure) # This is a funny quirk involving 4d metals - we have to adjust the LMAXMIX flag for faster convergence if [i for i in ["Mo", "Nb"] if i in ele_list]: - #print("Mo/Nb present, LMAXMIX = 4 adjustment") + # print("Mo/Nb present, LMAXMIX = 4 adjustment") INCAR_file["LMAXMIX"] = 4 elif "W" in ele_list: - #print("W present, LMAXMIX = 6 adjustment") + # print("W present, LMAXMIX = 6 adjustment") INCAR_file["LMAXMIX"] = 6 else: - INCAR_file.pop('LMAXMIX',None) + 
INCAR_file.pop("LMAXMIX", None) if ISPIN != 2: - INCAR_file.pop("MAGMOM",None) - INCAR_file.pop('BMIX_MAG',None) - INCAR_file.pop('AMIX_MAG',None) + INCAR_file.pop("MAGMOM", None) + INCAR_file.pop("BMIX_MAG", None) + INCAR_file.pop("AMIX_MAG", None) else: - incar_magmom_str = '' + incar_magmom_str = "" for idx, element in enumerate(ele_list): if reverse_magmom: if element == base_element: - incar_magmom_str += "%s*%s " % (ele_count[idx], dictionary_of_magmom[ele_list[idx]]) + incar_magmom_str += "%s*%s " % ( + ele_count[idx], + dictionary_of_magmom[ele_list[idx]], + ) else: - incar_magmom_str += "%s*%s " % (ele_count[idx], -dictionary_of_magmom[ele_list[idx]]) + incar_magmom_str += "%s*%s " % ( + ele_count[idx], + -dictionary_of_magmom[ele_list[idx]], + ) else: - incar_magmom_str += "%s*%s " % (ele_count[idx], dictionary_of_magmom[ele_list[idx]]) + incar_magmom_str += "%s*%s " % ( + ele_count[idx], + dictionary_of_magmom[ele_list[idx]], + ) INCAR_file["MAGMOM"] = incar_magmom_str - if functional == "LDA": - INCAR_file.pop('GGA', None) + INCAR_file.pop("GGA", None) else: INCAR_file["GGA"] = dictionary_of_functionals[functional] # print('functional key is %s' % dictionary_of_functionals[functional]) return INCAR_file + def stackElementString(structure): site_element_list = [site.species_string for site in structure] past_element = site_element_list[0] @@ -246,52 +274,54 @@ def stackElementString(structure): element_count.append(count) return element_list, element_count -def createPOTCAR(structure, path = os.getcwd()): + +def createPOTCAR(structure, path=os.getcwd()): element_list = stackElementString(structure)[0] potcar_paths = [] for element in element_list: if element == "Nb": - element = "Nb_sv" # Use 13 electron - element = "Nb_pv" # Use 11 electron + element = "Nb_sv" # Use 13 electron + element = "Nb_pv" # Use 11 electron elif element == "K": - element = "K_sv" # 9 electron - element = "K_pv" # 7 electron + element = "K_sv" # 9 electron + element = "K_pv" # 7 electron elif element == "Ca": - element = "Ca_sv" # 9 electron - element = "Ca_pv" # 7 electron + element = "Ca_sv" # 9 electron + element = "Ca_pv" # 7 electron elif element == "Rb": - element = "Rb_sv" # 9 electron - element = "Rb_pv" # 7 electron + element = "Rb_sv" # 9 electron + element = "Rb_pv" # 7 electron elif element == "Sr": - element = "Sr_sv" # 9 electron + element = "Sr_sv" # 9 electron elif element == "Cs": - element = "Cs_sv" # 9 electron + element = "Cs_sv" # 9 electron elif element == "Ba": - element = "Ba_sv" # 10 electron + element = "Ba_sv" # 10 electron elif element == "Fr": - element = "Fr_sv" # 9 electron + element = "Fr_sv" # 9 electron elif element == "Ra": - element = "Ra_sv" # 9 electron + element = "Ra_sv" # 9 electron elif element == "Y": - element = "Y_sv" # 9 electron + element = "Y_sv" # 9 electron elif element == "Zr": - element = "Zr_sv" # 10 electron + element = "Zr_sv" # 10 electron elif element == "Fr": - element = "Fr_sv" # 9 electron + element = "Fr_sv" # 9 electron elif element == "Ra": - element = "Ra_sv" # 9 electron + element = "Ra_sv" # 9 electron elif element == "Y": - element = "Y_sv" # 9 electron + element = "Y_sv" # 9 electron potcar_paths.append(os.path.join(potcar_library_path, element, "POTCAR")) - with open(os.path.join(path, "POTCAR"),'wb') as wfd: + with open(os.path.join(path, "POTCAR"), "wb") as wfd: for f in potcar_paths: - with open(f,'rb') as fd: + with open(f, "rb") as fd: shutil.copyfileobj(fd, wfd) + class KPOINTS: """ Class for KPOINTS object for passing into 
createJobFolder @@ -303,13 +333,12 @@ class KPOINTS: shift: optional shift of mesh, input as list e.g. [0, 0, 0] """ + def __init__(self, subdivs, shift): self.subdivs = subdivs self.shift = shift - def to_file(self,\ - case_name = 'KPOINTS',\ - filepath = os.getcwd()): + def to_file(self, case_name="KPOINTS", filepath=os.getcwd()): """ Writes KPOINTS file with MP gamma centred grid: @@ -318,48 +347,58 @@ def to_file(self,\ """ createFolder(filepath) - f = io.open(os.path.join(filepath, "KPOINTS"), 'w', newline='\n') - with open(os.path.join(filepath, "KPOINTS"), 'a', newline='\n') as f: + f = io.open(os.path.join(filepath, "KPOINTS"), "w", newline="\n") + with open(os.path.join(filepath, "KPOINTS"), "a", newline="\n") as f: # File name (just string on first line of KPOINTS) - f.write('%s\n' % case_name) + f.write("%s\n" % case_name) # Use automatic generation "0" - f.write('0\n') + f.write("0\n") # Monkhorst-Pack Gamma centred grid - f.write('Gamma\n') + f.write("Gamma\n") # Subdivisions along reciprocal lattice vectors - subdiv_string = '' + subdiv_string = "" for i in self.subdivs: subdiv_string += "%s " % str(i) - f.write('%s\n' % subdiv_string) + f.write("%s\n" % subdiv_string) # optional shift of the mesh (s_1, s_2, s_3) - shift_string = '' + shift_string = "" for i in self.shift: shift_string += "%s " % str(i) - f.write('%s\n' % shift_string) + f.write("%s\n" % shift_string) f.close() -def createJobFolder(structure,\ - KPOINT = None,\ - folder_path = os.path.join(os.getcwd(), "jobfolder"),\ - INCAR = None,\ - jobfile = None,\ - quiet=True): + +def createJobFolder( + structure, + KPOINT=None, + folder_path=os.path.join(os.getcwd(), "jobfolder"), + INCAR=None, + jobfile=None, + quiet=True, +): # This assumes that incar file base is present already, please adjust this function to adjust the incar flags # creates a subdirectory of chosen name in current directory parent_folder = os.getcwd() createFolder(folder_path) - structure.to(fmt="poscar", filename = os.path.join(folder_path, f"starter-{os.path.basename(folder_path)}.vasp")) - structure.to(fmt="poscar", filename = os.path.join(folder_path, "POSCAR")) + structure.to( + fmt="poscar", + filename=os.path.join( + folder_path, f"starter-{os.path.basename(folder_path)}.vasp" + ), + ) + structure.to(fmt="poscar", filename=os.path.join(folder_path, "POSCAR")) - createPOTCAR(structure, path = "%s" % folder_path) + createPOTCAR(structure, path="%s" % folder_path) INCAR.write_file(os.path.join(folder_path, "INCAR")) if KPOINT: - KPOINT.to_file(filepath = folder_path) + KPOINT.to_file(filepath=folder_path) - jobfile.to_file(job_name = '%s.sh' % os.path.basename(folder_path),\ - output_path = "%s" % (folder_path)) + jobfile.to_file( + job_name="%s.sh" % os.path.basename(folder_path), + output_path="%s" % (folder_path), + ) if not quiet: print("Generating jobfolder, name %s" % (os.path.basename(folder_path))) diff --git a/utils/generic.py b/utils/generic.py index 5d16d81..c762fa2 100644 --- a/utils/generic.py +++ b/utils/generic.py @@ -12,6 +12,7 @@ from monty.os.path import find_exts from monty.io import zopen + def chunk_list(lst, n): """ Split a list into smaller chunks with a maximum size of n. 
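As a quick orientation for the job-folder helpers reformatted above, here is a minimal sketch of how KPOINTS, createJobFolder and the jobfile class (reformatted further down in this patch) might be wired together. The structure, template path and folder names are illustrative, and the INCAR is assumed to be a pymatgen Incar object since createJobFolder calls INCAR.write_file():

    from pymatgen.core import Structure
    from pymatgen.io.vasp.inputs import Incar

    from utils.jobfile import jobfile
    # KPOINTS and createJobFolder come from the module reformatted above
    # (import path omitted here).

    structure = Structure.from_file("POSCAR")            # any input structure
    incar = Incar.from_file("INCAR_base")                # base INCAR with flags already set
    kpts = KPOINTS(subdivs=[4, 4, 1], shift=[0, 0, 0])   # Gamma-centred 4x4x1 mesh
    job = jobfile(
        file_path="/home/hmai/CustodianJobfiles/template_BASE.sh",  # hypothetical template
        HPC="Gadi",
        CPU=48,
        cpu_per_node=48,
        walltime=24,
    )

    createJobFolder(
        structure,
        KPOINT=kpts,
        folder_path="jobs/slab_001",
        INCAR=incar,
        jobfile=job,
        quiet=False,
    )
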
@@ -29,7 +30,8 @@ def chunk_list(lst, n): >>> print(chunked_list) [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]] """ - return [lst[i:i + n] for i in range(0, len(lst), n)] + return [lst[i : i + n] for i in range(0, len(lst), n)] + def get_latest_file_iteration(directory, filename_with_iteration): # Check for existing resubmit.log_m files and find the largest m @@ -37,22 +39,23 @@ def get_latest_file_iteration(directory, filename_with_iteration): for filename in os.listdir(directory): if filename_with_iteration in filename: resubmit_log_files.append(filename) - + max_integer = -1 - + if not resubmit_log_files: return -1 else: for log_file in resubmit_log_files: if log_file.startswith(filename_with_iteration): try: - num_str = log_file[len(filename_with_iteration):] + num_str = log_file[len(filename_with_iteration) :] num = int(num_str) max_integer = max(max_integer, num) except ValueError: pass # Ignore non-integer parts after "resubmit.log_" return max_integer - + + def search_line_in_file(filename, line_to_search, search_depth=None, reverse=True): """ Searches for a specific line in a file. @@ -85,22 +88,23 @@ def search_line_in_file(filename, line_to_search, search_depth=None, reverse=Tru - If the file is not found, the function returns False. """ try: - with open(filename, 'r') as file: + with open(filename, "r") as file: lines = file.readlines() if reverse: - lines = reversed(lines) # Reverse the lines + lines = reversed(lines) # Reverse the lines count = 0 for line in lines: if search_depth is not None and count >= search_depth: - break + break if line_to_search in line.strip(): - return True - count += 1 + return True + count += 1 return False except FileNotFoundError: # print("File not found:", filename) return False + def parse_lines(flist, trigger_start, trigger_end, recursive=True): """ Parses lines from a list of strings based on start and end triggers and returns the parsed data. @@ -155,6 +159,7 @@ def parse_lines(flist, trigger_start, trigger_end, recursive=True): return data + def find_directories_with_files(parent_dir, filenames, all_present=True): """ Finds directories in a parent directory that contain specified files. @@ -182,7 +187,9 @@ def find_directories_with_files(parent_dir, filenames, all_present=True): - The function returns a list of directories that meet the specified conditions. """ directories = [] - file_set = set(filenames) # Convert filenames to a set for efficient membership checking + file_set = set( + filenames + ) # Convert filenames to a set for efficient membership checking for root, dirs, files in os.walk(parent_dir): # Check if the intersection of file_set and files is not empty @@ -193,6 +200,7 @@ def find_directories_with_files(parent_dir, filenames, all_present=True): return directories + def extract_tarball(archive_filepath, extraction_path): """ Extracts the contents of an archive file to the specified extraction path. @@ -224,7 +232,10 @@ def extract_tarball(archive_filepath, extraction_path): except Exception as e: print(f"Error extracting archive: {e}") -def find_and_extract_tarballs_parallel(parent_dir, extensions=(".tar.gz",), max_workers=None): + +def find_and_extract_tarballs_parallel( + parent_dir, extensions=(".tar.gz",), max_workers=None +): """ Finds tarball files with specified extensions in a directory and extracts them in parallel. 
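A short usage sketch for the generic helpers above; the parent directory, filenames and worker count are illustrative, and the OUTCAR convergence string is just one example of a line worth searching for:

    import os

    from utils.generic import (
        find_directories_with_files,
        search_line_in_file,
        find_and_extract_tarballs_parallel,
    )

    # Unpack any archived runs first, four workers at a time
    find_and_extract_tarballs_parallel(
        "/scratch/vasp_runs", extensions=(".tar.gz",), max_workers=4
    )

    # Keep only directories that contain both an OUTCAR and a vasprun.xml
    run_dirs = find_directories_with_files(
        "/scratch/vasp_runs", ["OUTCAR", "vasprun.xml"], all_present=True
    )

    # Flag runs whose OUTCAR tail mentions reaching the required accuracy
    converged = [
        d
        for d in run_dirs
        if search_line_in_file(
            os.path.join(d, "OUTCAR"),
            "reached required accuracy",
            search_depth=200,
            reverse=True,
        )
    ]
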
@@ -249,11 +260,15 @@ def find_and_extract_tarballs_parallel(parent_dir, extensions=(".tar.gz",), max_ extraction_filepaths = [os.path.dirname(filepath) for filepath in filepaths] # Prepare args_list as a list of tuples - args_list = [(filepath, extraction_path) for filepath, extraction_path in zip(filepaths, extraction_filepaths)] + args_list = [ + (filepath, extraction_path) + for filepath, extraction_path in zip(filepaths, extraction_filepaths) + ] # Call parallelise function parallelise(extract_tarball, args_list, max_workers=max_workers) + def extract_files_from_tarball(tarball_filepath, filenames, suffix=None, prefix=None): """ Extracts specific files from a tarball file and optionally renames them with a suffix. @@ -286,25 +301,39 @@ def extract_files_from_tarball(tarball_filepath, filenames, suffix=None, prefix= elif tarball_filepath.endswith(".bz2"): compression_type = "bz2" else: - raise ValueError("Unsupported compression type. Only .gz and .bz2 are supported.") + raise ValueError( + "Unsupported compression type. Only .gz and .bz2 are supported." + ) with tarfile.open(tarball_filepath, f"r:{compression_type}") as tar: extracted_filepaths = [] for filename in filenames: try: - matching_names = [name for name in tar.getnames() if name.endswith(filename)] + matching_names = [ + name for name in tar.getnames() if name.endswith(filename) + ] for name in matching_names: tar.extract(name, path=os.path.dirname(tarball_filepath)) if name.startswith("./"): - extracted_filepath = os.path.join(os.path.dirname(tarball_filepath), name[2:]) + extracted_filepath = os.path.join( + os.path.dirname(tarball_filepath), name[2:] + ) else: - extracted_filepath = os.path.join(os.path.dirname(tarball_filepath), name) + extracted_filepath = os.path.join( + os.path.dirname(tarball_filepath), name + ) if suffix: - new_path = os.path.join(os.path.dirname(extracted_filepath), os.path.basename(extracted_filepath) + "_" + suffix) + new_path = os.path.join( + os.path.dirname(extracted_filepath), + os.path.basename(extracted_filepath) + "_" + suffix, + ) os.rename(extracted_filepath, new_path) extracted_filepath = new_path if prefix: - new_path = os.path.join(prefix + "_" + os.path.dirname(extracted_filepath), os.path.basename(extracted_filepath)) + new_path = os.path.join( + prefix + "_" + os.path.dirname(extracted_filepath), + os.path.basename(extracted_filepath), + ) os.rename(extracted_filepath, new_path) extracted_filepath = new_path extracted_filepaths.append(extracted_filepath) @@ -313,7 +342,10 @@ def extract_files_from_tarball(tarball_filepath, filenames, suffix=None, prefix= return extracted_filepaths -def extract_files_from_tarballs_parallel(tarball_paths, filenames, suffix=False, max_workers=None): + +def extract_files_from_tarballs_parallel( + tarball_paths, filenames, suffix=False, max_workers=None +): """ Extracts specific files from multiple tarball files in parallel and optionally renames them with suffixes. @@ -346,39 +378,47 @@ def extract_files_from_tarballs_parallel(tarball_paths, filenames, suffix=False, elif isinstance(filenames, list): if isinstance(filenames[0], str): if len(filenames) != len(tarball_paths): - raise ValueError("The length of filenames should match the number of tarball_paths.") + raise ValueError( + "The length of filenames should match the number of tarball_paths." 
+ ) else: raise ValueError("Invalid format for filenames.") else: raise ValueError("Invalid format for filenames.") if suffix: - suffixes = [os.path.basename(filepath).split(".tar")[0] for filepath in tarball_paths] + suffixes = [ + os.path.basename(filepath).split(".tar")[0] for filepath in tarball_paths + ] else: suffixes = [None for _ in tarball_paths] # Prepare args_list as a list of tuples - args_list = [(tarball_path, filename, suffix) for tarball_path, filename, suffix in zip(tarball_paths, filenames, suffixes)] + args_list = [ + (tarball_path, filename, suffix) + for tarball_path, filename, suffix in zip(tarball_paths, filenames, suffixes) + ] # Call parallelise function parallelise(extract_files_from_tarball, args_list, max_workers=max_workers) -def find_and_extract_files_from_tarballs_parallel(parent_dir, - extension=(".tar.gz",), - filenames=[], - suffix=False, - prefix=False, - exclude_containing=["error."], - max_workers=None): + +def find_and_extract_files_from_tarballs_parallel( + parent_dir, + extension=(".tar.gz",), + filenames=[], + suffix=False, + prefix=False, + exclude_containing=["error."], + max_workers=None, +): """ Finds and extracts specific files from multiple tarball files within a parent directory using parallel processing. Parameters: parent_dir (str): The path of the parent directory to search for tarball files. extension (str or tuple, optional): The file extension(s) of the tarball files to search for. Defaults to ".tar.gz". - filenames (str or list, optional): The filenames to extract from the tarball(s). If a string, it will be used for all tarball files. - If a list, it should have the same length as the number of tarball files found in the parent directory. - Defaults to an empty list, which means all files will be extracted. + filenames (list, optional): List of filenames to extract from the tarball files. suffix (bool, optional): Determines whether to append suffixes to the extracted filenames. Defaults to False. prefix (bool, optional): Determines whether to prepend prefixes to the extracted filenames. Defaults to False. exclude_containing (list, optional): A list of strings. Tarballs whose names contain any of these strings will be excluded from extraction. @@ -398,43 +438,60 @@ def find_and_extract_files_from_tarballs_parallel(parent_dir, - The function searches for tarball files within the specified `parent_dir` using the provided `extension`. - It finds and extracts specific `filenames` from the tarball files, either all files or the specified files. - If `suffix` is True, the extracted filenames will be appended with suffixes. + - If `prefix` is True, the extracted filenames will be prepended with prefixes. - The extraction process is parallelized using the `parallelise()` function and the `extract_files_from_tarball` function. - Tarballs whose names contain any of the strings in `exclude_containing` will be skipped during extraction. 
""" filepaths = find_exts(top=parent_dir, exts=extension) # Filter out tarballs that contain any of the strings in exclude_containing - filepaths = [filepath for filepath in filepaths if not any(exclude in os.path.basename(filepath) for exclude in exclude_containing)] + filepaths = [ + filepath + for filepath in filepaths + if not any( + exclude in os.path.basename(filepath) for exclude in exclude_containing + ) + ] if suffix: - suffixes = [os.path.basename(filepath).split(".tar")[0] for filepath in filepaths] + suffixes = [ + os.path.basename(filepath).split(".tar")[0] for filepath in filepaths + ] else: suffixes = [None for _ in filepaths] - + if prefix: - prefixes = [os.path.basename(filepath).split(".tar")[0] for filepath in filepaths] + prefixes = [ + os.path.basename(filepath).split(".tar")[0] for filepath in filepaths + ] else: prefixes = [None for _ in filepaths] - if isinstance(filenames, str): - filenames = [filenames] * len(filepaths) - elif isinstance(filenames, list): - if len(filenames) != len(filepaths): - raise ValueError("The length of filenames should match the number of tarball files found.") - else: - raise ValueError("Invalid format for filenames.") + if not isinstance(filenames, list): + raise ValueError( + "The 'filenames' parameter should be a list of filenames to extract." + ) # Prepare args_list as a list of tuples - args_list = [(filepath, filename) for filepath, filename in zip(filepaths, filenames)] + args_list = [(filepath, filenames) for filepath in filepaths] # Call parallelise function - parallelise(extract_files_from_tarball, args_list, max_workers=max_workers, suffix=suffixes, prefix=prefixes) - -def compress_directory(directory_path, - exclude_files = [], - exclude_file_patterns = [], - print_message=True, - inside_dir=True): + parallelise( + extract_files_from_tarball, + args_list, + max_workers=max_workers, + suffix=suffixes, + prefix=prefixes, + ) + + +def compress_directory( + directory_path, + exclude_files=[], + exclude_file_patterns=[], + print_message=True, + inside_dir=True, +): """ Compresses a directory and its contents into a tarball with gzip compression. @@ -465,9 +522,14 @@ def compress_directory(directory_path, - The `print_message` parameter controls whether a message indicating the compression is printed. By default, it is set to True. 
""" if inside_dir: - output_file = os.path.join(directory_path, os.path.basename(directory_path) + '.tar.gz') + output_file = os.path.join( + directory_path, os.path.basename(directory_path) + ".tar.gz" + ) else: - output_file = os.path.join(os.path.dirname(directory_path), os.path.basename(directory_path) + '.tar.gz') + output_file = os.path.join( + os.path.dirname(directory_path), + os.path.basename(directory_path) + ".tar.gz", + ) with tarfile.open(output_file, "w:gz") as tar: for root, _, files in os.walk(directory_path): for file in files: @@ -475,23 +537,31 @@ def compress_directory(directory_path, # Exclude the output tarball from being added if file_path == output_file: continue - if any(fnmatch.fnmatch(file, pattern) for pattern in exclude_file_patterns): + if any( + fnmatch.fnmatch(file, pattern) for pattern in exclude_file_patterns + ): continue if file in exclude_files: continue - arcname = os.path.join(os.path.basename(directory_path), os.path.relpath(file_path, directory_path)) + arcname = os.path.join( + os.path.basename(directory_path), + os.path.relpath(file_path, directory_path), + ) tar.add(file_path, arcname=arcname) - # tar.add(file_path, arcname=os.path.relpath(file_path, directory_path)) + # tar.add(file_path, arcname=os.path.relpath(file_path, directory_path)) # print(f"{file} added") if print_message: print(f"Compressed directory: {directory_path}") -def compress_directory_parallel(directory_paths, - exclude_files=None, - exclude_file_patterns=None, - print_message=None, - inside_dir=None, - max_workers=None): + +def compress_directory_parallel( + directory_paths, + exclude_files=None, + exclude_file_patterns=None, + print_message=None, + inside_dir=None, + max_workers=None, +): """ Compresses multiple directories and their contents into tarballs with gzip compression in parallel. @@ -522,15 +592,18 @@ def compress_directory_parallel(directory_paths, - The `print_message` parameter controls whether a message indicating the compression is printed for each directory. - The function parallelizes the compression process using the `parallelise()` function and the `compress_directory` function. """ - parallelise(compress_directory, - directory_paths, - max_workers=max_workers, - exclude_files=exclude_files, - exclude_file_patterns=exclude_file_patterns, - print_message=print_message, - inside_dir=inside_dir) - -def cleanup_dir(directory_path, keep=True, files=[], file_patterns=[]): + parallelise( + compress_directory, + directory_paths, + max_workers=max_workers, + exclude_files=exclude_files, + exclude_file_patterns=exclude_file_patterns, + print_message=print_message, + inside_dir=inside_dir, + ) + + +def cleanup_dir(directory_path, keep=True, files=[], file_patterns=[]): """ Cleans up files in a directory based on specified conditions. @@ -577,15 +650,18 @@ def cleanup_dir(directory_path, keep=True, files=[], file_patterns=[]): break if should_remove or file in files: os.remove(file_path) - -def compress_and_cleanup(directory_path, - exclude_files_from_tarball=[], - exclude_filepatterns_from_tarball=[], - keep_after=True, - files=[], - file_patterns=[], - print_msg=False, - inside_dir=True): + + +def compress_and_cleanup( + directory_path, + exclude_files_from_tarball=[], + exclude_filepatterns_from_tarball=[], + keep_after=True, + files=[], + file_patterns=[], + print_msg=False, + inside_dir=True, +): """ Compresses a directory and its contents into a tarball with gzip compression, and performs cleanup operations. 
@@ -615,32 +691,39 @@ def compress_and_cleanup(directory_path, - The `print_msg` parameter controls whether a message indicating the compression is printed for the directory. - The `inside_dir` parameter determines whether the output tarball should be placed inside the directory (True) or in the same directory as the directory (False). """ - compress_directory(directory_path=directory_path, - exclude_files=exclude_files_from_tarball, - exclude_file_patterns=exclude_filepatterns_from_tarball, - print_message=print_msg, - inside_dir=inside_dir) + compress_directory( + directory_path=directory_path, + exclude_files=exclude_files_from_tarball, + exclude_file_patterns=exclude_filepatterns_from_tarball, + print_message=print_msg, + inside_dir=inside_dir, + ) # Add the newly compressed directory to the exceptions, or we'll remove it! if keep_after: file_patterns += [f"{os.path.basename(directory_path)}.tar.gz"] else: file_patterns = file_patterns - cleanup_dir(directory_path=directory_path, - keep=keep_after, - files=files, - file_patterns=file_patterns) - -def find_and_compress_directories_parallel(parent_dir, - valid_dir_if_filenames, - all_present=False, - exclude_files_from_tarball=[], - exclude_filepatterns_from_tarball=[], - keep_after=True, - files=[], - file_patterns=[], - print_msg=False, - inside_dir=True, - max_workers=None): + cleanup_dir( + directory_path=directory_path, + keep=keep_after, + files=files, + file_patterns=file_patterns, + ) + + +def find_and_compress_directories_parallel( + parent_dir, + valid_dir_if_filenames, + all_present=False, + exclude_files_from_tarball=[], + exclude_filepatterns_from_tarball=[], + keep_after=True, + files=[], + file_patterns=[], + print_msg=False, + inside_dir=True, + max_workers=None, +): """ Finds directories containing specific files, and compresses each directory and its contents into tarballs with gzip compression in parallel. @@ -676,17 +759,22 @@ def find_and_compress_directories_parallel(parent_dir, - The `inside_dir` parameter determines whether the output tarball should be placed inside each directory (True) or in the same directory as each directory (False). - The function parallelizes the compression process using the `parallelise()` function and the `compress_and_cleanup` function. """ - dirs_to_compress = find_directories_with_files(parent_dir=parent_dir, filenames=valid_dir_if_filenames, all_present=all_present) - parallelise(compress_and_cleanup, - dirs_to_compress, - max_workers=max_workers, - exclude_files_from_tarball=exclude_files_from_tarball, - exclude_filepatterns_from_tarball=exclude_filepatterns_from_tarball, - keep_after=keep_after, - files = files, - file_patterns = file_patterns, - print_msg = print_msg, - inside_dir = inside_dir) + dirs_to_compress = find_directories_with_files( + parent_dir=parent_dir, filenames=valid_dir_if_filenames, all_present=all_present + ) + parallelise( + compress_and_cleanup, + dirs_to_compress, + max_workers=max_workers, + exclude_files_from_tarball=exclude_files_from_tarball, + exclude_filepatterns_from_tarball=exclude_filepatterns_from_tarball, + keep_after=keep_after, + files=files, + file_patterns=file_patterns, + print_msg=print_msg, + inside_dir=inside_dir, + ) + def is_line_in_file(filepath, line, exact_match=True): """ @@ -711,7 +799,7 @@ def is_line_in_file(filepath, line, exact_match=True): ... 
print("Line not found in the file.") """ try: - with open(filepath, 'r') as file: + with open(filepath, "r") as file: for file_line in file: if exact_match and line == file_line.strip(): return True diff --git a/utils/jobfile.py b/utils/jobfile.py index b5d9a73..5031286 100644 --- a/utils/jobfile.py +++ b/utils/jobfile.py @@ -1,6 +1,7 @@ import os import shutil + def create_folder(directory, delete_folder=False, quiet=True): """ Create a folder if it doesn't exist, and optionally delete it if it does. @@ -29,20 +30,23 @@ def create_folder(directory, delete_folder=False, quiet=True): print("No replacement/deletion created due to folder existing") else: os.makedirs(directory) - + + class jobfile: - def __init__(self, - file_path, - HPC = "Gadi", - VASP_version = "5.4.4", - CPU = 192, - cpu_per_node = 48, - RAM = 64, - walltime = 999, - max_resubmissions = 999, - generic_insert_field = [], - generic_insert = []): - ''' + def __init__( + self, + file_path, + HPC="Gadi", + VASP_version="5.4.4", + CPU=192, + cpu_per_node=48, + RAM=64, + walltime=999, + max_resubmissions=999, + generic_insert_field=[], + generic_insert=[], + ): + """ Initialize a jobfile instance. Parameters: @@ -50,7 +54,7 @@ def __init__(self, - HPC (str): One of "Gadi", "Setonix", or "Magnus" specifying the high-performance computing system. - VASP_version (str): VASP version, defaults to "5.4.4". - CPU (int): Number of CPUs to use in the job. - - cpu_per_node (int): Number of CPUs per node on the HPC system. + - cpu_per_node (int): Number of CPUs per node on the HPC system. Gadi: 48 is 1 node (Only use in full nodes, as you are charged for full nodes) Magnus: 24 is 1 node (Only use in full nodes, as you are charged for full nodes) Setonix: 128 is 1 node (Charged on a per-cpu hour basis, not per-node like Gadi) @@ -62,7 +66,7 @@ def __init__(self, Returns: - None - ''' + """ self.file_path = file_path self.HPC = HPC self.VASP_version = VASP_version @@ -73,9 +77,11 @@ def __init__(self, self.cpu_per_node = cpu_per_node self.generic_insert_field = generic_insert_field self.generic_insert = generic_insert - - def to_file(self, job_name='template_job', output_path=os.path.join(os.getcwd(), "test")): - ''' + + def to_file( + self, job_name="template_job", output_path=os.path.join(os.getcwd(), "test") + ): + """ Generate a jobfile by replacing placeholders in the template and insert values from generic_insert. 
Parameters: @@ -84,11 +90,11 @@ def to_file(self, job_name='template_job', output_path=os.path.join(os.getcwd(), Returns: - None - ''' + """ create_folder(output_path) - with open("%s" % (self.file_path), 'r') as fin: + with open("%s" % (self.file_path), "r") as fin: filedata = fin.read() fin = open("%s" % (self.file_path), "rt", newline="\n") @@ -97,9 +103,13 @@ def to_file(self, job_name='template_job', output_path=os.path.join(os.getcwd(), replace_dict = { "{WALLTIMESTRING}": "%s:00:00" % self.walltime, "{CPUSTRING}": str(self.CPU), - "{MAXCONVITERATIONS}": str(self.max_resubmissions-1), + "{MAXCONVITERATIONS}": str(self.max_resubmissions - 1), "{MEMORYSTRING}": "%sGB" % self.RAM if self.HPC == "Gadi" else "", - "{NODESTRING}": "1" if self.CPU <= self.cpu_per_node else "%s" % int(self.CPU/self.cpu_per_node), + "{NODESTRING}": ( + "1" + if self.CPU <= self.cpu_per_node + else "%s" % int(self.CPU / self.cpu_per_node) + ), "{CASESTRING}": "%s" % job_name, } @@ -107,18 +117,26 @@ def to_file(self, job_name='template_job', output_path=os.path.join(os.getcwd(), filedata = filedata.replace(field, value) if self.VASP_version == "5.4.4": - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp/%s" % self.VASP_version + ) else: if self.HPC == "Setonix" and self.VASP_version in ["6.3.0", "6.2.1"]: - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp6/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp6/%s" % self.VASP_version + ) else: - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp/%s" % self.VASP_version + ) # Insert values from generic_insert into corresponding fields - for insert_field, insert_value in zip(self.generic_insert_field, self.generic_insert): + for insert_field, insert_value in zip( + self.generic_insert_field, self.generic_insert + ): if os.path.isfile(insert_value): # If insert_value is a path, inject the contents of the file - with open(insert_value, 'r') as insert_file: + with open(insert_value, "r") as insert_file: insert_content = insert_file.read() filedata = filedata.replace(insert_field, insert_content) else: @@ -126,15 +144,14 @@ def to_file(self, job_name='template_job', output_path=os.path.join(os.getcwd(), filedata = filedata.replace(insert_field, insert_value) # Write the file out again - with open(os.path.join(output_path, job_name), 'w') as fout: + with open(os.path.join(output_path, job_name), "w") as fout: fout.write(filedata) fin.close() fout.close() - - @staticmethod - def _replace_fields(template_path, - user_inputs): + + @staticmethod + def _replace_fields(template_path, user_inputs): """ Read a file, replace specified fields with user inputs, and create a jobfile instance. 
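A brief sketch of driving the jobfile class above; the template path is hypothetical, and the numbers simply follow the Setonix guidance in the docstring (128 cores per node, VASP 6.x loaded via the vasp6 module):

    from utils.jobfile import jobfile

    job = jobfile(
        file_path="/home/hmai/CustodianJobfiles/template_SDRS.sh",  # hypothetical template
        HPC="Setonix",
        VASP_version="6.3.0",     # picks up "module load vasp6/6.3.0"
        CPU=128,                  # one full Setonix node
        cpu_per_node=128,
        walltime=24,              # becomes "24:00:00"
        max_resubmissions=5,      # {MAXCONVITERATIONS} becomes 4
    )

    # Writes the filled-in script to <output_path>/job_0001.sh
    job.to_file(job_name="job_0001.sh", output_path="/scratch/vasp_runs/job_0001")
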
@@ -144,7 +161,7 @@ def _replace_fields(template_path, Returns: - string containing generated text - + Example: template_path = '/cmmc/u/hmai/personal_dev/utils/jobscript_templates/CustodianScripts/SDRS_template.py' user_inputs = { @@ -155,7 +172,7 @@ def _replace_fields(template_path, """ # Read the template file - with open(template_path, 'r') as template_file: + with open(template_path, "r") as template_file: template_content = template_file.read() # Replace specified fields with user inputs @@ -163,18 +180,18 @@ def _replace_fields(template_path, template_content = template_content.replace(field, str(value)) return template_content - + def to_string(self): - ''' + """ Convert the jobfile instance to a string representation. Returns: - str: String representation of the jobfile content. - ''' - with open(self.file_path, 'r') as file: + """ + with open(self.file_path, "r") as file: content = file.read() # Replace placeholders in the content if needed # content = content.replace("{SOME_PLACEHOLDER}", str(self.some_attribute)) - return content \ No newline at end of file + return content diff --git a/utils/jobscript_templates/CustodianScripts/template_BASE.py b/utils/jobscript_templates/CustodianScripts/template_BASE.py index 84b0d8c..8daa80a 100644 --- a/utils/jobscript_templates/CustodianScripts/template_BASE.py +++ b/utils/jobscript_templates/CustodianScripts/template_BASE.py @@ -1,10 +1,10 @@ import sys from custodian.custodian import Custodian from custodian.vasp.handlers import ( - VaspErrorHandler, + VaspErrorHandler, NonConvergingErrorHandler, - PositiveEnergyErrorHandler, - FrozenJobErrorHandler + PositiveEnergyErrorHandler, + FrozenJobErrorHandler, ) from utils.custom_custodian_handlers import Han_CustomVaspErrorHandler from custodian.vasp.jobs import VaspJob @@ -16,11 +16,9 @@ Han_CustomVaspErrorHandler(), NonConvergingErrorHandler(), PositiveEnergyErrorHandler(), - FrozenJobErrorHandler(output_filename=output_filename) + FrozenJobErrorHandler(output_filename=output_filename), ] -jobs = [VaspJob(sys.argv[1:], - output_file=output_filename, - suffix = "")] +jobs = [VaspJob(sys.argv[1:], output_file=output_filename, suffix="")] c = Custodian(handlers, jobs, max_errors=10) c.run() diff --git a/utils/jobscript_templates/CustodianScripts/template_DRS.py b/utils/jobscript_templates/CustodianScripts/template_DRS.py index 8339fb0..e678779 100644 --- a/utils/jobscript_templates/CustodianScripts/template_DRS.py +++ b/utils/jobscript_templates/CustodianScripts/template_DRS.py @@ -1,10 +1,10 @@ import sys from custodian.custodian import Custodian from custodian.vasp.handlers import ( - VaspErrorHandler, + VaspErrorHandler, NonConvergingErrorHandler, - PositiveEnergyErrorHandler, - FrozenJobErrorHandler + PositiveEnergyErrorHandler, + FrozenJobErrorHandler, ) from utils.custom_custodian_handlers import Han_CustomVaspErrorHandler from custodian.vasp.jobs import VaspJob @@ -16,36 +16,65 @@ Han_CustomVaspErrorHandler(), NonConvergingErrorHandler(), PositiveEnergyErrorHandler(), - FrozenJobErrorHandler(output_filename=output_filename) + FrozenJobErrorHandler(output_filename=output_filename), ] # Original job list original_jobs = [ - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_1", - final=False, - settings_override=[ - {"dict": "INCAR", "action": {"_set": {"NSW": 100, "LAECHG": False, "LCHARGE": False, "NELM": 80, "EDIFF": 1E-4, "KSPACING": 0.9}}} - ], - copy_magmom=True), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_2", - final=False, - 
settings_override=[ - {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, - {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1E-5}}}, - ], - copy_magmom=True), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix="", - settings_override=[{"dict": "INCAR", "action": {"_set": {"NSW": 0, "LAECHG": True, "LCHARGE": True, "NELM": 500, "ALGO": "VeryFast"}}}, - {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}]) - ] + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_1", + final=False, + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 100, + "LAECHG": False, + "LCHARGE": False, + "NELM": 80, + "EDIFF": 1e-4, + "KSPACING": 0.9, + } + }, + } + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_2", + final=False, + settings_override=[ + {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, + {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1e-5}}}, + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix="", + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 0, + "LAECHG": True, + "LCHARGE": True, + "NELM": 500, + "ALGO": "VeryFast", + } + }, + }, + {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, + ], + ), +] # Number of elements to get from the end of the list n = {STAGES_LEFT} diff --git a/utils/jobscript_templates/CustodianScripts/template_SDRS.py b/utils/jobscript_templates/CustodianScripts/template_SDRS.py index baadcfe..8383ee4 100644 --- a/utils/jobscript_templates/CustodianScripts/template_SDRS.py +++ b/utils/jobscript_templates/CustodianScripts/template_SDRS.py @@ -1,10 +1,10 @@ import sys from custodian.custodian import Custodian from custodian.vasp.handlers import ( - VaspErrorHandler, + VaspErrorHandler, NonConvergingErrorHandler, - PositiveEnergyErrorHandler, - FrozenJobErrorHandler + PositiveEnergyErrorHandler, + FrozenJobErrorHandler, ) from utils.custom_custodian_handlers import Han_CustomVaspErrorHandler from custodian.vasp.jobs import VaspJob @@ -16,46 +16,90 @@ Han_CustomVaspErrorHandler(), NonConvergingErrorHandler(), PositiveEnergyErrorHandler(), - FrozenJobErrorHandler(output_filename=output_filename) + FrozenJobErrorHandler(output_filename=output_filename), ] # Original job list original_jobs = [ - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".static_1", - final=False, - settings_override=[{"dict": "INCAR", "action": {"_set": {"NSW": 0, "LAECHG": True, "LCHARGE": True, "NELM": 400, "KSPACING": 0.5}}}]), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_1", - final=False, - settings_override=[ - {"file": "CHGCAR", "action": {"_file_copy": {"dest": "CHGCAR.static_1"}}}, - {"file": "AECCAR0", "action": {"_file_copy": {"dest": "AECCAR0.static_1"}}}, - {"file": "AECCAR1", "action": {"_file_copy": {"dest": "AECCAR1.static_1"}}}, - {"file": "AECCAR2", "action": {"_file_copy": {"dest": "AECCAR2.static_1"}}}, - {"dict": "INCAR", "action": {"_set": {"NSW": 100, "LAECHG": False, "LCHARGE": False, "NELM": 80, "ALGO": "VeryFast", "EDIFF": 1E-4, "KSPACING": 0.9}}} - ], - copy_magmom=True), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_2", - final=False, - settings_override=[ - {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, - {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1E-5}}}, - ], - copy_magmom=True), - - 
VaspJob(sys.argv[1:], - output_file=output_filename, - suffix="", - settings_override=[{"dict": "INCAR", "action": {"_set": {"NSW": 0, "LAECHG": True, "LCHARGE": True, "NELM": 500, "ALGO": "VeryFast"}}}, - {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}]) - ] + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".static_1", + final=False, + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 0, + "LAECHG": True, + "LCHARGE": True, + "NELM": 400, + "KSPACING": 0.5, + } + }, + } + ], + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_1", + final=False, + settings_override=[ + {"file": "CHGCAR", "action": {"_file_copy": {"dest": "CHGCAR.static_1"}}}, + {"file": "AECCAR0", "action": {"_file_copy": {"dest": "AECCAR0.static_1"}}}, + {"file": "AECCAR1", "action": {"_file_copy": {"dest": "AECCAR1.static_1"}}}, + {"file": "AECCAR2", "action": {"_file_copy": {"dest": "AECCAR2.static_1"}}}, + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 100, + "LAECHG": False, + "LCHARGE": False, + "NELM": 80, + "ALGO": "VeryFast", + "EDIFF": 1e-4, + "KSPACING": 0.9, + } + }, + }, + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_2", + final=False, + settings_override=[ + {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, + {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1e-5}}}, + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix="", + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 0, + "LAECHG": True, + "LCHARGE": True, + "NELM": 500, + "ALGO": "VeryFast", + } + }, + }, + {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, + ], + ), +] # Number of elements to get from the end of the list n = {STAGES_LEFT} diff --git a/utils/jobscript_templates/CustodianScripts/template_SDRS_KPOINTS.py b/utils/jobscript_templates/CustodianScripts/template_SDRS_KPOINTS.py index 1b3dfbc..4bd118a 100644 --- a/utils/jobscript_templates/CustodianScripts/template_SDRS_KPOINTS.py +++ b/utils/jobscript_templates/CustodianScripts/template_SDRS_KPOINTS.py @@ -1,10 +1,10 @@ import sys from custodian.custodian import Custodian from custodian.vasp.handlers import ( - VaspErrorHandler, + VaspErrorHandler, NonConvergingErrorHandler, - PositiveEnergyErrorHandler, - FrozenJobErrorHandler + PositiveEnergyErrorHandler, + FrozenJobErrorHandler, ) from utils.custom_custodian_handlers import Han_CustomVaspErrorHandler from custodian.vasp.jobs import VaspJob @@ -16,45 +16,90 @@ Han_CustomVaspErrorHandler(), NonConvergingErrorHandler(), PositiveEnergyErrorHandler(), - FrozenJobErrorHandler(output_filename=output_filename) + FrozenJobErrorHandler(output_filename=output_filename), ] # Original job list original_jobs = [ - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix = ".relax_1", final=False, - settings_override=[{"file": "KPOINTS", "action": {"_file_move": {"dest": "KPOINTS_moved"}}}, - {"dict": "INCAR", "action": {"_set": {"NSW": 100, "LAECHG": False, "LCHARGE": False, "NELM": 80, "EDIFF": 1E-5, "KSPACING" : 0.9}}}]), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_1", - final=False, - settings_override=[ - {"file": "CHGCAR", "action": {"_file_copy": {"dest": "CHGCAR.static_1"}}}, - {"file": "AECCAR0", "action": {"_file_copy": {"dest": "AECCAR0.static_1"}}}, - {"file": "AECCAR1", "action": {"_file_copy": {"dest": "AECCAR1.static_1"}}}, - 
{"file": "AECCAR2", "action": {"_file_copy": {"dest": "AECCAR2.static_1"}}}, - {"dict": "INCAR", "action": {"_set": {"NSW": 100, "LAECHG": False, "LCHARGE": False, "NELM": 80, "EDIFF": 1E-4, "KSPACING": 0.9}}} - ], - copy_magmom=True), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix=".relax_2", - final=False, - settings_override=[ - {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, - {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1E-5}}}, - ], - copy_magmom=True), - - VaspJob(sys.argv[1:], - output_file=output_filename, - suffix="", - settings_override=[{"dict": "INCAR", "action": {"_set": {"NSW": 0, "LAECHG": True, "LCHARGE": True, "NELM": 500, "ALGO": "VeryFast"}}}]) - ] + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_1", + final=False, + settings_override=[ + {"file": "KPOINTS", "action": {"_file_move": {"dest": "KPOINTS_moved"}}}, + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 100, + "LAECHG": False, + "LCHARGE": False, + "NELM": 80, + "EDIFF": 1e-5, + "KSPACING": 0.9, + } + }, + }, + ], + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_1", + final=False, + settings_override=[ + {"file": "CHGCAR", "action": {"_file_copy": {"dest": "CHGCAR.static_1"}}}, + {"file": "AECCAR0", "action": {"_file_copy": {"dest": "AECCAR0.static_1"}}}, + {"file": "AECCAR1", "action": {"_file_copy": {"dest": "AECCAR1.static_1"}}}, + {"file": "AECCAR2", "action": {"_file_copy": {"dest": "AECCAR2.static_1"}}}, + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 100, + "LAECHG": False, + "LCHARGE": False, + "NELM": 80, + "EDIFF": 1e-4, + "KSPACING": 0.9, + } + }, + }, + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix=".relax_2", + final=False, + settings_override=[ + {"file": "CONTCAR", "action": {"_file_copy": {"dest": "POSCAR"}}}, + {"dict": "INCAR", "action": {"_set": {"KSPACING": 0.5, "EDIFF": 1e-5}}}, + ], + copy_magmom=True, + ), + VaspJob( + sys.argv[1:], + output_file=output_filename, + suffix="", + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 0, + "LAECHG": True, + "LCHARGE": True, + "NELM": 500, + "ALGO": "VeryFast", + } + }, + } + ], + ), +] # Number of elements to get from the end of the list n = {STAGE} diff --git a/utils/jobscript_templates/CustodianScripts/template_Static.py b/utils/jobscript_templates/CustodianScripts/template_Static.py index 36a97de..e3ebcd8 100644 --- a/utils/jobscript_templates/CustodianScripts/template_Static.py +++ b/utils/jobscript_templates/CustodianScripts/template_Static.py @@ -1,10 +1,10 @@ import sys from custodian.custodian import Custodian from custodian.vasp.handlers import ( - VaspErrorHandler, + VaspErrorHandler, NonConvergingErrorHandler, - PositiveEnergyErrorHandler, - FrozenJobErrorHandler + PositiveEnergyErrorHandler, + FrozenJobErrorHandler, ) from utils.custom_custodian_handlers import Han_CustomVaspErrorHandler from custodian.vasp.jobs import VaspJob @@ -16,13 +16,31 @@ Han_CustomVaspErrorHandler(), NonConvergingErrorHandler(), PositiveEnergyErrorHandler(), - FrozenJobErrorHandler(output_filename=output_filename) + FrozenJobErrorHandler(output_filename=output_filename), ] -jobs = [VaspJob(sys.argv[1:], output_file=output_filename, suffix = "", - settings_override = [{"dict": "INCAR", - "action": {"_set":{"NSW": 0, "LAECHG": True, "LCHARGE": True, "NELM": 500, "ALGO": "VeryFast", "EDIFF": 1E-5}}}] - )] +jobs = [ + VaspJob( + sys.argv[1:], + 
output_file=output_filename, + suffix="", + settings_override=[ + { + "dict": "INCAR", + "action": { + "_set": { + "NSW": 0, + "LAECHG": True, + "LCHARGE": True, + "NELM": 500, + "ALGO": "VeryFast", + "EDIFF": 1e-5, + } + }, + } + ], + ) +] c = Custodian(handlers, jobs, max_errors={MAXCUSTODIANERRORS}) c.run() diff --git a/utils/parallel.py b/utils/parallel.py index 951207d..4463668 100644 --- a/utils/parallel.py +++ b/utils/parallel.py @@ -1,5 +1,6 @@ from multiprocessing import Pool, cpu_count + def parallelise(func, args_list, **kwargs_list): """ Executes the given function in parallel by applying it to multiple sets of arguments, @@ -40,13 +41,15 @@ def sample_function(x, flag=False): """ if not args_list: return [] - - max_workers = kwargs_list.pop('max_workers', None) + + max_workers = kwargs_list.pop("max_workers", None) if isinstance(max_workers, int): max_workers = max_workers else: - max_workers = cpu_count() # Use default CPU count if max_workers not specified or not an int - + max_workers = ( + cpu_count() + ) # Use default CPU count if max_workers not specified or not an int + # Replicate kwargs handling special cases replicated_kwargs = {} for key, value in kwargs_list.items(): @@ -61,13 +64,16 @@ def sample_function(x, flag=False): # Combine args and kwargs for each function call combined_args = [ - (list(args) if isinstance(args, tuple) else [args]) + [replicated_kwargs[key][i] for key in replicated_kwargs] + (list(args) if isinstance(args, tuple) else [args]) + + [replicated_kwargs[key][i] for key in replicated_kwargs] for i, args in enumerate(args_list) ] # Determine the number of processors to use num_processors = min(len(args_list), max_workers or cpu_count()) - print(f"# Processes: {len(args_list)}, Processors available: {cpu_count()}, CPUs used: {num_processors}") + print( + f"# Processes: {len(args_list)}, Processors available: {cpu_count()}, CPUs used: {num_processors}" + ) # Execute the function in parallel with Pool(processes=num_processors) as pool: results = pool.starmap(func, tuple(combined_args)) diff --git a/utils/periodic_table.py b/utils/periodic_table.py index bc4b697..9184133 100644 --- a/utils/periodic_table.py +++ b/utils/periodic_table.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import os +import os import matplotlib.pyplot as plt import matplotlib.patches as patches @@ -15,13 +15,15 @@ module_path = os.path.dirname(os.path.abspath(__file__)) ptable = pd.read_csv(os.path.join(module_path, "periodic_table.csv")) + def get_element_number(symbol): try: return Element(symbol).Z except ValueError: warnings.warn(f"Warning: Symbol '{symbol}' was not found.") return np.nan - + + def get_element_symbol(element_number): row = ptable[ptable["Z"] == element_number] if not row.empty: @@ -29,75 +31,153 @@ def get_element_symbol(element_number): else: warnings.warn(f"Warning: Element with Z:{element_number} was not found.") return np.nan - + + def classify_elements(element): # Define the properties of the different groups of elements in a dictionary element_groups = { - 'Actinoids': ['Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'], - 'Noble gases': ['He', 'Ne', 'Ar', 'Kr', 'Xe', 'Rn', 'Og'], - 'Rare earths': ['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu'], - 'Transition metals': ['Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg'], - 'Alkali metals': 
['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr'], - 'Alkaline earths': ['Be', 'Mg', 'Ca', 'Sr', 'Ba', 'Ra'], - 'Halogens': ['F', 'Cl', 'Br', 'I', 'At'], - 'Metalloids': ['B', 'Si', 'Ge', 'As', 'Sb', 'Te', 'Po'], - 'Reactive nonmetals': ['H', 'C', 'N', 'O', 'P', 'S', 'Se'], # Excluding Halogens as they're classified separately - 'Post-transition metals': ['Al', 'Ga', 'In', 'Sn', 'Tl', 'Pb', 'Bi'] + "Actinoids": [ + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + ], + "Noble gases": ["He", "Ne", "Ar", "Kr", "Xe", "Rn", "Og"], + "Rare earths": [ + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + ], + "Transition metals": [ + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + ], + "Alkali metals": ["Li", "Na", "K", "Rb", "Cs", "Fr"], + "Alkaline earths": ["Be", "Mg", "Ca", "Sr", "Ba", "Ra"], + "Halogens": ["F", "Cl", "Br", "I", "At"], + "Metalloids": ["B", "Si", "Ge", "As", "Sb", "Te", "Po"], + "Reactive nonmetals": [ + "H", + "C", + "N", + "O", + "P", + "S", + "Se", + ], # Excluding Halogens as they're classified separately + "Post-transition metals": ["Al", "Ga", "In", "Sn", "Tl", "Pb", "Bi"], } - + # Check which group the element belongs to for group, elements in element_groups.items(): if element in elements: return group - + # If the element doesn't match any group, return 'Others' - return 'Others' + return "Others" + def get_colour_element(element): # Define the color map inside the function - color_map = {'Actinoids': 'r', - 'Noble gases': 'royalblue', - 'Rare earths': 'm', - 'Transition metals': 'purple', - 'Alkali metals': 'gold', - 'Alkaline earths': "moccasin", - 'Halogens': 'mediumspringgreen', - 'Metalloids': 'darkcyan', - 'Others': 'slategray'} + color_map = { + "Actinoids": "r", + "Noble gases": "royalblue", + "Rare earths": "m", + "Transition metals": "purple", + "Alkali metals": "gold", + "Alkaline earths": "moccasin", + "Halogens": "mediumspringgreen", + "Metalloids": "darkcyan", + "Others": "slategray", + } # Classify the element using the classify_elements function element_group = classify_elements(element) - + # Assign color based on the classification using the color_map dictionary - colour = color_map.get(element_group, 'slategray') # Default to 'slategray' if not found in color_map - + colour = color_map.get( + element_group, "slategray" + ) # Default to 'slategray' if not found in color_map + return colour -def periodic_table_plot(plot_df, - property="Eseg_min", - count_min=None, - count_max=None, - center_cm_zero=False, - center_point=None, # New parameter for arbitrary centering - property_name=None, - cmap=cm.Blues, - element_font_color = "darkgoldenrod" + +def periodic_table_plot( + plot_df, + property="Eseg_min", + count_min=None, + count_max=None, + center_cm_zero=False, + center_point=None, # New parameter for arbitrary centering + property_name=None, + cmap=cm.Blues, + element_font_color="darkgoldenrod", ): module_path = os.path.dirname(os.path.abspath(__file__)) - ptable = pd.read_csv(os.path.join(module_path, 'periodic_table.csv')) - ptable.index = ptable['symbol'].values - elem_tracker = ptable['count'] - ptable = ptable[ptable['Z'] <= 92] # Cap at element 92 + ptable = pd.read_csv(os.path.join(module_path, "periodic_table.csv")) + ptable.index = 
ptable["symbol"].values + elem_tracker = ptable["count"] + ptable = ptable[ptable["Z"] <= 92] # Cap at element 92 - n_row = ptable['row'].max() - n_column = ptable['column'].max() + n_row = ptable["row"].max() + n_column = ptable["column"].max() fig, ax = plt.subplots(figsize=(n_column, n_row)) - rows = ptable['row'] - columns = ptable['column'] - symbols = ptable['symbol'] + rows = ptable["row"] + columns = ptable["column"] + symbols = ptable["symbol"] rw = 0.9 # rectangle width - rh = rw # rectangle height + rh = rw # rectangle height if count_min is None: count_min = plot_df[property].min() @@ -116,127 +196,170 @@ def periodic_table_plot(plot_df, norm = Normalize(vmin=count_min, vmax=count_max) for row, column, symbol in zip(rows, columns, symbols): - row = ptable['row'].max() - row + row = ptable["row"].max() - row if symbol in plot_df.element.unique(): count = plot_df[plot_df["element"] == symbol][property].values[0] # Check for NaN and adjust color and skip text accordingly if pd.isna(count): - color = 'grey' # Set color to none for NaN values - count = '' # Avoid displaying text for NaN values + color = "grey" # Set color to none for NaN values + count = "" # Avoid displaying text for NaN values else: color = cmap(norm(count)) else: - count = '' - color = 'none' + count = "" + color = "none" if row < 3: row += 0.5 - rect = patches.Rectangle((column, row), rw, rh, - linewidth=1.5, - edgecolor='gray', - facecolor=color, - alpha=1) + rect = patches.Rectangle( + (column, row), + rw, + rh, + linewidth=1.5, + edgecolor="gray", + facecolor=color, + alpha=1, + ) # Element symbol - plt.text(column + rw / 2, row + rh / 2 + 0.2, symbol, - horizontalalignment='center', - verticalalignment='center', - fontsize=22, # Adjusted for visibility - fontweight='semibold', - color=element_font_color) + plt.text( + column + rw / 2, + row + rh / 2 + 0.2, + symbol, + horizontalalignment="center", + verticalalignment="center", + fontsize=22, # Adjusted for visibility + fontweight="semibold", + color=element_font_color, + ) # Property value - Added below the symbol if count: # Only display if count is not empty (including not NaN) - plt.text(column + rw / 2, row + rh / 2 - 0.25, f"{count:.2f}", # Formatting count to 2 decimal places - horizontalalignment='center', - verticalalignment='center', - fontsize=14, # Smaller font size for the count value - fontweight='semibold', - color=element_font_color) + plt.text( + column + rw / 2, + row + rh / 2 - 0.25, + f"{count:.2f}", # Formatting count to 2 decimal places + horizontalalignment="center", + verticalalignment="center", + fontsize=14, # Smaller font size for the count value + fontweight="semibold", + color=element_font_color, + ) ax.add_patch(rect) # Generate the color bar granularity = 20 - colormap_array = np.linspace(norm.vmin, norm.vmax, granularity) if center_point is None else np.linspace(center_point - max_diff, center_point + max_diff, granularity) - + colormap_array = ( + np.linspace(norm.vmin, norm.vmax, granularity) + if center_point is None + else np.linspace(center_point - max_diff, center_point + max_diff, granularity) + ) + for i, value in enumerate(colormap_array): color = cmap(norm(value)) - color = 'silver' if value == 0 else color + color = "silver" if value == 0 else color length = 9 x_offset = 3.5 y_offset = 7.8 x_loc = i / granularity * length + x_offset width = length / granularity height = 0.35 - rect = patches.Rectangle((x_loc, y_offset), width, height, - linewidth=1.5, - edgecolor='gray', - facecolor=color, - alpha=1) - - if i in [0, 
granularity//4, granularity//2, 3*granularity//4, granularity-1]: - plt.text(x_loc + width / 2, y_offset - 0.4, f'{value:.1f}', - horizontalalignment='center', - verticalalignment='center', - fontweight='semibold', - fontsize=20, color='k') + rect = patches.Rectangle( + (x_loc, y_offset), + width, + height, + linewidth=1.5, + edgecolor="gray", + facecolor=color, + alpha=1, + ) + + if i in [ + 0, + granularity // 4, + granularity // 2, + 3 * granularity // 4, + granularity - 1, + ]: + plt.text( + x_loc + width / 2, + y_offset - 0.4, + f"{value:.1f}", + horizontalalignment="center", + verticalalignment="center", + fontweight="semibold", + fontsize=20, + color="k", + ) ax.add_patch(rect) if property_name is None: property_name = property - plt.text(x_offset + length / 2, y_offset + 1.0, - property_name, - horizontalalignment='center', - verticalalignment='center', - fontweight='semibold', - fontsize=20, color='k') - ax.set_ylim(-0.15, n_row + .1) + plt.text( + x_offset + length / 2, + y_offset + 1.0, + property_name, + horizontalalignment="center", + verticalalignment="center", + fontweight="semibold", + fontsize=20, + color="k", + ) + ax.set_ylim(-0.15, n_row + 0.1) ax.set_xlim(0.85, n_column + 1.1) - ax.axis('off') + ax.axis("off") plt.draw() plt.pause(0.001) plt.close() return fig, ax -def periodic_table_dual_plot(plot_df, - property1="Eseg_min1", - property2="Eseg_min2", # New property - count_min1=None, - count_max1=None, - count_min2=None, - count_max2=None, - center_cm_zero1=False, - center_cm_zero2=False, - center_point1=None, # New parameter for arbitrary centering - center_point2=None, - property_name1=None, - property_name2=None, - cmap1=plt.cm.Blues, # Colormap for the first property - cmap2=plt.cm.Reds, # Colormap for the second property - element_font_color="darkgoldenrod"): + +def periodic_table_dual_plot( + plot_df, + property1="Eseg_min1", + property2="Eseg_min2", # New property + count_min1=None, + count_max1=None, + count_min2=None, + count_max2=None, + center_cm_zero1=False, + center_cm_zero2=False, + center_point1=None, # New parameter for arbitrary centering + center_point2=None, + property_name1=None, + property_name2=None, + cmap1=plt.cm.Blues, # Colormap for the first property + cmap2=plt.cm.Reds, # Colormap for the second property + element_font_color="darkgoldenrod", +): module_path = os.path.dirname(os.path.abspath(__file__)) - ptable = pd.read_csv(os.path.join(module_path, 'periodic_table.csv')) - ptable.index = ptable['symbol'].values - elem_tracker = ptable['count'] - ptable = ptable[ptable['Z'] <= 92] # Cap at element 92 + ptable = pd.read_csv(os.path.join(module_path, "periodic_table.csv")) + ptable.index = ptable["symbol"].values + elem_tracker = ptable["count"] + ptable = ptable[ptable["Z"] <= 92] # Cap at element 92 - n_row = ptable['row'].max() - n_column = ptable['column'].max() + n_row = ptable["row"].max() + n_column = ptable["column"].max() fig, ax = plt.subplots(figsize=(n_column, n_row)) - rows = ptable['row'] - columns = ptable['column'] - symbols = ptable['symbol'] + rows = ptable["row"] + columns = ptable["column"] + symbols = ptable["symbol"] rw = 0.9 # rectangle width - rh = rw # rectangle height - - if count_min1 is None or count_min2 is None or count_max1 is None or count_max2 is None: + rh = rw # rectangle height + + if ( + count_min1 is None + or count_min2 is None + or count_max1 is None + or count_max2 is None + ): show_symbols = False else: show_symbols = True - + if count_min1 is None: count_min1 = plot_df[property1].min() if count_max1 
is None: @@ -270,9 +393,9 @@ def periodic_table_dual_plot(plot_df, norm2 = Normalize(vmin=count_min2, vmax=count_max2) for row, column, symbol in zip(rows, columns, symbols): - row = ptable['row'].max() - row + row = ptable["row"].max() - row # Initial color set to 'none' for both properties - color1, color2 = 'none', 'none' + color1, color2 = "none", "none" if symbol in plot_df.element.unique(): element_data = plot_df[plot_df["element"] == symbol] @@ -284,41 +407,58 @@ def periodic_table_dual_plot(plot_df, color2 = cmap2(norm2(value2)) # Draw upper right triangle for property1 - triangle1 = patches.Polygon([(column, row), (column + rw, row), (column + rw, row + rh)], - closed=True, color=color1) + triangle1 = patches.Polygon( + [(column, row), (column + rw, row), (column + rw, row + rh)], + closed=True, + color=color1, + ) ax.add_patch(triangle1) - + # Draw lower left triangle for property2 - triangle2 = patches.Polygon([(column, row), (column, row + rh), (column + rw, row + rh)], - closed=True, color=color2) + triangle2 = patches.Polygon( + [(column, row), (column, row + rh), (column + rw, row + rh)], + closed=True, + color=color2, + ) ax.add_patch(triangle2) # Element symbol - plt.text(column + rw / 2, row + rh / 2, symbol, - horizontalalignment='center', - verticalalignment='center', - fontsize=22, # Adjusted for visibility - fontweight='semibold', - color=element_font_color) + plt.text( + column + rw / 2, + row + rh / 2, + symbol, + horizontalalignment="center", + verticalalignment="center", + fontsize=22, # Adjusted for visibility + fontweight="semibold", + color=element_font_color, + ) position1 = 3.5, 7.8 position2 = 3.5, 9.4 # draw_color_bar(fig, ax, norm1, cmap1, property_name1, position1, granularity=20) # draw_color_bar(fig, ax, norm2, cmap2, property_name2, position2, granularity=20) - draw_color_bar(fig, ax, norm1, cmap1, property_name1, position1, show_symbols, granularity=20) - draw_color_bar(fig, ax, norm2, cmap2, property_name2, position2, show_symbols, granularity=20) - - ax.set_ylim(-0.15, n_row + .1) + draw_color_bar( + fig, ax, norm1, cmap1, property_name1, position1, show_symbols, granularity=20 + ) + draw_color_bar( + fig, ax, norm2, cmap2, property_name2, position2, show_symbols, granularity=20 + ) + + ax.set_ylim(-0.15, n_row + 0.1) ax.set_xlim(0.85, n_column + 1.1) - ax.axis('off') - + ax.axis("off") + plt.draw() plt.pause(0.001) plt.close() return fig, ax -def draw_color_bar(fig, ax, norm, cmap, property_name, position, show_symbols=True, granularity=20): + +def draw_color_bar( + fig, ax, norm, cmap, property_name, position, show_symbols=True, granularity=20 +): colormap_array = np.linspace(norm.vmin, norm.vmax, granularity) - + length = 9 width = length / granularity height = 0.35 @@ -326,37 +466,55 @@ def draw_color_bar(fig, ax, norm, cmap, property_name, position, show_symbols=Tr for i, value in enumerate(colormap_array): color = cmap(norm(value)) - color = 'silver' if value == 0 and not norm.vmin <= 0 <= norm.vmax else color + color = "silver" if value == 0 and not norm.vmin <= 0 <= norm.vmax else color x_loc = i / granularity * length + x_offset - - rect = patches.Rectangle((x_loc, y_offset), width, height, - linewidth=1.5, - edgecolor='gray', - facecolor=color, - alpha=1) + + rect = patches.Rectangle( + (x_loc, y_offset), + width, + height, + linewidth=1.5, + edgecolor="gray", + facecolor=color, + alpha=1, + ) ax.add_patch(rect) - if i in [0, granularity//4, granularity//2, 3*granularity//4, granularity-1]: - label = f'{value:.1f}' + if i in [ + 0, + 
granularity // 4, + granularity // 2, + 3 * granularity // 4, + granularity - 1, + ]: + label = f"{value:.1f}" if show_symbols: if i == 0: label = "<" + label elif i == granularity - 1: label = ">" + label - - plt.text(x_loc + width / 2, y_offset - 0.4, label, - horizontalalignment='center', - verticalalignment='center', - fontweight='semibold', - fontsize=20, color='k') - - plt.text(x_offset + length / 2, y_offset + 0.75, - property_name, - horizontalalignment='center', - verticalalignment='center', - fontweight='semibold', - fontsize=24, color='k') + plt.text( + x_loc + width / 2, + y_offset - 0.4, + label, + horizontalalignment="center", + verticalalignment="center", + fontweight="semibold", + fontsize=20, + color="k", + ) + + plt.text( + x_offset + length / 2, + y_offset + 0.75, + property_name, + horizontalalignment="center", + verticalalignment="center", + fontweight="semibold", + fontsize=24, + color="k", + ) # Example of how to use the function diff --git a/utils/plotters/grid_plots.py b/utils/plotters/grid_plots.py index 4a3c2b6..009b803 100644 --- a/utils/plotters/grid_plots.py +++ b/utils/plotters/grid_plots.py @@ -2,18 +2,21 @@ import matplotlib.ticker as ticker import numpy as np -def plot_pivot_table(df, - colormap_thresholds=[None, None], - figsize=(18, 30), - colormap='bwr', - colormap_label='E$_{\\rm{seg}}$ (eV)', - color_label_fontsize=20, - colormap_tick_fontsize=12, - xtick_fontsize=18, - ytick_fontsize=12, - threshold_low=None, - threshold_high=None, - transpose_axes=False): + +def plot_pivot_table( + df, + colormap_thresholds=[None, None], + figsize=(18, 30), + colormap="bwr", + colormap_label="E$_{\\rm{seg}}$ (eV)", + color_label_fontsize=20, + colormap_tick_fontsize=12, + xtick_fontsize=18, + ytick_fontsize=12, + threshold_low=None, + threshold_high=None, + transpose_axes=False, +): """ Plot a heatmap with custom parameters. 
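For orientation, a minimal usage sketch of the reformatted `plot_pivot_table` above (not part of this patch): it assumes the repository root is importable and feeds the function a small placeholder pivot table of segregation energies, with illustrative site labels, solute columns, and thresholds.

import numpy as np
import pandas as pd

from utils.plotters.grid_plots import plot_pivot_table

rng = np.random.default_rng(0)
eseg = pd.DataFrame(
    rng.normal(0.0, 0.5, size=(10, 4)),
    index=[f"site_{i}" for i in range(10)],  # placeholder segregation sites
    columns=["Al", "Cu", "Ni", "Ti"],  # placeholder solute elements
)

fig, axs = plot_pivot_table(
    eseg,
    figsize=(8, 6),
    colormap="bwr",
    threshold_low=-2.0,  # values outside (-2, 2) eV are masked to NaN and drawn black
    threshold_high=2.0,
)
fig.savefig("eseg_grid.png", dpi=300)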
@@ -33,32 +36,47 @@ def plot_pivot_table(df, if threshold_low is not None or threshold_high is not None: df = df.copy() df[(df < threshold_low) | (df > threshold_high)] = np.nan - + if transpose_axes: df = df.T fig, axs = plt.subplots(nrows=1, ncols=1, figsize=figsize) cmap = plt.get_cmap(colormap) - cmap.set_bad('k') + cmap.set_bad("k") if colormap_thresholds == [None, None]: vmax = max(abs(np.nanmin(df.max())), abs(np.nanmin(df.min()))) vmin = -vmax else: vmin, vmax = colormap_thresholds im = axs.imshow(df, cmap=cmap, vmax=vmax, vmin=vmin) - cm = plt.colorbar(im, ax=axs, shrink=0.3, location='right', pad=0.01) - cm.set_label(colormap_label, rotation=270, labelpad=15, fontsize=color_label_fontsize) + cm = plt.colorbar(im, ax=axs, shrink=0.3, location="right", pad=0.01) + cm.set_label( + colormap_label, rotation=270, labelpad=15, fontsize=color_label_fontsize + ) # cm.ax.tick_params(labelsize=colormap_tick_fontsize) # Set colorbar tick label size if colormap_thresholds != [None, None]: ticks = cm.get_ticks() if len(ticks) > 1: # Check to ensure there are ticks to modify - tick_labels = [f"$<{vmin}$" if i == 0 else f"$>{vmax}$" if i == len(ticks)-1 else str(tick) for i, tick in enumerate(ticks)] + tick_labels = [ + ( + f"$<{vmin}$" + if i == 0 + else f"$>{vmax}$" if i == len(ticks) - 1 else str(tick) + ) + for i, tick in enumerate(ticks) + ] cm.set_ticks(ticks) # Set the ticks back if they were changed - cm.set_ticklabels(tick_labels, fontsize=colormap_tick_fontsize) # Set the modified tick labels + cm.set_ticklabels( + tick_labels, fontsize=colormap_tick_fontsize + ) # Set the modified tick labels else: - cm.set_ticklabels(cm.get_ticks(), fontsize=colormap_tick_fontsize) # Set the modified tick labels + cm.set_ticklabels( + cm.get_ticks(), fontsize=colormap_tick_fontsize + ) # Set the modified tick labels - plt.xticks(np.arange(len(df.columns)), df.columns, rotation=0, fontsize=xtick_fontsize) + plt.xticks( + np.arange(len(df.columns)), df.columns, rotation=0, fontsize=xtick_fontsize + ) plt.yticks(np.arange(len(df.index)), df.index, fontsize=ytick_fontsize) axs.xaxis.set_major_locator(ticker.MultipleLocator(1)) @@ -66,7 +84,7 @@ def plot_pivot_table(df, axs.xaxis.set_minor_locator(ticker.MultipleLocator(0.5)) axs.yaxis.set_minor_locator(ticker.MultipleLocator(0.5)) - axs.tick_params(axis='both', which='major', width=1.5, length=4) - axs.grid(which='minor', color='black', linestyle='-', linewidth=1) - - return fig, axs \ No newline at end of file + axs.tick_params(axis="both", which="major", width=1.5, length=4) + axs.grid(which="minor", color="black", linestyle="-", linewidth=1) + + return fig, axs diff --git a/utils/plotters/structure_plots.py b/utils/plotters/structure_plots.py index e7e72f0..9ebd91d 100644 --- a/utils/plotters/structure_plots.py +++ b/utils/plotters/structure_plots.py @@ -3,26 +3,29 @@ from matplotlib.cm import ScalarMappable from matplotlib.colors import Normalize, TwoSlopeNorm -def plot_structure_projection(structure, - projection_axis=[1, 2], - bond_matrix=None, - atom_size=250, - figsize=(8, 6), - cell_border_colour="r", - no_fill_elements=["Fe"], - fill_color="red", - atom_size_dict={}, - fontsize=16, - values_list=None, - title=None, - cmap='viridis', - colorbar_label=r"$\rm{E}_{seg}$", - xlabel_fontsize=None, - ylabel_fontsize=None, - title_fontsize=None, - colorbar_fontsize=None, - colorbar_ticks_fontsize=None, - center_colorbar_at_zero=True): + +def plot_structure_projection( + structure, + projection_axis=[1, 2], + bond_matrix=None, + atom_size=250, + 
figsize=(8, 6), + cell_border_colour="r", + no_fill_elements=["Fe"], + fill_color="red", + atom_size_dict={}, + fontsize=16, + values_list=None, + title=None, + cmap="viridis", + colorbar_label=r"$\rm{E}_{seg}$", + xlabel_fontsize=None, + ylabel_fontsize=None, + title_fontsize=None, + colorbar_fontsize=None, + colorbar_ticks_fontsize=None, + center_colorbar_at_zero=True, +): """ Plots the projection of a pymatgen structure on a 2D plane based on the specified projection axis. @@ -60,19 +63,19 @@ def plot_structure_projection(structure, if values_list is not None: # Adjust vmin and vmax based on the absolute maximum value for symmetry max_abs_value = max(abs(min(values_list)), abs(max(values_list))) - + if center_colorbar_at_zero: norm = TwoSlopeNorm(vmin=-max_abs_value, vcenter=0, vmax=max_abs_value) else: norm = Normalize(vmin=min(values_list), vmax=max(values_list)) - + sm = ScalarMappable(cmap=cmap, norm=norm) sm.set_array([]) # Required for ScalarMappable to work for i, site in enumerate(structure): species = site.species_string if species in no_fill_elements: - color = 'none' # No fill for specified elements + color = "none" # No fill for specified elements else: if values_list is not None: color = sm.to_rgba(values_list[i]) @@ -81,8 +84,13 @@ def plot_structure_projection(structure, # Use custom size if available, otherwise use the default size size = atom_size_dict.get(species, atom_size) - plt.scatter(site.coords[projection_axis[0]], site.coords[projection_axis[1]], color=color, s=size, - edgecolors='black') + plt.scatter( + site.coords[projection_axis[0]], + site.coords[projection_axis[1]], + color=color, + s=size, + edgecolors="black", + ) # Set plot title and labels if title is not None: @@ -97,43 +105,57 @@ def plot_structure_projection(structure, plt.ylim(y_min - 1, y_max + 1) if bond_matrix is not None: - relevant_plot_bonds = bond_matrix[(bond_matrix['repeata'] == 0) & (bond_matrix['repeatb'] == 0)] + relevant_plot_bonds = bond_matrix[ + (bond_matrix["repeata"] == 0) & (bond_matrix["repeatb"] == 0) + ] for idx, bonds in relevant_plot_bonds.iterrows(): atom1 = int(bonds["atom1"]) - 1 atom2 = int(bonds["atom2"]) - 1 bondstrength = np.round(bonds["final_bond_order"], 2) if bondstrength < 0.28: - c = 'r' + c = "r" else: - c = 'k' + c = "k" c = "k" - plt.plot([structure[atom1].coords[projection_axis[0]], structure[atom2].coords[projection_axis[0]]], - [structure[atom1].coords[projection_axis[1]], structure[atom2].coords[projection_axis[1]]], - '-', - color=c, - linewidth=bondstrength / 0.56 * 5) + plt.plot( + [ + structure[atom1].coords[projection_axis[0]], + structure[atom2].coords[projection_axis[0]], + ], + [ + structure[atom1].coords[projection_axis[1]], + structure[atom2].coords[projection_axis[1]], + ], + "-", + color=c, + linewidth=bondstrength / 0.56 * 5, + ) # Draw the cell with a black border based on the projection_axis lattice_vectors = structure.lattice.matrix[projection_axis] # Draw the cell with a border based on the projection_ax|is - rect = plt.Rectangle((0, 0), - structure.lattice.abc[projection_axis[0]], - structure.lattice.abc[projection_axis[1]], - edgecolor=cell_border_colour, - linewidth=3, - fill=False, - linestyle='--') + rect = plt.Rectangle( + (0, 0), + structure.lattice.abc[projection_axis[0]], + structure.lattice.abc[projection_axis[1]], + edgecolor=cell_border_colour, + linewidth=3, + fill=False, + linestyle="--", + ) plt.gca().add_patch(rect) - plt.gca().set_aspect('equal') + plt.gca().set_aspect("equal") plt.grid() # Add colorbar if 
colorbar_fontsize is not None and values_list is not None: - cbar_ax = fig.add_axes([0.55, 0.1, 0.005, 0.8]) # Adjust these values to position the colorbar as needed + cbar_ax = fig.add_axes( + [0.55, 0.1, 0.005, 0.8] + ) # Adjust these values to position the colorbar as needed cbar = plt.colorbar(sm, cax=cbar_ax, label=colorbar_label) cbar.set_label(colorbar_label, fontsize=colorbar_fontsize) if colorbar_ticks_fontsize is not None: cbar.ax.tick_params(labelsize=colorbar_ticks_fontsize) - + plt.show() diff --git a/utils/structure_featuriser.py b/utils/structure_featuriser.py index 709dbd1..8f8bda3 100644 --- a/utils/structure_featuriser.py +++ b/utils/structure_featuriser.py @@ -3,44 +3,66 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler -#from maml.describers import SmoothOverlapAtomicPosition + +# from maml.describers import SmoothOverlapAtomicPosition from pymatgen.analysis.local_env import VoronoiNN from pymatgen.core import Structure + def get_stats(property_list, property_str): - return [f"{property_str}_std",f"{property_str}_mean",f"{property_str}_min",f"{property_str}_max"],\ - [np.std(property_list), np.mean(property_list), np.min(property_list), np.max(property_list)] - + return [ + f"{property_str}_std", + f"{property_str}_mean", + f"{property_str}_min", + f"{property_str}_max", + ], [ + np.std(property_list), + np.mean(property_list), + np.min(property_list), + np.max(property_list), + ] + + def VoronoiSiteFeaturiser(structure, site): - coord_no = VoronoiNN().get_cn(structure = structure, n = site) + coord_no = VoronoiNN().get_cn(structure=structure, n=site) site_info_dict = VoronoiNN().get_voronoi_polyhedra(structure, site) - volumes = [site_info_dict[polyhedra]["volume"] for polyhedra in list(site_info_dict.keys())] - vertices = [site_info_dict[polyhedra]["n_verts"] for polyhedra in list(site_info_dict.keys())] - distances = [site_info_dict[polyhedra]["face_dist"] for polyhedra in list(site_info_dict.keys())] - areas = [site_info_dict[polyhedra]["area"] for polyhedra in list(site_info_dict.keys())] - + volumes = [ + site_info_dict[polyhedra]["volume"] for polyhedra in list(site_info_dict.keys()) + ] + vertices = [ + site_info_dict[polyhedra]["n_verts"] + for polyhedra in list(site_info_dict.keys()) + ] + distances = [ + site_info_dict[polyhedra]["face_dist"] + for polyhedra in list(site_info_dict.keys()) + ] + areas = [ + site_info_dict[polyhedra]["area"] for polyhedra in list(site_info_dict.keys()) + ] + total_area = np.sum(areas) total_volume = np.sum(volumes) - + df_str_list = ["VorNN_CoordNo", "VorNN_tot_vol", "VorNN_tot_area"] df_prop_list = [coord_no, total_volume, total_area] - + data_str_list = ["volumes", "vertices", "areas", "distances"] for i, value_list in enumerate([volumes, vertices, areas, distances]): - property_str_list, property_stats_list = get_stats(value_list, f"VorNN_{data_str_list[i]}") + property_str_list, property_stats_list = get_stats( + value_list, f"VorNN_{data_str_list[i]}" + ) df_str_list += property_str_list df_prop_list += property_stats_list - + return df_str_list, df_prop_list -def get_per_site_SOAP_descriptor(structure, - cutoff=3, - l_max=10, - n_max=10, - atom_sigma=0.5, - verbose=False): + +def get_per_site_SOAP_descriptor( + structure, cutoff=3, l_max=10, n_max=10, atom_sigma=0.5, verbose=False +): """ Process a list of pymatgen structures using the Smooth Overlap of Atomic Positions (SOAP) method and organize the results into a list of DataFrames per structure, @@ -59,9 +81,16 @@ def 
get_per_site_SOAP_descriptor(structure, list: List of DataFrames, each DataFrame containing the SOAP descriptors for each site in the structure. """ # Change n_jobs to the number of cores you have available - s = SmoothOverlapAtomicPosition(cutoff=cutoff, l_max=l_max, n_max=n_max, atom_sigma=atom_sigma, verbose=verbose, n_jobs=1) + s = SmoothOverlapAtomicPosition( + cutoff=cutoff, + l_max=l_max, + n_max=n_max, + atom_sigma=atom_sigma, + verbose=verbose, + n_jobs=1, + ) # Create a DataFrame with the list of structures - df = pd.DataFrame({'structure': [structure]}) + df = pd.DataFrame({"structure": [structure]}) # Transform the structures using SOAP a = s.transform(df["structure"]) # Copy and reset the index of the transformed DataFrame @@ -69,10 +98,16 @@ def get_per_site_SOAP_descriptor(structure, # Rename the "level_1" column to "site" soap_df.rename(columns={"level_1": "site"}, inplace=True) # Group the DataFrame by "input_index" and drop the "input_index" column from each group - df_list = [soap_df.reset_index(drop=True).drop(columns='input_index') for _, soap_df in soap_df.groupby(["input_index"])] + df_list = [ + soap_df.reset_index(drop=True).drop(columns="input_index") + for _, soap_df in soap_df.groupby(["input_index"]) + ] return df -def get_per_site_SOAP_dfs(struct_list, cutoff=3, l_max=10, n_max=10, atom_sigma=0.5, verbose=False, n_jobs=16): + +def get_per_site_SOAP_dfs( + struct_list, cutoff=3, l_max=10, n_max=10, atom_sigma=0.5, verbose=False, n_jobs=16 +): """ Process a list of pymatgen structures using the Smooth Overlap of Atomic Positions (SOAP) method and organize the results into a list of DataFrames per structure, @@ -91,9 +126,16 @@ def get_per_site_SOAP_dfs(struct_list, cutoff=3, l_max=10, n_max=10, atom_sigma= list: List of DataFrames, each DataFrame containing the SOAP descriptors for each site in the structure. """ # Change n_jobs to the number of cores you have available - s = SmoothOverlapAtomicPosition(cutoff=cutoff, l_max=l_max, n_max=n_max, atom_sigma=atom_sigma, verbose=verbose, n_jobs=n_jobs) + s = SmoothOverlapAtomicPosition( + cutoff=cutoff, + l_max=l_max, + n_max=n_max, + atom_sigma=atom_sigma, + verbose=verbose, + n_jobs=n_jobs, + ) # Create a DataFrame with the list of structures - df = pd.DataFrame({'structure': struct_list}) + df = pd.DataFrame({"structure": struct_list}) # Transform the structures using SOAP a = s.transform(df["structure"]) # Copy and reset the index of the transformed DataFrame @@ -101,11 +143,15 @@ def get_per_site_SOAP_dfs(struct_list, cutoff=3, l_max=10, n_max=10, atom_sigma= # Rename the "level_1" column to "site" soap_df.rename(columns={"level_1": "site"}, inplace=True) # Group the DataFrame by "input_index" and drop the "input_index" column from each group - df_list = [soap_df.reset_index(drop=True).drop(columns='input_index') for _, soap_df in soap_df.groupby(["input_index"])] + df_list = [ + soap_df.reset_index(drop=True).drop(columns="input_index") + for _, soap_df in soap_df.groupby(["input_index"]) + ] return df_list -def get_SOAP_PCA_df(struct_list, PCA_comp = 30, write_df = False, filename=None): + +def get_SOAP_PCA_df(struct_list, PCA_comp=30, write_df=False, filename=None): """ Perform Principal Component Analysis (PCA) on Smooth Overlap of Atomic Positions (SOAP) descriptors for a list of structures and return a DataFrame with PCA-transformed SOAP descriptors. 
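Note that the SOAP helpers in this file rely on `SmoothOverlapAtomicPosition`, whose `maml` import is commented out above, so a short sketch (not part of this patch) of the Voronoi featuriser defined earlier in the file may be more useful; the bcc Fe cell and 2.86 Å lattice parameter are illustrative only, and the repository root is assumed to be on the import path.

import pandas as pd
from pymatgen.core import Lattice, Structure

from utils.structure_featuriser import VoronoiSiteFeaturiser

# Conventional bcc Fe cell; element and lattice parameter are placeholders.
structure = Structure.from_spacegroup(
    "Im-3m", Lattice.cubic(2.86), ["Fe"], [[0, 0, 0]]
)

rows = []
for site_index in range(len(structure)):
    names, values = VoronoiSiteFeaturiser(structure, site_index)
    rows.append(dict(zip(names, values)))

site_features = pd.DataFrame(rows)  # one row of VorNN_* descriptors per site
print(site_features[["VorNN_CoordNo", "VorNN_tot_vol", "VorNN_tot_area"]].head())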
@@ -126,7 +172,7 @@ def get_SOAP_PCA_df(struct_list, PCA_comp = 30, write_df = False, filename=None) """ # Compute SOAP descriptors for each site in the structures struct_SOAP_df_list = get_per_site_SOAP_dfs(struct_list) - + # Concatenate all SOAP descriptors and perform standard scaling df_soap = pd.concat(struct_SOAP_df_list) df_soap.columns = df_soap.columns.astype(str) @@ -137,10 +183,12 @@ def get_SOAP_PCA_df(struct_list, PCA_comp = 30, write_df = False, filename=None) # Perform PCA with the specified number of principal components pca = PCA(n_components=PCA_comp) PCA_soap = pca.fit_transform(df_soap) - + # Create a DataFrame for PCA-transformed SOAP descriptors - PCA_soap_df = pd.DataFrame(data=PCA_soap, columns=[f'SOAP_PCA_{i}' for i in np.arange(0, PCA_comp)]) - + PCA_soap_df = pd.DataFrame( + data=PCA_soap, columns=[f"SOAP_PCA_{i}" for i in np.arange(0, PCA_comp)] + ) + # Save the DataFrame as a pickle file if write_df is True if write_df: if filename: @@ -148,9 +196,12 @@ def get_SOAP_PCA_df(struct_list, PCA_comp = 30, write_df = False, filename=None) else: filename = f"SOAP_PCA_{PCA_comp}_segsite.pkl" PCA_soap_df.to_pickle(filename) - - print(f'Explained variation at {PCA_comp} principal components: {np.sum(pca.explained_variance_ratio_)}') - + + print( + f"Explained variation at {PCA_comp} principal components: {np.sum(pca.explained_variance_ratio_)}" + ) + return PCA_soap_df + # def ACE_featuriser(): diff --git a/utils/training_data_nequip.py b/utils/training_data_nequip.py index 745ac94..f893433 100644 --- a/utils/training_data_nequip.py +++ b/utils/training_data_nequip.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd import time + + def process_list(my_list, n): # if list has length 1 or 2, return the list as is if len(my_list) <= 2: @@ -13,26 +15,38 @@ def process_list(my_list, n): last = my_list[-1] # get every nth element of the list, excluding first and lasat images - new_list = [my_list[i] for i in range(0, len(my_list), n) if i != 0 and i != len(my_list)-1] + new_list = [ + my_list[i] + for i in range(0, len(my_list), n) + if i != 0 and i != len(my_list) - 1 + ] # return the result return [first] + new_list + [last] -def extract_allegro_extxyz(filepath, max_electronic_steps = 120, every_nth_image=4, scf_steps = [], output_filepath = "allegro_training_data.extxyz"): + +def extract_allegro_extxyz( + filepath, + max_electronic_steps=120, + every_nth_image=4, + scf_steps=[], + output_filepath="allegro_training_data.extxyz", +): filtered_list = [] print(filepath) - ase_outcar = read(filepath, format = "vasp-out", index = ":") + ase_outcar = read(filepath, format="vasp-out", index=":") if scf_steps: for j, n_electronic_steps in enumerate(scf_steps): if n_electronic_steps != max_electronic_steps: filtered_list.append(ase_outcar[j]) else: filtered_list = ase_outcar - + every_n_list = process_list(filtered_list, every_nth_image) for _, atoms_obj in enumerate(every_n_list): write(output_filepath, atoms_obj, append=True, format="extxyz") - + + import glob df_pickles_filelist = [] @@ -41,26 +55,33 @@ def extract_allegro_extxyz(filepath, max_electronic_steps = 120, every_nth_image continue print(file) df_pickles_filelist.append(file) - + import multiprocessing + start_time = time.time() num_processors = multiprocessing.cpu_count() if len(df_pickles_filelist) < num_processors: processes = len(df_pickles_filelist) else: - processes = num_processors + processes = num_processors print(f"Number of processors: {num_processors}, used: {processes}") + def 
allegro_data_setup_from_df(df_pickle_filepath): df = pd.read_pickle(df_pickle_filepath) for _, row in df.iterrows(): output_file = os.path.basename(df_pickle_filepath).split(sep=".pkl")[0] - extract_allegro_extxyz(row.filepath, scf_steps = row.scf_steps, output_filepath = f"{output_file}-AllegroNequip.extxyz") - + extract_allegro_extxyz( + row.filepath, + scf_steps=row.scf_steps, + output_filepath=f"{output_file}-AllegroNequip.extxyz", + ) + + with multiprocessing.Pool(processes=processes) as pool: pool.map(allegro_data_setup_from_df, df_pickles_filelist) end_time = time.time() elapsed_time = end_time - start_time -print("Elapsed time:", np.round(elapsed_time,3), "seconds") +print("Elapsed time:", np.round(elapsed_time, 3), "seconds") diff --git a/utils/vasp/database.py b/utils/vasp/database.py index b27e1cc..b4c75e7 100644 --- a/utils/vasp/database.py +++ b/utils/vasp/database.py @@ -13,12 +13,22 @@ from utils.vasp.parser.outcar import Outcar from utils.vasp.parser.output import parse_vasp_directory -def find_vasp_directories(parent_dir, - filenames=["vasp.log", "INCAR", "POTCAR", "CONTCAR", "KPOINTS", "OUTCAR", "vasprun.xml"], - all_present=False, - extract_tarballs=True, - tarball_extensions=(".tar.gz"), - ): + +def find_vasp_directories( + parent_dir, + filenames=[ + "vasp.log", + "INCAR", + "POTCAR", + "CONTCAR", + "KPOINTS", + "OUTCAR", + "vasprun.xml", + ], + all_present=False, + extract_tarballs=True, + tarball_extensions=(".tar.gz"), +): """ Finds directories in a parent directory that contain specified files. @@ -45,22 +55,22 @@ def find_vasp_directories(parent_dir, - The function returns a list of directories that meet the specified conditions. """ if extract_tarballs: - gen_tools.find_and_extract_files_from_tarballs_parallel(parent_dir=parent_dir, - extension=tarball_extensions, - filenames=filenames, - suffix=None, - prefix=None) - - directories = gen_tools.find_directories_with_files(parent_dir=parent_dir, - filenames=filenames, - all_present=all_present) + gen_tools.find_and_extract_files_from_tarballs_parallel( + parent_dir=parent_dir, + extension=tarball_extensions, + filenames=filenames, + suffix=None, + prefix=None, + ) + + directories = gen_tools.find_directories_with_files( + parent_dir=parent_dir, filenames=filenames, all_present=all_present + ) return directories -def read_OUTCAR(filename="OUTCAR", - free_energy=True, - energy_zero=True, - structures=True): + +def read_OUTCAR(filename="OUTCAR", free_energy=True, energy_zero=True, structures=True): """ Read information from the OUTCAR file and related VASP structure files. @@ -86,15 +96,15 @@ def read_OUTCAR(filename="OUTCAR", - If any part of the parsing encounters an error, the corresponding DataFrame entry will have NaN values. 
""" outcar = Outcar() - outcar.from_file(filename = filename) + outcar.from_file(filename=filename) structure_name = os.path.basename(os.path.dirname(filename)) - + try: energies = outcar.parse_dict["energies"] except: energies = np.nan - + # create a list of file extensions to search for extensions = [".vasp", "CONTCAR", "POSCAR"] # create an empty list to store matching files @@ -110,69 +120,89 @@ def read_OUTCAR(filename="OUTCAR", break except: pass - + try: - ionic_step_structures = np.array([Structure(cell, structure.species, outcar.parse_dict["positions"][i], coords_are_cartesian=True).to_json() - for i, cell in enumerate(outcar.parse_dict["cells"])]) + ionic_step_structures = np.array( + [ + Structure( + cell, + structure.species, + outcar.parse_dict["positions"][i], + coords_are_cartesian=True, + ).to_json() + for i, cell in enumerate(outcar.parse_dict["cells"]) + ] + ) except: ionic_step_structures = np.nan - + try: - energies_zero = outcar.parse_dict["energies_zero"] + energies_zero = outcar.parse_dict["energies_zero"] except: energies_zero = np.nan - + try: forces = outcar.parse_dict["forces"] except: forces = np.nan - + try: stresses = outcar.parse_dict["stresses"] except: stresses = np.nan - + try: magmoms = np.array(outcar.parse_dict["final_magmoms"]) except: magmoms = np.nan - + try: scf_steps = [len(i) for i in outcar.parse_dict["scf_energies"]] except: scf_steps = np.nan - - df = pd.DataFrame([[structure_name, - filename, - ionic_step_structures, - energies, - energies_zero, - forces, - stresses, - magmoms, - scf_steps]], - columns = ["job_name", - "filepath", - "structures", - "energy", - "energy_zero", - "forces", - "stresses", - "magmoms", - "scf_steps"]) + + df = pd.DataFrame( + [ + [ + structure_name, + filename, + ionic_step_structures, + energies, + energies_zero, + forces, + stresses, + magmoms, + scf_steps, + ] + ], + columns=[ + "job_name", + "filepath", + "structures", + "energy", + "energy_zero", + "forces", + "stresses", + "magmoms", + "scf_steps", + ], + ) return df -def parse_VASP_directory(directory, - INCAR_filename="INCAR", - KPOINTS_filename="KPOINTS", - POTCAR_filename="POTCAR", - OUTCAR_filename="OUTCAR", - vasprunxml_filename="vasprun.xml", - vasplog_filename="vasp.log"): - + +def parse_VASP_directory( + directory, + INCAR_filename="INCAR", + KPOINTS_filename="KPOINTS", + POTCAR_filename="POTCAR", + OUTCAR_filename="OUTCAR", + vasprunxml_filename="vasprun.xml", + vasplog_filename="vasp.log", +): + # Find file matching pattern structure_files = glob.glob(os.path.join(directory, "starter*.vasp")) - + if len(structure_files) > 0: init_structure = Structure.from_file(structure_files[0]) else: @@ -182,7 +212,10 @@ def parse_VASP_directory(directory, try: df = read_OUTCAR(filename=os.path.join(directory, OUTCAR_filename)) except: - df = pd.DataFrame([[os.path.basename(directory), + df = pd.DataFrame( + [ + [ + os.path.basename(directory), directory, np.nan, np.nan, @@ -190,26 +223,33 @@ def parse_VASP_directory(directory, np.nan, np.nan, np.nan, - np.nan]], - columns = ["job_name", - "filepath", - "structures", - "energy", - "energy_zero", - "forces", - "stresses", - "magmoms", - "scf_steps"]) - - convergence = check_convergence(directory=directory, - filename_vasprun=vasprunxml_filename, - filename_vasplog=vasplog_filename) + np.nan, + ] + ], + columns=[ + "job_name", + "filepath", + "structures", + "energy", + "energy_zero", + "forces", + "stresses", + "magmoms", + "scf_steps", + ], + ) + + convergence = check_convergence( + directory=directory, + 
filename_vasprun=vasprunxml_filename, + filename_vasplog=vasplog_filename, + ) # INCAR try: incar = Incar.from_file(os.path.join(directory, INCAR_filename)).as_dict() except: incar = np.nan - + try: # KPOINTS kpoints = Kpoints.from_file(os.path.join(directory, KPOINTS_filename)).as_dict() @@ -221,32 +261,37 @@ def parse_VASP_directory(directory, kpoints = np.nan try: - element_list, element_count, electron_of_potcar = grab_electron_info(directory_path=directory, - potcar_filename=POTCAR_filename) + element_list, element_count, electron_of_potcar = grab_electron_info( + directory_path=directory, potcar_filename=POTCAR_filename + ) except: element_list = np.nan element_count = np.nan electron_of_potcar = np.nan - try: electron_count = get_total_electron_count(directory_path=directory) except: electron_count = np.nan - + df["element_list"] = [element_list] df["element_count"] = [element_count] df["potcar_electron_count"] = [electron_of_potcar] df["total_electron_count"] = [electron_count] df["convergence"] = [convergence] - + df["kpoints"] = [kpoints] df["incar"] = [incar] return df -def check_convergence(directory, filename_vasprun="vasprun.xml", filename_vasplog="vasp.log", backup_vasplog = "error.out"): +def check_convergence( + directory, + filename_vasprun="vasprun.xml", + filename_vasplog="vasp.log", + backup_vasplog="error.out", +): """ Check the convergence status of a VASP calculation. @@ -272,21 +317,28 @@ def check_convergence(directory, filename_vasprun="vasprun.xml", filename_vasplo vr = Vasprun(filename=os.path.join(directory, filename_vasprun)) return vr.converged except: - line_converged = "reached required accuracy - stopping structural energy minimisation" + line_converged = ( + "reached required accuracy - stopping structural energy minimisation" + ) try: - converged = gen_tools.is_line_in_file(filepath=os.path.join(directory, filename_vasplog), - line=line_converged, - exact_match=False) + converged = gen_tools.is_line_in_file( + filepath=os.path.join(directory, filename_vasplog), + line=line_converged, + exact_match=False, + ) return converged except: try: - converged = gen_tools.is_line_in_file(filepath=os.path.join(directory, backup_vasplog), - line=line_converged, - exact_match=False) + converged = gen_tools.is_line_in_file( + filepath=os.path.join(directory, backup_vasplog), + line=line_converged, + exact_match=False, + ) return converged except: return False + def element_count_ordered(structure): site_element_list = [site.species_string for site in structure] past_element = site_element_list[0] @@ -302,9 +354,12 @@ def element_count_ordered(structure): count = 1 past_element = element element_count.append(count) - return element_list, element_count + return element_list, element_count + -def _try_read_structure(directory_path, structure_filenames = ["CONTCAR", ".vasp", "POSCAR"]): +def _try_read_structure( + directory_path, structure_filenames=["CONTCAR", ".vasp", "POSCAR"] +): structure_files = [] # walk through the directory and check each file's name for root, dirs, files in os.walk(directory_path): @@ -323,15 +378,18 @@ def _try_read_structure(directory_path, structure_filenames = ["CONTCAR", ".vasp structure = np.nan return structure -def grab_electron_info(directory_path, line_before_elec_str="PAW_PBE", potcar_filename = "POTCAR"): - + +def grab_electron_info( + directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR" +): + structure = _try_read_structure(directory_path=directory_path) if structure != None: element_list, element_count = 
element_count_ordered(structure) - + electron_of_potcar = [] - - with open(os.path.join(directory_path, potcar_filename), 'r') as file: + + with open(os.path.join(directory_path, potcar_filename), "r") as file: lines = file.readlines() # Read the lines from the file should_append = False # Flag to determine if the next line should be appended for line in lines: @@ -341,35 +399,50 @@ def grab_electron_info(directory_path, line_before_elec_str="PAW_PBE", potcar_fi should_append = False # Reset the flag if stripped_line.startswith(line_before_elec_str): should_append = True # Set the flag to append the next line - + return element_list, element_count, electron_of_potcar -def get_total_electron_count(directory_path, line_before_elec_str="PAW_PBE", potcar_filename = "POTCAR"): - ele_list, ele_count, electron_of_potcar = grab_electron_info(directory_path=directory_path, line_before_elec_str=line_before_elec_str, potcar_filename=potcar_filename) + +def get_total_electron_count( + directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR" +): + ele_list, ele_count, electron_of_potcar = grab_electron_info( + directory_path=directory_path, + line_before_elec_str=line_before_elec_str, + potcar_filename=potcar_filename, + ) total_electron_count = np.dot(ele_count, electron_of_potcar) return total_electron_count + def _check_convergence(directory): return directory, check_convergence(directory) + def find_converged_dirs(parent_dir): dirs = find_vasp_directories(parent_dir=parent_dir, extract_tarballs=False) # Filter the directories where convergence is True dir_and_convergence = parallelise(_check_convergence, dirs) - - converged_dirs = [directory for directory, convergence in dir_and_convergence if convergence] + + converged_dirs = [ + directory for directory, convergence in dir_and_convergence if convergence + ] return converged_dirs + def flatten_all_iterables(input_list): flat_list = [] for item in input_list: - if isinstance(item, (list, tuple, np.ndarray)): # Now also checks for numpy arrays + if isinstance( + item, (list, tuple, np.ndarray) + ): # Now also checks for numpy arrays flat_list.extend(item) # Extend the flat list with elements of the iterable else: flat_list.append(item) # Add the item directly if it's not an iterable return flat_list + def find_significantly_different_indices_threshold(values, threshold): if not values: return [] @@ -381,161 +454,291 @@ def find_significantly_different_indices_threshold(values, threshold): last_significant_value = current_value return significant_indices + def exclude_non_converged_data(df, columns_to_exclude_data): def process_row(row): - non_converged_indices = [i for i, conv in enumerate(row["scf_convergence"]) if not conv] + non_converged_indices = [ + i for i, conv in enumerate(row["scf_convergence"]) if not conv + ] for column in columns_to_exclude_data: if column in row: # Check if column is in row to avoid KeyError - row[column] = [value for i, value in enumerate(row[column]) if i not in non_converged_indices] + row[column] = [ + value + for i, value in enumerate(row[column]) + if i not in non_converged_indices + ] return row processed_df = df.apply(process_row, axis=1) return processed_df -def get_flattened_df(df, - groupby="filepath", - columns_to_process=["energy", "energy_zero", "structures", "forces", "magmoms", "stresses", "scf_steps", "scf_convergence"]): - processed_df = df.sort_values("calc_start_time").groupby(groupby).agg(lambda x: x.tolist()).reset_index().copy() + +def get_flattened_df( + df, + groupby="filepath", 
+ columns_to_process=[ + "energy", + "energy_zero", + "structures", + "forces", + "magmoms", + "stresses", + "scf_steps", + "scf_convergence", + ], +): + processed_df = ( + df.sort_values("calc_start_time") + .groupby(groupby) + .agg(lambda x: x.tolist()) + .reset_index() + .copy() + ) for column in columns_to_process: processed_df[column] = processed_df[column].apply(flatten_all_iterables) return processed_df -def get_filtered_df(df, - energy_threshold=0.05, - columns=["energy", "energy_zero", "structures", "forces", "magmoms", "stresses", "scf_steps", "scf_convergence"]): + +def get_filtered_df( + df, + energy_threshold=0.05, + columns=[ + "energy", + "energy_zero", + "structures", + "forces", + "magmoms", + "stresses", + "scf_steps", + "scf_convergence", + ], +): def process_row(row, column="energy", columns_to_flatten=columns): - indices = find_significantly_different_indices_threshold(row[column], energy_threshold) - processed_row = {col: (row[col] if col not in columns_to_flatten else [row[col][i] for i in indices if i < len(row[col])]) for col in df.columns} + indices = find_significantly_different_indices_threshold( + row[column], energy_threshold + ) + processed_row = { + col: ( + row[col] + if col not in columns_to_flatten + else [row[col][i] for i in indices if i < len(row[col])] + ) + for col in df.columns + } return processed_row + significant_changes = df.apply(process_row, axis=1) significant_changes_df = pd.DataFrame(list(significant_changes)) - if 'job_name' in significant_changes_df.columns: - significant_changes_df["job_name"] = [row.job_name[0] for _, row in significant_changes_df.iterrows()] + if "job_name" in significant_changes_df.columns: + significant_changes_df["job_name"] = [ + row.job_name[0] for _, row in significant_changes_df.iterrows() + ] return significant_changes_df -def get_potential_data_df(df, - energy_threshold=0.05, - columns_to_process=["energy", "energy_zero", "structures", "forces", "magmoms", "stresses", "scf_steps", "scf_convergence"], - ): + +def get_potential_data_df( + df, + energy_threshold=0.05, + columns_to_process=[ + "energy", + "energy_zero", + "structures", + "forces", + "magmoms", + "stresses", + "scf_steps", + "scf_convergence", + ], +): processed_df = get_flattened_df(df) - processed_df = get_filtered_df(processed_df, - energy_threshold=energy_threshold, - columns=["energy", "energy_zero", "structures", "forces", "magmoms", "stresses", "scf_steps", "scf_convergence"]) + processed_df = get_filtered_df( + processed_df, + energy_threshold=energy_threshold, + columns=[ + "energy", + "energy_zero", + "structures", + "forces", + "magmoms", + "stresses", + "scf_steps", + "scf_convergence", + ], + ) non_corr_df = exclude_non_converged_data(processed_df, columns_to_process) return non_corr_df -class DatabaseGenerator(): - - def __init__(self, - parent_dir, - max_workers=16): + +class DatabaseGenerator: + + def __init__(self, parent_dir, max_workers=16): self.parent_dir = parent_dir self.max_workers = max_workers - def build_database(self, - target_directory = None, - extract_directories = False, - tarball_extensions = (".tar.gz", "tar.bz2"), - read_error_dirs = False, - read_multiple_runs_in_dir = False, - cleanup = False, - keep_filenames_after_cleanup = [], - keep_filename_patterns_after_cleanup = [], - max_dir_count = None, - filenames_to_qualify=["vasp.log", "INCAR", "POTCAR", "CONTCAR", "KPOINTS", "OUTCAR", "vasprun.xml"], - all_present=False, - df_filename = None, - df_compression=True): # Added database_compression flag with default 
True + def build_database( + self, + target_directory=None, + extract_directories=False, + tarball_extensions=(".tar.gz", "tar.bz2"), + read_error_dirs=False, + read_multiple_runs_in_dir=False, + cleanup=False, + keep_filenames_after_cleanup=[], + keep_filename_patterns_after_cleanup=[], + max_dir_count=None, + filenames_to_qualify=[ + "vasp.log", + "INCAR", + "POTCAR", + "CONTCAR", + "KPOINTS", + "OUTCAR", + "vasprun.xml", + ], + all_present=False, + df_filename=None, + df_compression=True, + ): # Added database_compression flag with default True start_time = time.time() - + if target_directory: - dirs = find_vasp_directories(parent_dir = target_directory, - extract_tarballs = extract_directories, - all_present = all_present, - filenames = filenames_to_qualify, - tarball_extensions = tarball_extensions) + dirs = find_vasp_directories( + parent_dir=target_directory, + extract_tarballs=extract_directories, + all_present=all_present, + filenames=filenames_to_qualify, + tarball_extensions=tarball_extensions, + ) else: - dirs = find_vasp_directories(parent_dir = self.parent_dir, - extract_tarballs = extract_directories, - all_present = all_present, - filenames = filenames_to_qualify, - tarball_extensions = tarball_extensions) - print(f"The total number of vasp directories that we are building the database out of is {len(dirs)}") - - compression_option = 'gzip' if df_compression else None - compression_extension = '.gz' if df_compression else '' - + dirs = find_vasp_directories( + parent_dir=self.parent_dir, + extract_tarballs=extract_directories, + all_present=all_present, + filenames=filenames_to_qualify, + tarball_extensions=tarball_extensions, + ) + print( + f"The total number of vasp directories that we are building the database out of is {len(dirs)}" + ) + + compression_option = "gzip" if df_compression else None + compression_extension = ".gz" if df_compression else "" + if max_dir_count: pkl_filenames = [] for i, chunks in enumerate(gen_tools.chunk_list(dirs, max_dir_count)): step_time = time.time() - df = pd.concat(parallelise(parse_vasp_directory, - [(chunk,) for chunk in chunks], - max_workers=self.max_workers, - extract_error_dirs=read_error_dirs, - parse_all_in_dir=read_multiple_runs_in_dir)) + df = pd.concat( + parallelise( + parse_vasp_directory, + [(chunk,) for chunk in chunks], + max_workers=self.max_workers, + extract_error_dirs=read_error_dirs, + parse_all_in_dir=read_multiple_runs_in_dir, + ) + ) if df_filename: db_filename = f"{i}_{df_filename}.pkl{compression_extension}" else: db_filename = f"{i}.pkl{compression_extension}" pkl_filenames.append(os.path.join(self.parent_dir, db_filename)) - df.to_pickle(os.path.join(self.parent_dir, db_filename), compression=compression_option) + df.to_pickle( + os.path.join(self.parent_dir, db_filename), + compression=compression_option, + ) step_taken_time = np.round(time.time() - step_time, 3) - print(f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps") - - df = pd.concat([pd.read_pickle(partial_df, compression=compression_option) for partial_df in pkl_filenames]) - final_db_filename = os.path.join(self.parent_dir, f"vasp_database.pkl{compression_extension}") + print( + f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps" + ) + + df = pd.concat( + [ + pd.read_pickle(partial_df, compression=compression_option) + for partial_df in pkl_filenames + ] + ) + final_db_filename = os.path.join( + self.parent_dir, f"vasp_database.pkl{compression_extension}" + ) df.to_pickle(final_db_filename, 
compression=compression_option) else: - df = pd.concat(parallelise(parse_vasp_directory, - [(chunk,) for chunk in chunks], - max_workers=self.max_workers, - extract_error_dirs=read_error_dirs, - parse_all_in_dir=read_multiple_runs_in_dir)) - df.to_pickle(os.path.join(self.parent_dir, f"vasp_database.pkl{compression_extension}"), compression=compression_option) - + df = pd.concat( + parallelise( + parse_vasp_directory, + [(chunk,) for chunk in chunks], + max_workers=self.max_workers, + extract_error_dirs=read_error_dirs, + parse_all_in_dir=read_multiple_runs_in_dir, + ) + ) + df.to_pickle( + os.path.join( + self.parent_dir, f"vasp_database.pkl{compression_extension}" + ), + compression=compression_option, + ) + end_time = time.time() elapsed_time = end_time - start_time - + # not optional - keep the tarballs/zips.. keep_filename_patterns_after_cleanup += ".tar.gz" keep_filename_patterns_after_cleanup += ".tar.bz2" keep_filename_patterns_after_cleanup += ".zip" if cleanup: - gen_tools.cleanup_dir(directory_path=dirs, keep=True, files=[], file_patterns=[]) - parallelise(gen_tools.cleanup_dir, dirs, [True] * len(dirs), keep_filenames_after_cleanup*len(dirs), keep_filename_patterns_after_cleanup*len(dirs)) - - print("Elapsed time:", np.round(elapsed_time,3), "seconds") + gen_tools.cleanup_dir( + directory_path=dirs, keep=True, files=[], file_patterns=[] + ) + parallelise( + gen_tools.cleanup_dir, + dirs, + [True] * len(dirs), + keep_filenames_after_cleanup * len(dirs), + keep_filename_patterns_after_cleanup * len(dirs), + ) + + print("Elapsed time:", np.round(elapsed_time, 3), "seconds") return df - - def update_failed_jobs_in_database(self, df_path=None, read_error_dirs=False, read_multiple_runs_in_dir=False, max_dir_count=None, df_compression=True): - compression_option = 'gzip' if df_compression else None - compression_extension = '.gz' if df_compression else '' - + + def update_failed_jobs_in_database( + self, + df_path=None, + read_error_dirs=False, + read_multiple_runs_in_dir=False, + max_dir_count=None, + df_compression=True, + ): + compression_option = "gzip" if df_compression else None + compression_extension = ".gz" if df_compression else "" + if df_path is None: - df_path = os.path.join(self.parent_dir, f"vasp_database.pkl{compression_extension}") - + df_path = os.path.join( + self.parent_dir, f"vasp_database.pkl{compression_extension}" + ) + if os.path.isdir(df_path): potential_files = [ os.path.join(df_path, "vasp_database.pkl.gz"), - os.path.join(df_path, "vasp_database.pkl") + os.path.join(df_path, "vasp_database.pkl"), ] - output_path = os.path.join(df_path, f"vasp_database.pkl{compression_extension}") + output_path = os.path.join( + df_path, f"vasp_database.pkl{compression_extension}" + ) else: potential_files = [df_path] output_path = df_path - + df = None for file in potential_files: try: if file.endswith(".gz"): - df = pd.read_pickle(file, compression='gzip') + df = pd.read_pickle(file, compression="gzip") else: df = pd.read_pickle(file, compression=None) print(f"Successfully read database from {file}") @@ -544,33 +747,56 @@ def update_failed_jobs_in_database(self, df_path=None, read_error_dirs=False, re print(f"Failed to read database from {file}") if df is None: - raise ValueError("Invalid path or filename - please check! Attempted paths: " + ", ".join(potential_files)) - - failed_dirs = df[df['convergence'] == False]['filepath'].tolist() + raise ValueError( + "Invalid path or filename - please check! 
Attempted paths: " + + ", ".join(potential_files) + ) + + failed_dirs = df[df["convergence"] == False]["filepath"].tolist() print(f"Reparsing {len(failed_dirs)} directories where convergence is False") if max_dir_count: pkl_filenames = [] - for i, chunks in enumerate(gen_tools.chunk_list(failed_dirs, max_dir_count)): + for i, chunks in enumerate( + gen_tools.chunk_list(failed_dirs, max_dir_count) + ): step_time = time.time() - failed_df = pd.concat(parallelise(parse_vasp_directory, - [(chunk,) for chunk in chunks], - max_workers=self.max_workers, - extract_error_dirs=read_error_dirs, - parse_all_in_dir=read_multiple_runs_in_dir)) + failed_df = pd.concat( + parallelise( + parse_vasp_directory, + [(chunk,) for chunk in chunks], + max_workers=self.max_workers, + extract_error_dirs=read_error_dirs, + parse_all_in_dir=read_multiple_runs_in_dir, + ) + ) db_filename = f"update_{i}.pkl{compression_extension}" pkl_filenames.append(os.path.join(self.parent_dir, db_filename)) - failed_df.to_pickle(os.path.join(self.parent_dir, db_filename), compression=compression_option) + failed_df.to_pickle( + os.path.join(self.parent_dir, db_filename), + compression=compression_option, + ) step_taken_time = np.round(time.time() - step_time, 3) - print(f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps") - - failed_df = pd.concat([pd.read_pickle(partial_df, compression=compression_option) for partial_df in pkl_filenames]) + print( + f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps" + ) + + failed_df = pd.concat( + [ + pd.read_pickle(partial_df, compression=compression_option) + for partial_df in pkl_filenames + ] + ) else: - failed_df = pd.concat(parallelise(parse_vasp_directory, - [(chunk,) for chunk in failed_dirs], - max_workers=self.max_workers, - extract_error_dirs=read_error_dirs, - parse_all_in_dir=read_multiple_runs_in_dir)) + failed_df = pd.concat( + parallelise( + parse_vasp_directory, + [(chunk,) for chunk in failed_dirs], + max_workers=self.max_workers, + extract_error_dirs=read_error_dirs, + parse_all_in_dir=read_multiple_runs_in_dir, + ) + ) # Use a different method to merge the DataFrames df.update(failed_df, overwrite=True) @@ -578,6 +804,7 @@ def update_failed_jobs_in_database(self, df_path=None, read_error_dirs=False, re df.to_pickle(output_path, compression=compression_option) print(f"Updated dataframe saved to {output_path}") return df + # def update_database(self, # new_calculation_directory, # existing_database_filename = "vasp_database.pkl", @@ -587,7 +814,7 @@ def update_failed_jobs_in_database(self, df_path=None, read_error_dirs=False, re # keep_filename_patterns_after_cleanup = [], # max_dir_count = None, # df_filename = None): - + # update_df = self.build_database(target_directory = existing_database_filename, # extract_directories = extract_directories, # cleanup=cleanup, @@ -597,12 +824,12 @@ def update_failed_jobs_in_database(self, df_path=None, read_error_dirs=False, re # df_filename = df_filename) # def _get_job_dir(filepath): # return os.path.basename(filepath.rstrip("/OUTCAR")) - + # update_df["job_dir"] = [_get_job_dir(row.filepath) for _, row in update_df.iterrows()] # base_df["job_dir"] = [_get_job_dir(row.filepath) for _, row in base_df.iterrows()] # base_df = pd.read_pickle(existing_database_filename) - + # # Merge df1 and df2 based on the common dirname # interm_df = base_df.merge(update_df, on='job_dir', suffixes=('_df1', '_df2'), how='left') @@ -612,11 +839,12 @@ def update_failed_jobs_in_database(self, df_path=None, 
read_error_dirs=False, re # # Check if the column with suffix '_df2' exists # if (f'{column}_df2' in interm_df.columns): # base_df[column].update(interm_df[column + '_df2'].combine_first(interm_df[column + '_df1'])) - + # base_df.drop(columns=['job_dir'], inplace=True) - + # return base_df + def update_database(df_base, df_update): # Get the unique job names from df2 df_update_jobs = set(df_update["job_name"]) @@ -627,6 +855,8 @@ def update_database(df_base, df_update): # Append df2 to the filtered df1 merged_df = pd.concat([df_base, df_update_jobs], ignore_index=True) return merged_df + + def robust_append_last(clist, value): try: clist.append(value[-1]) @@ -634,18 +864,19 @@ def robust_append_last(clist, value): clist.append(np.nan) return clist + def create_summary(database_df): energies = [] magmoms = [] structures = [] - + for i, row in database_df.iterrows(): energies = robust_append_last(energies, row.energy_zero) magmoms = robust_append_last(magmoms, row.magmoms) structures = robust_append_last(structures, row.structures) - + df = database_df[["job_name", "convergence"]].copy() df["total_energy"] = energies df["magmoms"] = magmoms df["structures"] = structures - return df \ No newline at end of file + return df diff --git a/utils/vasp/job.py b/utils/vasp/job.py index 4f268ca..3fdc240 100644 --- a/utils/vasp/job.py +++ b/utils/vasp/job.py @@ -5,13 +5,16 @@ potcar_library_path = "/root/POTCAR_Library/GGA" potcar_library_path = "/cmmc/u/hmai/pyiron-resources-cmmc/vasp/potentials/potpaw_PBE" -def createFolder(directory, delete_folder='no'): - import os; import shutil + +def createFolder(directory, delete_folder="no"): + import os + import shutil + if not os.path.exists(directory): os.makedirs(directory) else: - if delete_folder == 'no': - #print('no replacement/deletion created due to folder existing') + if delete_folder == "no": + # print('no replacement/deletion created due to folder existing') x = 1 else: print("removing directory...") @@ -22,11 +25,17 @@ def createFolder(directory, delete_folder='no'): else: print("given path is a special file - manually remove") + def get_immediate_subdirectories(a_dir): - return [f.path for f in os.scandir(a_dir) if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints"] + return [ + f.path + for f in os.scandir(a_dir) + if f.is_dir() and os.path.basename(f) != ".ipynb_checkpoints" + ] + class jobfile: - ''' + """ Class for jobfile object for passing into createJobFolder Attributes: @@ -48,15 +57,18 @@ class jobfile: RAM: RAM to be allocated - this is only specified in the case of Gadi, Setonix + magnus do not need specification. 
walltime: INTEGER ONLY The walltime of the job in hours - ''' - def __init__(self, - file_path, - HPC = "Gadi", - VASP_version = "5.4.4", - CPU = 192, - RAM = 64, - walltime = 999, - max_resubmissions = 999): + """ + + def __init__( + self, + file_path, + HPC="Gadi", + VASP_version="5.4.4", + CPU=192, + RAM=64, + walltime=999, + max_resubmissions=999, + ): self.file_path = file_path self.HPC = HPC self.VASP_version = VASP_version @@ -65,9 +77,9 @@ def __init__(self, self.walltime = walltime self.max_resubmissions = max_resubmissions - def to_file(self,\ - case_name = 'template_job',\ - output_path = os.path.join(os.getcwd(), "test")): + def to_file( + self, case_name="template_job", output_path=os.path.join(os.getcwd(), "test") + ): """ Writes KPOINTS file with MP gamma centred grid: @@ -78,7 +90,7 @@ def to_file(self,\ createFolder(output_path) - with open("%s" % (self.file_path), 'r') as fin : + with open("%s" % (self.file_path), "r") as fin: filedata = fin.read() if self.HPC == "Gadi": fin = open("%s" % (self.file_path), "rt", newline="\n") @@ -87,7 +99,9 @@ def to_file(self,\ # Replace the target string filedata = filedata.replace("{WALLTIMESTRING}", "%s:00:00" % self.walltime) filedata = filedata.replace("{CPUSTRING}", str(self.CPU)) - filedata = filedata.replace("{MAXCONVITERATIONS}", str(self.max_resubmissions-1)) + filedata = filedata.replace( + "{MAXCONVITERATIONS}", str(self.max_resubmissions - 1) + ) # Only on GADI filedata = filedata.replace("{MEMORYSTRING}", "%sGB" % self.RAM) @@ -99,33 +113,43 @@ def to_file(self,\ max_cpu_count = 128 elif self.HPC == "Garching": max_cpu_count = 40 - if self.CPU <= max_cpu_count: + if self.CPU <= max_cpu_count: filedata = filedata.replace("{NODESTRING}", "1") else: - filedata = filedata.replace("{NODESTRING}", "%s" % int(self.CPU/max_cpu_count)) - + filedata = filedata.replace( + "{NODESTRING}", "%s" % int(self.CPU / max_cpu_count) + ) + filedata = filedata.replace("{CASESTRING}", "%s" % case_name) if self.VASP_version == "5.4.4": - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp/%s" % self.VASP_version + ) else: if self.HPC == "Setonix" and self.VASP_version in ["6.3.0", "6.2.1"]: - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp6/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp6/%s" % self.VASP_version + ) else: - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp/%s' % self.VASP_version) + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp/%s" % self.VASP_version + ) if self.HPC == "Garching": # vasp/5.3-constrainedcollinearmagnetism vasp/5.4.4-buildFeb20 vasp/5.4.4-elphon vasp/5.4.4-python vasp/6.4.0-buildMar23 # vasp/5.4.4 vasp/5.4.4-Dudarev vasp/5.4.4-potentiostat vasp/6.4.0 vasp/6.4.0-python - filedata = filedata.replace("{VASPMODULELOADSTRING}", 'module load vasp/%s' % self.VASP_version) - + filedata = filedata.replace( + "{VASPMODULELOADSTRING}", "module load vasp/%s" % self.VASP_version + ) # Write the file out again - with open(os.path.join(output_path, case_name), 'w') as fout: + with open(os.path.join(output_path, case_name), "w") as fout: fout.write(filedata) fin.close() fout.close() + def stackElementString(structure): site_element_list = [site.species_string for site in structure] past_element = site_element_list[0] @@ -143,52 +167,54 @@ def stackElementString(structure): 
element_count.append(count) return element_list, element_count -def createPOTCAR(structure, path = os.getcwd()): + +def createPOTCAR(structure, path=os.getcwd()): element_list = stackElementString(structure)[0] potcar_paths = [] for element in element_list: if element == "Nb": - element = "Nb_sv" # Use 13 electron - element = "Nb_pv" # Use 11 electron + element = "Nb_sv" # Use 13 electron + element = "Nb_pv" # Use 11 electron elif element == "K": - element = "K_sv" # 9 electron - element = "K_pv" # 7 electron + element = "K_sv" # 9 electron + element = "K_pv" # 7 electron elif element == "Ca": - element = "Ca_sv" # 9 electron - element = "Ca_pv" # 7 electron + element = "Ca_sv" # 9 electron + element = "Ca_pv" # 7 electron elif element == "Rb": - element = "Rb_sv" # 9 electron - element = "Rb_pv" # 7 electron + element = "Rb_sv" # 9 electron + element = "Rb_pv" # 7 electron elif element == "Sr": - element = "Sr_sv" # 9 electron + element = "Sr_sv" # 9 electron elif element == "Cs": - element = "Cs_sv" # 9 electron + element = "Cs_sv" # 9 electron elif element == "Ba": - element = "Ba_sv" # 10 electron + element = "Ba_sv" # 10 electron elif element == "Fr": - element = "Fr_sv" # 9 electron + element = "Fr_sv" # 9 electron elif element == "Ra": - element = "Ra_sv" # 9 electron + element = "Ra_sv" # 9 electron elif element == "Y": - element = "Y_sv" # 9 electron + element = "Y_sv" # 9 electron elif element == "Zr": - element = "Zr_sv" # 10 electron + element = "Zr_sv" # 10 electron elif element == "Fr": - element = "Fr_sv" # 9 electron + element = "Fr_sv" # 9 electron elif element == "Ra": - element = "Ra_sv" # 9 electron + element = "Ra_sv" # 9 electron elif element == "Y": - element = "Y_sv" # 9 electron + element = "Y_sv" # 9 electron potcar_paths.append(os.path.join(potcar_library_path, element, "POTCAR")) - with open(os.path.join(path, "POTCAR"),'wb') as wfd: + with open(os.path.join(path, "POTCAR"), "wb") as wfd: for f in potcar_paths: - with open(f,'rb') as fd: + with open(f, "rb") as fd: shutil.copyfileobj(fd, wfd) - + + class KPOINTS: """ Class for KPOINTS object for passing into createJobFolder @@ -200,13 +226,12 @@ class KPOINTS: shift: optional shift of mesh, input as list e.g. 
[0, 0, 0] """ + def __init__(self, subdivs, shift): self.subdivs = subdivs self.shift = shift - def to_file(self,\ - case_name = 'KPOINTS',\ - filepath = os.getcwd()): + def to_file(self, case_name="KPOINTS", filepath=os.getcwd()): """ Writes KPOINTS file with MP gamma centred grid: @@ -215,48 +240,58 @@ def to_file(self,\ """ createFolder(filepath) - f = io.open(os.path.join(filepath, "KPOINTS"), 'w', newline='\n') - with open(os.path.join(filepath, "KPOINTS"), 'a', newline='\n') as f: + f = io.open(os.path.join(filepath, "KPOINTS"), "w", newline="\n") + with open(os.path.join(filepath, "KPOINTS"), "a", newline="\n") as f: # File name (just string on first line of KPOINTS) - f.write('%s\n' % case_name) + f.write("%s\n" % case_name) # Use automatic generation "0" - f.write('0\n') + f.write("0\n") # Monkhorst-Pack Gamma centred grid - f.write('Gamma\n') + f.write("Gamma\n") # Subdivisions along reciprocal lattice vectors - subdiv_string = '' + subdiv_string = "" for i in self.subdivs: subdiv_string += "%s " % str(i) - f.write('%s\n' % subdiv_string) + f.write("%s\n" % subdiv_string) # optional shift of the mesh (s_1, s_2, s_3) - shift_string = '' + shift_string = "" for i in self.shift: shift_string += "%s " % str(i) - f.write('%s\n' % shift_string) + f.write("%s\n" % shift_string) f.close() - -def createJobFolder(structure,\ - KPOINT = None,\ - folder_path = os.path.join(os.getcwd(), "jobfolder"),\ - INCAR = None,\ - jobfile = None,\ - quiet=True): + + +def createJobFolder( + structure, + KPOINT=None, + folder_path=os.path.join(os.getcwd(), "jobfolder"), + INCAR=None, + jobfile=None, + quiet=True, +): # This assumes that incar file base is present already, please adjust this function to adjust the incar flags # creates a subdirectory of chosen name in current directory parent_folder = os.getcwd() createFolder(folder_path) - structure.to(fmt="poscar", filename = os.path.join(folder_path, f"starter-{os.path.basename(folder_path)}.vasp")) - structure.to(fmt="poscar", filename = os.path.join(folder_path, "POSCAR")) + structure.to( + fmt="poscar", + filename=os.path.join( + folder_path, f"starter-{os.path.basename(folder_path)}.vasp" + ), + ) + structure.to(fmt="poscar", filename=os.path.join(folder_path, "POSCAR")) - createPOTCAR(structure, path = "%s" % folder_path) + createPOTCAR(structure, path="%s" % folder_path) INCAR.write_file(os.path.join(folder_path, "INCAR")) if KPOINT: - KPOINT.to_file(filepath = folder_path) + KPOINT.to_file(filepath=folder_path) - jobfile.to_file(case_name = '%s.sh' % os.path.basename(folder_path),\ - output_path = "%s" % (folder_path)) + jobfile.to_file( + case_name="%s.sh" % os.path.basename(folder_path), + output_path="%s" % (folder_path), + ) if not quiet: - print("Generating jobfolder, name %s" % (os.path.basename(folder_path))) \ No newline at end of file + print("Generating jobfolder, name %s" % (os.path.basename(folder_path))) diff --git a/utils/vasp/parser/outcar.py b/utils/vasp/parser/outcar.py index b024a1c..3e75a1e 100644 --- a/utils/vasp/parser/outcar.py +++ b/utils/vasp/parser/outcar.py @@ -17,6 +17,7 @@ scipy.constants.physical_constants["joule-electron volt relationship"][0] / 1e22 ) + class Outcar(object): """ This module is used to parse VASP OUTCAR files. 
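As a quick reference, a minimal sketch (not part of this patch) of driving the Outcar parser directly, mirroring how `read_OUTCAR` in utils/vasp/database.py consumes it; the OUTCAR path is a placeholder, and the `parse_dict` keys are the ones used there.

from utils.vasp.parser.outcar import Outcar

outcar = Outcar()
outcar.from_file(filename="some_run/OUTCAR")  # placeholder path

energies = outcar.parse_dict["energies"]  # energy per ionic step
forces = outcar.parse_dict["forces"]  # per-step force arrays
scf_per_step = [len(scf) for scf in outcar.parse_dict["scf_energies"]]
print(f"{len(energies)} ionic steps; SCF iterations per step: {scf_per_step}")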
@@ -120,10 +121,18 @@ def from_file(self, filename="OUTCAR"): "elapsed_time": elapsed_time, "memory_used": memory_used, } - self.parse_dict["ionic_stop_criteria"] = self.get_ionic_stop_criteria(filename=filename) - self.parse_dict["electronic_stop_criteria"] = self.get_electronic_stop_criteria(filename=filename) - self.parse_dict["max_electronic_steps"] = self.get_electronic_stop_criteria(filename=filename) - self.parse_dict["max_ionic_steps"] = self.get_electronic_stop_criteria(filename=filename) + self.parse_dict["ionic_stop_criteria"] = self.get_ionic_stop_criteria( + filename=filename + ) + self.parse_dict["electronic_stop_criteria"] = self.get_electronic_stop_criteria( + filename=filename + ) + self.parse_dict["max_electronic_steps"] = self.get_electronic_stop_criteria( + filename=filename + ) + self.parse_dict["max_ionic_steps"] = self.get_electronic_stop_criteria( + filename=filename + ) try: self.parse_dict["pressures"] = ( @@ -177,30 +186,34 @@ def from_hdf(self, hdf, group_name="outcar"): """ with hdf.open(group_name) as hdf5_output: for key in hdf5_output.list_nodes(): - self.parse_dict[key] = hdf5_output[key] - - def extract_value_from_line(self, line, position = 1, split = "="): + self.parse_dict[key] = hdf5_output[key] + + def extract_value_from_line(self, line, position=1, split="="): parts = line.split(split) if len(parts) > 1: return float(parts[position].strip().split()[0].strip(";")) return None - + def find_and_extract_value_from_matched_line(self, filename, search_term): - with open(filename, 'r') as file: + with open(filename, "r") as file: for line in file: if search_term in line: value = self.extract_value_from_line(line) - return value - + return value + def get_ionic_stop_criteria(self, filename="OUTCAR"): - return self.find_and_extract_value_from_matched_line(filename, "stopping-criterion for IOM") - + return self.find_and_extract_value_from_matched_line( + filename, "stopping-criterion for IOM" + ) + def get_electronic_stop_criteria(self, filename="OUTCAR"): - return self.find_and_extract_value_from_matched_line(filename, "stopping-criterion for ELM") - + return self.find_and_extract_value_from_matched_line( + filename, "stopping-criterion for ELM" + ) + def get_max_electronic_steps(self, filename="OUTCAR"): return self.find_and_extract_value_from_matched_line(filename, "NELM") - + def get_vasp_version(self, filename="OUTCAR", lines=None): return lines[0].lstrip().split(sep=" ")[0] @@ -218,11 +231,11 @@ def get_datetime(self, filename="OUTCAR", lines=None): if match: date_str, time_str = match.groups() # Combining the date and time strings - datetime_str = date_str + ' ' + time_str + datetime_str = date_str + " " + time_str # Converting to datetime object - datetime_obj = datetime.strptime(datetime_str, '%Y.%m.%d %H:%M:%S') + datetime_obj = datetime.strptime(datetime_str, "%Y.%m.%d %H:%M:%S") return datetime_obj - + def get_positions_and_forces(self, filename="OUTCAR", lines=None, n_atoms=None): """ Gets the forces and positions for every ionic step from the OUTCAR file @@ -253,7 +266,6 @@ def get_positions_and_forces(self, filename="OUTCAR", lines=None, n_atoms=None): ) def get_positions(self, filename="OUTCAR", lines=None, n_atoms=None): - """ Gets the positions for every ionic step from the OUTCAR file @@ -1187,12 +1199,14 @@ def get_energy_components(filename="OUTCAR", lines=None): [ np.hstack( [ - float(lines[ind + i].split()[-1]) - if i != 7 - else [ - float(lines[ind_lst[-1] + 7].split()[-2]), - float(lines[ind_lst[-1] + 7].split()[-1]), - ] + ( + 
float(lines[ind + i].split()[-1]) + if i != 7 + else [ + float(lines[ind_lst[-1] + 7].split()[-2]), + float(lines[ind_lst[-1] + 7].split()[-1]), + ] + ) for i in range(2, 12) ] ) @@ -1242,9 +1256,11 @@ def _split_indices(ind_ionic_lst, ind_elec_lst): """ ind_elec_array = np.array(ind_elec_lst) return [ - ind_elec_array[(ind_elec_array < j2) & (j1 < ind_elec_array)] - if j1 < j2 - else ind_elec_array[(ind_elec_array < j2)] + ( + ind_elec_array[(ind_elec_array < j2) & (j1 < ind_elec_array)] + if j1 < j2 + else ind_elec_array[(ind_elec_array < j2)] + ) for j1, j2 in zip(np.roll(ind_ionic_lst, 1), ind_ionic_lst) ] @@ -1263,4 +1279,4 @@ def _get_lines_from_file(filename, lines=None): if lines is None: with open(filename, "r") as f: lines = f.readlines() - return lines \ No newline at end of file + return lines diff --git a/utils/vasp/parser/output.py b/utils/vasp/parser/output.py index b864d23..c8a223d 100644 --- a/utils/vasp/parser/output.py +++ b/utils/vasp/parser/output.py @@ -13,7 +13,13 @@ from utils.vasp.parser.outcar import Outcar import utils.generic as gen_tools -def check_convergence(directory, filename_vasprun="vasprun.xml", filename_vasplog="vasp.log", backup_vasplog="error.out"): + +def check_convergence( + directory, + filename_vasprun="vasprun.xml", + filename_vasplog="vasp.log", + backup_vasplog="error.out", +): """ Check the convergence status of a VASP calculation. @@ -29,15 +35,26 @@ def check_convergence(directory, filename_vasprun="vasprun.xml", filename_vasplo vr = Vasprun(filename=os.path.join(directory, filename_vasprun)) return vr.converged except: - line_converged = "reached required accuracy - stopping structural energy minimisation" + line_converged = ( + "reached required accuracy - stopping structural energy minimisation" + ) try: - return gen_tools.is_line_in_file(os.path.join(directory, filename_vasplog), line=line_converged, exact_match=False) + return gen_tools.is_line_in_file( + os.path.join(directory, filename_vasplog), + line=line_converged, + exact_match=False, + ) except: try: - return gen_tools.is_line_in_file(os.path.join(directory, backup_vasplog), line=line_converged, exact_match=False) + return gen_tools.is_line_in_file( + os.path.join(directory, backup_vasplog), + line=line_converged, + exact_match=False, + ) except: return False + def process_error_archives(directory): """ Processes all tar or tar.gz files starting with 'error' in the specified directory and its subdirectories. @@ -48,10 +65,14 @@ def process_error_archives(directory): Returns: pd.DataFrame: DataFrame containing the processed VASP outputs from error archives. 
""" - error_files = [os.path.join(root, file) - for root, dirs, files in os.walk(directory) - for file in files if file.startswith('error') and (file.endswith('.tar') or file.endswith('.tar.gz'))] - + error_files = [ + os.path.join(root, file) + for root, dirs, files in os.walk(directory) + for file in files + if file.startswith("error") + and (file.endswith(".tar") or file.endswith(".tar.gz")) + ] + df_list = [] for error_file in error_files: with tempfile.TemporaryDirectory() as temp_dir: @@ -67,12 +88,15 @@ def process_error_archives(directory): print(f"Processing error dirs in {directory} complete.") return pd.concat(df_list) if df_list else pd.DataFrame() -def _get_vasp_outputs_from_files(structure, outcar_path="OUTCAR", incar_path="INCAR", kpoints_path="KPOINTS"): + +def _get_vasp_outputs_from_files( + structure, outcar_path="OUTCAR", incar_path="INCAR", kpoints_path="KPOINTS" +): file_data = { "POSCAR": [structure], "OUTCAR": [np.nan], "INCAR": [np.nan], - "KPOINTS": [np.nan] + "KPOINTS": [np.nan], } if os.path.isfile(outcar_path): @@ -96,15 +120,20 @@ def _get_vasp_outputs_from_files(structure, outcar_path="OUTCAR", incar_path="IN file_data["KPOINTS"] = [kpoints] except Exception as e: pass - + return pd.DataFrame(file_data) + def _get_vasp_outputs(directory, structure=None, parse_all_in_dir=True): - outcar_files = glob.glob(os.path.join(directory, "OUTCAR*")) if parse_all_in_dir else glob.glob(os.path.join(directory, "OUTCAR")) - + outcar_files = ( + glob.glob(os.path.join(directory, "OUTCAR*")) + if parse_all_in_dir + else glob.glob(os.path.join(directory, "OUTCAR")) + ) + if structure is None: structure = get_structure(directory) - + if outcar_files: data = [] for outcar_file in outcar_files: @@ -112,22 +141,36 @@ def _get_vasp_outputs(directory, structure=None, parse_all_in_dir=True): incar_file = os.path.join(directory, f"INCAR{suffix}") kpoints_file = os.path.join(directory, f"KPOINTS{suffix}") - output_df = _get_vasp_outputs_from_files(structure, outcar_path=outcar_file, incar_path=incar_file, kpoints_path=kpoints_file) + output_df = _get_vasp_outputs_from_files( + structure, + outcar_path=outcar_file, + incar_path=incar_file, + kpoints_path=kpoints_file, + ) data.append(output_df) data = pd.concat(data) else: - data = pd.DataFrame({"POSCAR": [structure], "OUTCAR": [np.nan], "INCAR": [np.nan], "KPOINTS": [np.nan]}) - + data = pd.DataFrame( + { + "POSCAR": [structure], + "OUTCAR": [np.nan], + "INCAR": [np.nan], + "KPOINTS": [np.nan], + } + ) + return data + def get_SCF_cycle_convergence(outcar_scf_arrays, threshold=1e-5): diff = outcar_scf_arrays[-1] - outcar_scf_arrays[-2] return abs(diff) < threshold + def _get_KPOINTS_info(KPOINTS, INCAR): try: if np.isnan(KPOINTS): - kpoints_key = 'KSPACING' + kpoints_key = "KSPACING" return f"KSPACING: {INCAR.get(kpoints_key, 0.5)}" else: return KPOINTS @@ -135,89 +178,117 @@ def _get_KPOINTS_info(KPOINTS, INCAR): print(e) return np.nan + def process_outcar(outcar, structure): if pd.isna(outcar) or pd.isna(structure): - warning_message = ("Both OUTCAR and structure data are missing. Returning DataFrame with np.nan values." - if pd.isna(outcar) and pd.isna(structure) else - "OUTCAR data is missing. Returning DataFrame with np.nan values for OUTCAR-related fields." - if pd.isna(outcar) else - "Structure data is missing. Returning DataFrame with np.nan values for structure-related fields.") + warning_message = ( + "Both OUTCAR and structure data are missing. Returning DataFrame with np.nan values." 
+ if pd.isna(outcar) and pd.isna(structure) + else ( + "OUTCAR data is missing. Returning DataFrame with np.nan values for OUTCAR-related fields." + if pd.isna(outcar) + else "Structure data is missing. Returning DataFrame with np.nan values for structure-related fields." + ) + ) warnings.warn(warning_message) - - return pd.DataFrame([{ - "calc_start_time": np.nan, - "consumed_time": np.nan, - "structures": np.nan, - "energy": np.nan, - "energy_zero": np.nan, - "forces": np.nan, - "stresses": np.nan, - "magmoms": np.nan, - "scf_steps": np.nan, - "scf_convergence": np.nan - }]) + + return pd.DataFrame( + [ + { + "calc_start_time": np.nan, + "consumed_time": np.nan, + "structures": np.nan, + "energy": np.nan, + "energy_zero": np.nan, + "forces": np.nan, + "stresses": np.nan, + "magmoms": np.nan, + "scf_steps": np.nan, + "scf_convergence": np.nan, + } + ] + ) try: energies = outcar.parse_dict["energies"] except: energies = np.nan - + try: - ionic_step_structures = np.array([Structure(cell, structure.species, outcar.parse_dict["positions"][i], coords_are_cartesian=True).to_json() - for i, cell in enumerate(outcar.parse_dict["cells"])]) + ionic_step_structures = np.array( + [ + Structure( + cell, + structure.species, + outcar.parse_dict["positions"][i], + coords_are_cartesian=True, + ).to_json() + for i, cell in enumerate(outcar.parse_dict["cells"]) + ] + ) except: ionic_step_structures = np.nan - + try: - energies_zero = outcar.parse_dict["energies_zero"] + energies_zero = outcar.parse_dict["energies_zero"] except: energies_zero = np.nan - + try: forces = outcar.parse_dict["forces"] except: forces = np.nan - + try: stresses = outcar.parse_dict["stresses"] except: stresses = np.nan - + try: magmoms = np.array(outcar.parse_dict["final_magmoms"]) except: magmoms = np.nan - + try: scf_steps = [len(i) for i in outcar.parse_dict["scf_energies"]] - scf_conv_list = [get_SCF_cycle_convergence(d, threshold=outcar.parse_dict["electronic_stop_criteria"]) for d in outcar.parse_dict["scf_energies"]] + scf_conv_list = [ + get_SCF_cycle_convergence( + d, threshold=outcar.parse_dict["electronic_stop_criteria"] + ) + for d in outcar.parse_dict["scf_energies"] + ] except Exception as e: print(e) scf_steps = np.nan scf_conv_list = np.nan - + try: calc_start_time = outcar.parse_dict["execution_datetime"] except: calc_start_time = np.nan - + try: consumed_time = outcar.parse_dict["resources"] except: consumed_time = np.nan - - return pd.DataFrame([{ - "calc_start_time": calc_start_time, - "consumed_time": consumed_time, - "structures": ionic_step_structures, - "energy": energies, - "energy_zero": energies_zero, - "forces": forces, - "stresses": stresses, - "magmoms": magmoms, - "scf_steps": scf_steps, - "scf_convergence": scf_conv_list - }]) + + return pd.DataFrame( + [ + { + "calc_start_time": calc_start_time, + "consumed_time": consumed_time, + "structures": ionic_step_structures, + "energy": energies, + "energy_zero": energies_zero, + "forces": forces, + "stresses": stresses, + "magmoms": magmoms, + "scf_steps": scf_steps, + "scf_convergence": scf_conv_list, + } + ] + ) + def get_structure(directory): """ @@ -229,29 +300,37 @@ def get_structure(directory): Returns: pymatgen.core.Structure: The structure object if successful, None otherwise. 
""" - structure_filenames = ["CONTCAR", "POSCAR"] + glob.glob(os.path.join(directory, "starter*.vasp")) + structure_filenames = ["CONTCAR", "POSCAR"] + glob.glob( + os.path.join(directory, "starter*.vasp") + ) for filename in structure_filenames: try: return Structure.from_file(os.path.join(directory, filename)) except Exception as e: - #print(f"Failed to parse structure file {filename}: {e}") + # print(f"Failed to parse structure file {filename}: {e}") pass print("Failed to parse appropriate structure file completely") return np.nan + def get_vasp_outputs(directory, extract_error_dirs=True, parse_all_in_dir=True): df_direct_outputs = _get_vasp_outputs(directory, parse_all_in_dir=parse_all_in_dir) - df_error_outputs = process_error_archives(directory) if extract_error_dirs else pd.DataFrame() + df_error_outputs = ( + process_error_archives(directory) if extract_error_dirs else pd.DataFrame() + ) return pd.concat([df_direct_outputs, df_error_outputs]) -def grab_electron_info(directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR"): + +def grab_electron_info( + directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR" +): structure = get_structure(directory_path) if structure: element_list, element_count = element_count_ordered(structure) - + electron_of_potcar = [] - with open(os.path.join(directory_path, potcar_filename), 'r') as file: + with open(os.path.join(directory_path, potcar_filename), "r") as file: lines = file.readlines() should_append = False for line in lines: @@ -261,13 +340,19 @@ def grab_electron_info(directory_path, line_before_elec_str="PAW_PBE", potcar_fi should_append = False if stripped_line.startswith(line_before_elec_str): should_append = True - + return element_list, element_count, electron_of_potcar -def get_total_electron_count(directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR"): - ele_list, ele_count, electron_of_potcar = grab_electron_info(directory_path, line_before_elec_str, potcar_filename) + +def get_total_electron_count( + directory_path, line_before_elec_str="PAW_PBE", potcar_filename="POTCAR" +): + ele_list, ele_count, electron_of_potcar = grab_electron_info( + directory_path, line_before_elec_str, potcar_filename + ) return np.dot(ele_count, electron_of_potcar) + def element_count_ordered(structure): site_element_list = [site.species_string for site in structure] past_element = site_element_list[0] @@ -283,10 +368,15 @@ def element_count_ordered(structure): count = 1 past_element = element element_count.append(count) - return element_list, element_count + return element_list, element_count + def parse_vasp_directory(directory, extract_error_dirs=True, parse_all_in_dir=True): - df = get_vasp_outputs(directory, extract_error_dirs=extract_error_dirs, parse_all_in_dir=parse_all_in_dir) + df = get_vasp_outputs( + directory, + extract_error_dirs=extract_error_dirs, + parse_all_in_dir=parse_all_in_dir, + ) results_df = [] kpoints_list = [] for _, row in df.iterrows(): @@ -296,9 +386,11 @@ def parse_vasp_directory(directory, extract_error_dirs=True, parse_all_in_dir=Tr results_df = pd.concat(results_df).sort_values(by="calc_start_time") results_df["KPOINTS"] = kpoints_list results_df["INCAR"] = df["INCAR"].tolist() - + try: - element_list, element_count, electron_of_potcar = grab_electron_info(directory_path=directory, potcar_filename="POTCAR") + element_list, element_count, electron_of_potcar = grab_electron_info( + directory_path=directory, potcar_filename="POTCAR" + ) except: element_list = np.nan 
element_count = np.nan diff --git a/utils/vasp/resubmitter.py b/utils/vasp/resubmitter.py index 104827f..e92691d 100644 --- a/utils/vasp/resubmitter.py +++ b/utils/vasp/resubmitter.py @@ -8,97 +8,163 @@ from utils.generic import get_latest_file_iteration from utils.jobfile import jobfile + def get_slurm_jobs_working_directories(username="hmai"): - command = f"squeue -u {username} -o \"%i %Z\"" + command = f'squeue -u {username} -o "%i %Z"' result = subprocess.run(command, shell=True, capture_output=True, text=True) output_lines = result.stdout.strip().split("\n")[1:] # Remove the header line - + # Parse the output lines into a list of tuples (job_id, working_directory) data = [line.split() for line in output_lines] - + # Create a Pandas DataFrame from the data df = pd.DataFrame(data, columns=["Job ID", "Working Directory"]) - + return df -class CalculationConverger(): - - def __init__(self, parent_dir, script_template_dir, max_submissions=1000, submission_command="sbatch", username="hmai"): + +class CalculationConverger: + + def __init__( + self, + parent_dir, + script_template_dir, + max_submissions=1000, + submission_command="sbatch", + username="hmai", + ): self.parent_dir = parent_dir self.max_submissions = max_submissions self.submission_command = submission_command - self.vasp_dirs = find_vasp_directories(parent_dir, filenames=["INCAR", "POTCAR"], all_present=True, extract_tarballs=False) + self.vasp_dirs = find_vasp_directories( + parent_dir, + filenames=["INCAR", "POTCAR"], + all_present=True, + extract_tarballs=False, + ) self.script_template_dir = script_template_dir self.user = username def submit_to_queue(self, dirpath, script_name): os.system(f"cd {dirpath} && {self.submission_command} {script_name}") - - def reconverge_all(self, calc_type="DRS", HPC="Setonix", VASP_version="5.4.4", CPU=128, walltime=24, cpu_per_node=128, from_dataframe_path=None): + + def reconverge_all( + self, + calc_type="DRS", + HPC="Setonix", + VASP_version="5.4.4", + CPU=128, + walltime=24, + cpu_per_node=128, + from_dataframe_path=None, + ): non_converged = self.load_non_converged_paths(from_dataframe_path) running_jobs_df = get_slurm_jobs_working_directories(self.user) running_queued_job_directories = running_jobs_df["Working Directory"].to_list() dirs_to_search_next_time, leftover_calcs_exceeding_queue_limit = [], [] - dirs_to_apply_reconverge = set(non_converged or self.vasp_dirs) - set(running_queued_job_directories) + dirs_to_apply_reconverge = set(non_converged or self.vasp_dirs) - set( + running_queued_job_directories + ) for i, dir in enumerate(dirs_to_apply_reconverge): if not check_convergence(dir): if i + len(running_queued_job_directories) > self.max_submissions: leftover_calcs_exceeding_queue_limit.append(dir) else: - self.reconverge(dir, calc_type, HPC, VASP_version, CPU, walltime, cpu_per_node) + self.reconverge( + dir, calc_type, HPC, VASP_version, CPU, walltime, cpu_per_node + ) dirs_to_search_next_time.append(dir) else: print(f"CONVERGED: {dir}") - self.update_resubmit_log(dirs_to_search_next_time + running_queued_job_directories + leftover_calcs_exceeding_queue_limit) + self.update_resubmit_log( + dirs_to_search_next_time + + running_queued_job_directories + + leftover_calcs_exceeding_queue_limit + ) return dirs_to_search_next_time def load_non_converged_paths(self, from_dataframe_path): if from_dataframe_path: df = pd.read_pickle(from_dataframe_path) - return [path.rstrip(os.sep + "OUTCAR") if path.endswith(os.sep + "OUTCAR") else path for path in df['filepath'].tolist()] + return [ 
+ ( + path.rstrip(os.sep + "OUTCAR") + if path.endswith(os.sep + "OUTCAR") + else path + ) + for path in df["filepath"].tolist() + ] return self.reconverge_from_log_file() - + def update_resubmit_log(self, dirs_to_search_next_time): with open(os.path.join(self.parent_dir, "resubmit.log"), "w") as log_file: for dir_path in dirs_to_search_next_time: log_file.write(dir_path + "\n") - def reconverge(self, dirpath, calc_type="SDRS", HPC="Setonix", VASP_version="5.4.4", CPU=128, walltime=24, cpu_per_node=128): + def reconverge( + self, + dirpath, + calc_type="SDRS", + HPC="Setonix", + VASP_version="5.4.4", + CPU=128, + walltime=24, + cpu_per_node=128, + ): self.handle_error_run_files(dirpath) reconverge_methods = { "static": self.reconverge_static, "SDRS": self.reconverge_SDRS, "DRS": self.reconverge_DRS, - "base": self.reconverge_base + "base": self.reconverge_base, } reconverge_method = reconverge_methods.get(calc_type, self.reconverge_base) reconverge_method(dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node) - + def handle_error_run_files(self, dirpath): - error_tar_files_exist = any("error" in f and "tar" in f for f in os.listdir(dirpath)) + error_tar_files_exist = any( + "error" in f and "tar" in f for f in os.listdir(dirpath) + ) if error_tar_files_exist: latest_error_run_index = self.find_latest_error_run_index(dirpath) - error_run_folder_path = os.path.join(dirpath, f"error_run_{latest_error_run_index + 1}") + error_run_folder_path = os.path.join( + dirpath, f"error_run_{latest_error_run_index + 1}" + ) os.makedirs(error_run_folder_path) self.move_files_to_error_run_folder(dirpath, error_run_folder_path) def move_files_to_error_run_folder(self, dirpath, error_run_folder_path): for f in os.listdir(dirpath): if ("error" in f and "tar" in f) or f.endswith(".sh"): - shutil.move(os.path.join(dirpath, f), os.path.join(error_run_folder_path, f)) + shutil.move( + os.path.join(dirpath, f), os.path.join(error_run_folder_path, f) + ) for og_file in ["INCAR.orig", "POSCAR.orig", "KPOINTS.orig", "custodian.json"]: if os.path.exists(os.path.join(dirpath, og_file)): - shutil.move(os.path.join(dirpath, og_file), os.path.join(error_run_folder_path, og_file)) - - for current_run in ["INCAR", "POSCAR", "POTCAR", "OUTCAR", "vasprun.xml", "vasp.log"]: + shutil.move( + os.path.join(dirpath, og_file), + os.path.join(error_run_folder_path, og_file), + ) + + for current_run in [ + "INCAR", + "POSCAR", + "POTCAR", + "OUTCAR", + "vasprun.xml", + "vasp.log", + ]: if os.path.exists(os.path.join(dirpath, current_run)): - shutil.copy(os.path.join(dirpath, current_run), os.path.join(error_run_folder_path, current_run)) - + shutil.copy( + os.path.join(dirpath, current_run), + os.path.join(error_run_folder_path, current_run), + ) + def find_latest_error_run_index(self, dirpath): error_run_indices = [0] for f in os.listdir(dirpath): @@ -109,24 +175,56 @@ def find_latest_error_run_index(self, dirpath): except ValueError as e: print(f"Exception occurred at {dirpath}: {e}") return max(error_run_indices) - + def generate_custodian_string(self, template_filename, user_inputs): template_path = os.path.join(self.script_template_dir, template_filename) return jobfile._replace_fields(template_path, user_inputs) def reconverge_base(self, dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node): - self.reconverge_generic(dirpath, "template_BASE.py", HPC, VASP_version, CPU, walltime, cpu_per_node) + self.reconverge_generic( + dirpath, "template_BASE.py", HPC, VASP_version, CPU, walltime, cpu_per_node + ) - def 
reconverge_static(self, dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node): - self.reconverge_generic(dirpath, "template_Static.py", HPC, VASP_version, CPU, walltime, cpu_per_node) + def reconverge_static( + self, dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node + ): + self.reconverge_generic( + dirpath, + "template_Static.py", + HPC, + VASP_version, + CPU, + walltime, + cpu_per_node, + ) def reconverge_DRS(self, dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node): stages_left = self.get_stages_left(dirpath, ["relax_1", "relax_2"], 3) - self.reconverge_generic(dirpath, "template_DRS.py", HPC, VASP_version, CPU, walltime, cpu_per_node, {"{STAGES_LEFT}": str(stages_left)}) + self.reconverge_generic( + dirpath, + "template_DRS.py", + HPC, + VASP_version, + CPU, + walltime, + cpu_per_node, + {"{STAGES_LEFT}": str(stages_left)}, + ) def reconverge_SDRS(self, dirpath, HPC, VASP_version, CPU, walltime, cpu_per_node): - stages_left = self.get_stages_left(dirpath, ["static_1", "relax_1", "relax_2"], 4) - self.reconverge_generic(dirpath, "template_SDRS.py", HPC, VASP_version, CPU, walltime, cpu_per_node, {"{STAGES_LEFT}": str(stages_left)}) + stages_left = self.get_stages_left( + dirpath, ["static_1", "relax_1", "relax_2"], 4 + ) + self.reconverge_generic( + dirpath, + "template_SDRS.py", + HPC, + VASP_version, + CPU, + walltime, + cpu_per_node, + {"{STAGES_LEFT}": str(stages_left)}, + ) def get_stages_left(self, dirpath, stage_markers, default_stages_left): for i, marker in enumerate(reversed(stage_markers)): @@ -134,17 +232,41 @@ def get_stages_left(self, dirpath, stage_markers, default_stages_left): return i + 1 return default_stages_left - def reconverge_generic(self, dirpath, template_filename, HPC, VASP_version, CPU, walltime, cpu_per_node, extra_inputs=None): + def reconverge_generic( + self, + dirpath, + template_filename, + HPC, + VASP_version, + CPU, + walltime, + cpu_per_node, + extra_inputs=None, + ): user_inputs = { - '{VASPOUTPUTFILENAME}': '"vasp.log"', - '{MAXCUSTODIANERRORS}': "20" + "{VASPOUTPUTFILENAME}": '"vasp.log"', + "{MAXCUSTODIANERRORS}": "20", } if extra_inputs: user_inputs.update(extra_inputs) - custodian_string = self.generate_custodian_string(template_filename, user_inputs) - script_name = os.path.join(self.script_template_dir, f"{template_filename.split('_')[0]}_Custodian_{HPC}.sh") - job = jobfile(file_path=script_name, HPC=HPC, VASP_version=VASP_version, CPU=CPU, walltime=walltime, cpu_per_node=cpu_per_node, generic_insert_field=["{CUSTODIANSTRING}"], generic_insert=[custodian_string]) + custodian_string = self.generate_custodian_string( + template_filename, user_inputs + ) + script_name = os.path.join( + self.script_template_dir, + f"{template_filename.split('_')[0]}_Custodian_{HPC}.sh", + ) + job = jobfile( + file_path=script_name, + HPC=HPC, + VASP_version=VASP_version, + CPU=CPU, + walltime=walltime, + cpu_per_node=cpu_per_node, + generic_insert_field=["{CUSTODIANSTRING}"], + generic_insert=[custodian_string], + ) target_script_name = f"{os.path.basename(dirpath)}.sh" job.to_file(job_name=target_script_name, output_path=dirpath) self.submit_to_queue(dirpath, target_script_name) @@ -156,7 +278,10 @@ def reconverge_from_log_file(self): non_converged_dirs = [line.strip() for line in log_file.readlines()] largest_n = get_latest_file_iteration(self.parent_dir, "resubmit.log_") - os.rename(resubmit_log_file, os.path.join(self.parent_dir, f"resubmit.log_{largest_n + 1}")) + os.rename( + resubmit_log_file, + os.path.join(self.parent_dir, 
f"resubmit.log_{largest_n + 1}"), + ) return non_converged_dirs else: diff --git a/utils/vasp/vasp_potential_training_database.py b/utils/vasp/vasp_potential_training_database.py index a4b6498..e69de29 100644 --- a/utils/vasp/vasp_potential_training_database.py +++ b/utils/vasp/vasp_potential_training_database.py @@ -1 +0,0 @@ -def \ No newline at end of file