Skip to content

Commit

Permalink
update potential_df generator
Browse files Browse the repository at this point in the history
  • Loading branch information
Han Lin Mai committed Jun 18, 2024
1 parent 3e347c5 commit 6fceec5
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 31 deletions.
50 changes: 29 additions & 21 deletions actual_usage/build_vasp_potential_training_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import warnings
from multiprocessing import cpu_count


def main():
warnings.filterwarnings("ignore")

Expand Down Expand Up @@ -36,6 +35,11 @@ def main():
default=True,
help="Read directories with errors",
)
parser.add_argument(
"--use_total_energy_over_eVatom",
action="store_true",
help="Use total energy for filtering instead of eV/atom",
)
args = parser.parse_args()

datagen = DatabaseGenerator(args.directory, max_workers=cpu_count())
Expand All @@ -45,25 +49,29 @@ def main():
max_dir_count = args.max_dir_count
else:
max_dir_count = 2000 # Default value

df = datagen.build_potential_database(extract_directories=args.extract,
read_multiple_runs_in_dir=args.read_all_runs_in_dir,
read_error_dirs=args.read_error_runs_in_dir,
max_dir_count=max_dir_count,
tarball_extensions=(".tar.gz", ".tar.bz2"),
cleanup=False,
keep_filenames_after_cleanup=[],
keep_filename_patterns_after_cleanup=[],
filenames_to_qualify=["OUTCAR"],#, "vasprun.xml"],
all_present=True,
df_filename=None,
df_compression=True,
incar_checks={"ENCUT": 400,
"LREAL": "Auto"
},
energy_threshold=0.2
)
df.to_pickle("potential_training_df.pkl.gz",
compression="gzip")

df = datagen.build_potential_database(
extract_directories=args.extract,
read_multiple_runs_in_dir=args.read_all_runs_in_dir,
read_error_dirs=args.read_error_runs_in_dir,
max_dir_count=max_dir_count,
tarball_extensions=(".tar.gz", ".tar.bz2"),
cleanup=False,
keep_filenames_after_cleanup=[],
keep_filename_patterns_after_cleanup=[],
filenames_to_qualify=["OUTCAR"], # "vasprun.xml"],
all_present=True,
df_filename=None,
df_compression=True,
incar_checks={
"ENCUT": 400,
"LREAL": "Auto"
},
energy_threshold=0.1,
use_ev_atom=not args.use_total_energy_over_eVatom # Toggle based on the argument
)

df.to_pickle("potential_training_df.pkl.gz", compression="gzip")

# Run the CLI entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
21 changes: 11 additions & 10 deletions utils/vasp/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,9 @@ def build_potential_database(
df_filename=None,
df_compression=True,
incar_checks={},
energy_threshold=0.1 # eV/atom
energy_threshold=0.1, # eV/atom
use_ev_atom=True # Toggle for using eV_atom or energy
):

def drop_nan_structures_rows(df, column_name="structures"):
    """Drop rows whose *column_name* cell is a list consisting entirely of NaN.

    Rows whose cell is not a list, or whose list holds at least one non-NaN
    entry, are kept. Returns a new DataFrame with the index reset to 0..n-1.
    """
    def _is_all_nan_list(cell):
        # Only list-valued cells can qualify for removal; a single real
        # value anywhere in the list keeps the row.
        return isinstance(cell, list) and all(pd.isna(item) for item in cell)

    keep_mask = ~df[column_name].apply(_is_all_nan_list)
    return df[keep_mask].reset_index(drop=True)

Expand Down Expand Up @@ -862,7 +862,7 @@ def _check_INCAR_param(incar):
df['INCAR_ok'] = df['INCAR'].apply(_check_INCAR_param)
return df

def filter_by_ev_atom(df, column="eV_atom", threshold=0.1):
def filter_by_energy(df, column, threshold=0.1):
df_sorted = df.sort_values(by=column).reset_index(drop=True)
filtered_rows = []

Expand All @@ -876,11 +876,11 @@ def filter_by_ev_atom(df, column="eV_atom", threshold=0.1):

def apply_filter_to_groups(df, group_column="job_name", filter_column="eV_atom", threshold=0.1):
    """Run the energy-threshold filter independently inside each group.

    Groups *df* by *group_column*, applies ``filter_by_energy`` to every
    group using *filter_column* and *threshold*, and returns the combined
    result with a fresh 0..n-1 index.
    """
    per_group = df.groupby(group_column).apply(
        lambda grp: filter_by_energy(grp, column=filter_column, threshold=threshold)
    )
    return per_group.reset_index(drop=True)

df = self.build_database(
target_directory=target_directory,
extract_directories=extract_directories,
Expand All @@ -907,10 +907,12 @@ def apply_filter_to_groups(df, group_column="job_name", filter_column="eV_atom",
df = df.dropna(subset=['energy'])
df["structures"] = df.structures.apply(lambda x: Structure.from_str(x, fmt="json"))
df["n_atoms"] = df.structures.apply(lambda x: x.num_sites)
df["eV_atom"] = df.energy/df.n_atoms

df = apply_filter_to_groups(df, group_column="job_name", filter_column="eV_atom", threshold=energy_threshold)

df["eV_atom"] = df.energy / df.n_atoms

df = df[df["INCAR_ok"] == True]
filter_column = "eV_atom" if use_ev_atom else "energy"
df = apply_filter_to_groups(df, group_column="job_name", filter_column=filter_column, threshold=energy_threshold)

return df
# def update_database(self,
# new_calculation_directory,
Expand Down Expand Up @@ -951,7 +953,6 @@ def apply_filter_to_groups(df, group_column="job_name", filter_column="eV_atom",

# return base_df


def update_database(df_base, df_update):
# Get the unique job names from df2
df_update_jobs = set(df_update["job_name"])
Expand Down

0 comments on commit 6fceec5

Please sign in to comment.