diff --git a/README.md b/README.md index 2cbb2e92..ab0cbe57 100644 --- a/README.md +++ b/README.md @@ -32,13 +32,15 @@ Latest Update: follow up by clicking `Starred` and `Watch` on our [GitHub repos The following figure is the simplified overview of Geochemistry π:
-![Overview of workflow](https://github.com/ZJUEarthData/geochemistrypi/assets/47497750/28e174f0-1f2f-4367-96bd-9526352101bd) +

+ Overview of workflow +

The following figure is the frontend-backend separation architecture of Geochemistry:
-
- Frontend-backend separation architecture of Geochemistry -
+

+ Frontend-backend separation architecture of Geochemistry +

## Quick Installation @@ -140,6 +142,11 @@ Its data section provides feature engineering based on **arithmatic operation**. Its models section provides both **supervised learning** and **unsupervised learning** methods from **Scikit-learn** framework, including four types of algorithms, regression, classification, clustering, and dimensional reduction. Integrated with **FLAML** and **Ray** framework, it allows the users to run AutoML easily, fastly and cost-effectively on the built-in supervised learning algorithms in our framework. +The following figure is the hierarchical architecture of Geochemistry π: +

+ Hierarchical Architecture +

+ ### Second Phase Currently, we are building three access ways to provide more user-friendly service, including **web portal**, **CLI package** and **API**. It allows the user to perform **continuous training** and **model inference** by automating the ML pipeline and **machine learning lifecycle management** by unique storage mechanism in different access layers. @@ -151,9 +158,9 @@ The following figure is the system architecture diagram:
The following figure is the customized automated ML pipeline:
-
+

Customized automated ML pipeline -

+

The following figure is the design pattern hierarchical architecture:
@@ -162,9 +169,9 @@ The following figure is the design pattern hierarchical architecture:
The following figure is the storage mechanism:
-
+

Storage Mechanism -

+

The whole package is under construction and the documentation is progressively evolving. diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index bd8c2840..d6cb27d7 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -65,20 +65,20 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N print("\n[bold blue]Welcome to Geochemistry π![/bold blue]") print("[bold]Initializing...[/bold]") - # <-- User Data Loading --> - with console.status("[bold green]Data Loading...[/bold green]", spinner="dots"): - sleep(1) + # <-- User Training Data Loading --> + with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"): + sleep(0.75) if training_data_path: - # If the user provides file name, then load the data from the file. + # If the user provides file name, then load the training data from the file. data = read_data(file_path=training_data_path, is_own_data=1) - print("[bold green]Successfully Loading Own Data![bold green]") + print("[bold green]Successfully Loading Own Training Data![bold green]") else: - print("[bold red]No Data File Provided![/bold red]") + print("[bold red]No Training Data File Provided![/bold red]") print("[bold green]Built-in Data Loading.[/bold green]") # <-- User Inference Data Loading --> with console.status("[bold green]Inference Data Loading...[/bold green]", spinner="dots"): - sleep(1) + sleep(0.75) is_built_in_inference_data = False if training_data_path and inference_data_path: # If the user provides file name, then load the inference data from the file. @@ -95,7 +95,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <-- Dependency Checking --> with console.status("[bold green]Denpendency Checking...[/bold green]", spinner="dots"): - sleep(1.5) + sleep(0.75) my_os = get_os() # Check the dependency of the basemap or cartopy to project the data on the world map later. if my_os == "Windows" or my_os == "Linux": @@ -143,9 +143,10 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N experiment = mlflow.get_experiment(experiment_id=old_experiment_id) else: new_experiment_name = Prompt.ask("✨ New Experiment", default="GeoPi - Rock Classification") - new_experiment_tag = Prompt.ask("✨ Experiment Tag Version", default="E - v1.0.0") + # new_experiment_tag = Prompt.ask("✨ Experiment Tag Version", default="E - v1.0.0") try: - new_experiment_id = mlflow.create_experiment(name=new_experiment_name, artifact_location=artifact_localtion, tags={"version": new_experiment_tag}) + # new_experiment_id = mlflow.create_experiment(name=new_experiment_name, artifact_location=artifact_localtion, tags={"version": new_experiment_tag}) + new_experiment_id = mlflow.create_experiment(name=new_experiment_name, artifact_location=artifact_localtion) except mlflow.exceptions.MlflowException as e: if "already exists" in str(e): console.print(" The experiment name already exists.", style="bold red") @@ -157,26 +158,27 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N experiment = mlflow.get_experiment(experiment_id=new_experiment_id) # print("Artifact Location: {}".format(experiment.artifact_location)) run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm - Test 1") - run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0") - run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.") - mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description}) + # run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0") + # run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.") + # mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description}) + mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id) create_geopi_output_dir(experiment.name, run_name) clear_output() - # <--- Built-in Data Loading ---> - logger.debug("Built-in Data Loading") - # If the user doesn't provide the training data path, then use the built-in data. + # <--- Built-in Training Data Loading ---> + logger.debug("Built-in Training Data Loading") + # If the user doesn't provide the training data path, then use the built-in training data. if not training_data_path: - print("-*-*- Built-in Data Option-*-*-") + print("-*-*- Built-in Training Data Option-*-*-") num2option(TEST_DATA_OPTION) - built_in_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input) - if built_in_data_num == 1: + built_in_training_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input) + if built_in_training_data_num == 1: training_data_path = "Data_Regression.xlsx" - elif built_in_data_num == 2: + elif built_in_training_data_num == 2: training_data_path = "Data_Classification.xlsx" - elif built_in_data_num == 3: + elif built_in_training_data_num == 3: training_data_path = "Data_Clustering.xlsx" - elif built_in_data_num == 4: + elif built_in_training_data_num == 4: training_data_path = "Data_Decomposition.xlsx" data = read_data(file_path=training_data_path) print(f"Successfully loading the built-in training data set '{training_data_path}'.") @@ -462,6 +464,9 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N print("You did not enter inference data.") inference_data_fe_selected = None clear_output() + else: + # If the model is unsupervised learning, then don't allow the user to use model inference. + inference_data_fe_selected = None # <--- Model Training ---> logger.debug("Model Training")