From 20677762198a57f653d2bdc34c77464ef760303b Mon Sep 17 00:00:00 2001
From: YuanbinLiu <lyb122502@126.com>
Date: Fri, 8 Nov 2024 07:11:59 +0000
Subject: [PATCH] fix minor issues

---
 autoplex/auto/rss/flow.py        | 373 ++++++++++++++++---------------
 autoplex/data/common/flows.py    |  12 +-
 autoplex/data/common/utils.py    |   7 +-
 autoplex/data/rss/flows.py       |   1 -
 autoplex/fitting/common/utils.py |  17 +-
 5 files changed, 210 insertions(+), 200 deletions(-)

diff --git a/autoplex/auto/rss/flow.py b/autoplex/auto/rss/flow.py
index 7075e751..40fec7db 100644
--- a/autoplex/auto/rss/flow.py
+++ b/autoplex/auto/rss/flow.py
@@ -41,191 +41,194 @@ def make(self, config_file: str | None = None, **kwargs):
             Path to the configuration file that defines the setup parameters for the whole RSS workflow.
             If not provided, the default file 'rss_default_configuration.yaml' will be used.
         **kwargs: dict, optional
-            Additional optional keyword arguments to customize the job execution. The allowed keys and values are:
-            - tag: str
-                Tag of systems. It can also be used for setting up elements and stoichiometry.
-                For example, the tag of 'SiO2' will be recognized as a 1:2 ratio of Si to O and
-                passed into the parameters of buildcell. However, note that this will be overwritten
-                if the stoichiometric ratio of elements is defined in the 'buildcell_options'.
-            - train_from_scratch : bool
-                If True, it starts the workflow from scratch.
-                If False, it resumes from a previous state.
-            - resume_from_previous_state: dict | None
-                A dictionary containing the state information required to resume a previously interrupted
-                or saved RSS workflow. When 'train_from_scratch' is set to False, this parameter is mandatory
-                for the workflow to pick up from a saved state.
-                Expected keys within this dictionary:
-                - test_error: float
-                    The test error from the last completed training step.
-                - pre_database_dir: str
-                    Path to the directory containing the pre-existing database for resuming.
-                - mlip_path: str
-                    Path to the file of a previous MLIP model.
-                - isolated_atom_energies: dict
-                    A dictionary of isolated atom energy values, with atomic numbers as keys
-                    and their energies as valuables.
-            - generated_struct_numbers: list[int]
-                Expected number of generated randomized unit cells by buildcell.
-            - buildcell_options: list[dict] | None
-                Customized parameters for buildcell. Default is None.
-            - fragment: Atoms | list[Atoms] | None
-                Fragment(s) for random structures, e.g., molecules, to be placed indivudally intact.
-                atoms.arrays should have a 'fragment_id' key with unique identifiers for each fragment if in same Atoms.
-                atoms.cell must be defined (e.g., Atoms.cell = np.eye(3)*20).
-            - fragment_numbers: list[str] | None
-                Numbers of each fragment to be included in the random structures. Defaults to 1 for all specified.
-            - num_processes_buildcell: int
-                Number of processes to use for parallel computation during buildcell generation.
-            - num_of_initial_selected_structs: list[int] | None
-                Number of structures to be sampled directly from the buildcell-generated randomized cells.
-            - num_of_rss_selected_structs: int
-                Number of structures to be selected from each RSS iteration.
-            - initial_selection_enabled: bool
-                If true, sample structures from initially generated randomized cells using CUR.
-            - rss_selection_method: str
-                Method for selecting samples from the RSS trajectories:
-                Boltzmann flat histogram in enthalpy first, then CUR.
-                Options include:
-                - 'bcur1s': Execute bcur with one shot (1s)
-                - 'bcur2i': Execute bcur with two iterations (2i)
-            - bcur_params: dict | None
-                Parameters for Boltzmann CUR selection. The default dictionary includes:
-                - soap_paras: dict
-                    SOAP descriptor parameters:
-                    - l_max: int
-                        Maximum degree of spherical harmonics (default 12).
-                    - n_max: int
-                        Maximum number of radial basis functions (default 12).
-                    - atom_sigma: float
-                        Width of Gaussian smearing (default 0.0875).
-                    - cutoff: float
-                        Radial cutoff distance (default 10.5).
-                    - cutoff_transition_width: float
-                        Width of the transition region (default 1.0).
-                    - zeta: float
-                        Exponent for dot-product SOAP kernel (default 4.0).
-                    - average: bool
-                        Whether to average the SOAP vectors (default True).
-                    - species: bool
-                        Whether to consider species information (default True).
-                - kt: float
-                    Temperature in eV for Boltzmann weighting (default 0.3).
-                - frac_of_bcur: float
-                    Fraction of Boltzmann CUR selections (default 0.8).
-                - bolt_max_num: int
-                    Maximum number of Boltzmann selections (default 3000).
-                - kernel_exp: float
-                    Exponent for the kernel (default 4.0).
-                - energy_label: str
-                    Label for the energy data (default 'energy').
-            - random_seed: int | None
-                A seed to ensure reproducibility of CUR selection. Default is None.
-            - include_isolated_atom: bool
-                If true, perform single-point calculations for isolated atoms.
-            - isolatedatom_box: list[float]
-                List of the lattice constants for an isolated atom configuration.
-            - e0_spin: bool
-                If true, include spin polarization in isolated atom and dimer calculations. Default is False.
-            - include_dimer: bool
-                If true, perform single-point calculations for dimers only once. Default is False.
-            - dimer_box: list[float]
-                The lattice constants of a dimer box.
-            - dimer_range: list[float]
-                Range of distances for dimer calculations.
-            - dimer_num: int
-                Number of different distances to consider for dimer calculations. Default is 21.
-            - custom_incar: dict | None
-                Dictionary of custom VASP input parameters. If provided, will update the
-                default parameters. Default is None.
-            - custom_potcar: dict | None
-                Dictionary of POTCAR settings to update. Keys are element symbols, values are the desired POTCAR labels.
-                Default is None.
-            - vasp_ref_file: str
-                Reference file for VASP data. Default is 'vasp_ref.extxyz'.
-            - config_types: list[str]
-                Configuration types for the VASP calculations. Default is None.
-            - rss_group: list[str]
-                Group name for RSS to setting up regularization.
-            - test_ratio: float
-                The proportion of the test set after splitting the data. The value is allowed to be set to 0;
-                in this case, the testing error would not be meaningful anymore.
-            - regularization: bool
-                If True, apply regularization. This only works for GAP to date. Default is False.
-            - scheme: str
-                Method to use for regularization. Options are:
-                - 'linear_hull': for single-composition system, use 2D convex hull (E, V)
-                - 'volume-stoichiometry': for multi-composition system, use 3D convex hull of (E, V, mole-fraction)
-            - reg_minmax: list[tuple]
-                list of tuples of (min, max) values for energy, force, virial sigmas for regularization.
-            - distillation: bool
-                If true, apply data distillation. Default is True.
-            - force_max: float | None
-                Maximum force value to exclude structures. Default is 50.
-            - force_label: str | None
-                The label of force values to use for distillation. Default is 'REF_forces'.
-            - pre_database_dir: str | None
-                Directory where the previous database was saved.
-            - mlip_type: str
-                Choose one specific MLIP type to be fitted: 'GAP' | 'J-ACE' | 'P-ACE' | 'NEQUIP' | 'M3GNET' | 'MACE'.
-                Default is 'GAP'.
-            - ref_energy_name: str
-                Reference energy name. Default is 'REF_energy'.
-            - ref_force_name: str
-                Reference force name. Default is 'REF_forces'.
-            - ref_virial_name: str
-                Reference virial name. Default is 'REF_virial'.
-            - auto_delta: bool
-                If true, apply automatic determination of delta for GAP terms. Default is False.
-            - num_processes_fit: int
-                Number of processes used for fitting. Default is 1.
-            - device_for_fitting: str
-                Device to be used for model fitting, either "cpu" or "cuda".
-            - **fit_kwargs:
-                Additional keyword arguments for the MLIP fitting process.
-            - scalar_pressure_method: str
-                Method for adding external pressures.
-                Acceptable options are:
-                - 'exp': Applies pressure using an exponential distribution.
-                - 'uniform': Applies pressure using a uniform distribution.
-            - scalar_exp_pressure: float
-                Scalar exponential pressure. Default is 100.
-            - scalar_pressure_exponential_width: float
-                Width for scalar pressure exponential. Default is 0.2.
-            - scalar_pressure_low: float
-                Low limit for scalar pressure. Default is 0.
-            - scalar_pressure_high: float
-                High limit for scalar pressure. Default is 50.
-            - max_steps: int
-                Maximum number of steps for relaxation. Default is 200.
-            - force_tol: float
-                Force residual tolerance for relaxation. Default is 0.05.
-            - stress_tol: float
-                Stress residual tolerance for relaxation. Default is 0.05.
-            - hookean_repul: bool
-                If true, apply Hookean repulsion. Default is False.
-            - hookean_paras: dict[tuple[int, int], tuple[float, float]] | None
-                Parameters for Hookean repulsion as a dictionary of tuples. Default is None.
-            - keep_symmetry: bool
-                If true, preserve symmetry during relaxation. Default is False.
-            - write_traj: bool
-                If true, write trajectory of RSS. Default is True.
-            - num_processes_rss: int
-                Number of processes used for running RSS. Default is 1.
-            - device_for_rss: str
-                Specify device to use "cuda" or "cpu" for running RSS. Default is "cpu".
-            - stop_criterion: float
-                Convergence criterion for stopping RSS iterations. Default is 0.01.
-            - max_iteration_number: int
-                Maximum number of RSS iterations to perform. Default is 25.
-            - num_groups: int
-                Number of structure groups, used for assigning tasks across multiple nodes.
-                For example, if there are 10,000 trajectories to relax and 'num_groups=10',
-                the trajectories will be divided into 10 groups and 10 independent jobs will be created,
-                with each job handling 1,000 trajectories.
-            - initial_kt: float
-                Initial temperature (in eV) for Boltzmann sampling. Default is 0.3.
-            - current_iter_index: int
-                Index for the current RSS iteration. Default is 1.
+            Additional optional keyword arguments to customize the job execution.
+
+        Keyword Arguments
+        -----------------
+        - tag: str
+            Tag of systems. It can also be used for setting up elements and stoichiometry.
+            For example, the tag of 'SiO2' will be recognized as a 1:2 ratio of Si to O and
+            passed into the parameters of buildcell. However, note that this will be overwritten
+            if the stoichiometric ratio of elements is defined in the 'buildcell_options'.
+        - train_from_scratch : bool
+            If True, it starts the workflow from scratch.
+            If False, it resumes from a previous state.
+        - resume_from_previous_state: dict | None
+            A dictionary containing the state information required to resume a previously interrupted
+            or saved RSS workflow. When 'train_from_scratch' is set to False, this parameter is mandatory
+            for the workflow to pick up from a saved state.
+            Expected keys within this dictionary:
+            - test_error: float
+                The test error from the last completed training step.
+            - pre_database_dir: str
+                Path to the directory containing the pre-existing database for resuming.
+            - mlip_path: str
+                Path to the file of a previous MLIP model.
+            - isolated_atom_energies: dict
+                A dictionary of isolated atom energy values, with atomic numbers as keys
+                and their energies as values.
+        - generated_struct_numbers: list[int]
+            Expected number of generated randomized unit cells by buildcell.
+        - buildcell_options: list[dict] | None
+            Customized parameters for buildcell. Default is None.
+        - fragment: Atoms | list[Atoms] | None
+            Fragment(s) for random structures, e.g., molecules, to be placed individually intact.
+            atoms.arrays should have a 'fragment_id' key with unique identifiers for each fragment if in same Atoms.
+            atoms.cell must be defined (e.g., Atoms.cell = np.eye(3)*20).
+        - fragment_numbers: list[str] | None
+            Numbers of each fragment to be included in the random structures. Defaults to 1 for all specified.
+        - num_processes_buildcell: int
+            Number of processes to use for parallel computation during buildcell generation.
+        - num_of_initial_selected_structs: list[int] | None
+            Number of structures to be sampled directly from the buildcell-generated randomized cells.
+        - num_of_rss_selected_structs: int
+            Number of structures to be selected from each RSS iteration.
+        - initial_selection_enabled: bool
+            If true, sample structures from initially generated randomized cells using CUR.
+        - rss_selection_method: str
+            Method for selecting samples from the RSS trajectories:
+            Boltzmann flat histogram in enthalpy first, then CUR.
+            Options include:
+            - 'bcur1s': Execute bcur with one shot (1s)
+            - 'bcur2i': Execute bcur with two iterations (2i)
+        - bcur_params: dict | None
+            Parameters for Boltzmann CUR selection. The default dictionary includes:
+            - soap_paras: dict
+                SOAP descriptor parameters:
+                - l_max: int
+                    Maximum degree of spherical harmonics (default 12).
+                - n_max: int
+                    Maximum number of radial basis functions (default 12).
+                - atom_sigma: float
+                    Width of Gaussian smearing (default 0.0875).
+                - cutoff: float
+                    Radial cutoff distance (default 10.5).
+                - cutoff_transition_width: float
+                    Width of the transition region (default 1.0).
+                - zeta: float
+                    Exponent for dot-product SOAP kernel (default 4.0).
+                - average: bool
+                    Whether to average the SOAP vectors (default True).
+                - species: bool
+                    Whether to consider species information (default True).
+            - kt: float
+                Temperature in eV for Boltzmann weighting (default 0.3).
+            - frac_of_bcur: float
+                Fraction of Boltzmann CUR selections (default 0.8).
+            - bolt_max_num: int
+                Maximum number of Boltzmann selections (default 3000).
+            - kernel_exp: float
+                Exponent for the kernel (default 4.0).
+            - energy_label: str
+                Label for the energy data (default 'energy').
+        - random_seed: int | None
+            A seed to ensure reproducibility of CUR selection. Default is None.
+        - include_isolated_atom: bool
+            If true, perform single-point calculations for isolated atoms.
+        - isolatedatom_box: list[float]
+            List of the lattice constants for an isolated atom configuration.
+        - e0_spin: bool
+            If true, include spin polarization in isolated atom and dimer calculations. Default is False.
+        - include_dimer: bool
+            If true, perform single-point calculations for dimers only once. Default is False.
+        - dimer_box: list[float]
+            The lattice constants of a dimer box.
+        - dimer_range: list[float]
+            Range of distances for dimer calculations.
+        - dimer_num: int
+            Number of different distances to consider for dimer calculations. Default is 21.
+        - custom_incar: dict | None
+            Dictionary of custom VASP input parameters. If provided, will update the
+            default parameters. Default is None.
+        - custom_potcar: dict | None
+            Dictionary of POTCAR settings to update. Keys are element symbols, values are the desired POTCAR labels.
+            Default is None.
+        - vasp_ref_file: str
+            Reference file for VASP data. Default is 'vasp_ref.extxyz'.
+        - config_types: list[str]
+            Configuration types for the VASP calculations. Default is None.
+        - rss_group: list[str]
+            Group name for RSS, used for setting up regularization.
+        - test_ratio: float
+            The proportion of the test set after splitting the data. The value is allowed to be set to 0;
+            in this case, the testing error would not be meaningful anymore.
+        - regularization: bool
+            If True, apply regularization. This only works for GAP to date. Default is False.
+        - scheme: str
+            Method to use for regularization. Options are:
+            - 'linear_hull': for single-composition system, use 2D convex hull (E, V)
+            - 'volume-stoichiometry': for multi-composition system, use 3D convex hull of (E, V, mole-fraction)
+        - reg_minmax: list[tuple]
+            List of tuples of (min, max) values for energy, force, virial sigmas for regularization.
+        - distillation: bool
+            If true, apply data distillation. Default is True.
+        - force_max: float | None
+            Maximum force value to exclude structures. Default is 50.
+        - force_label: str | None
+            The label of force values to use for distillation. Default is 'REF_forces'.
+        - pre_database_dir: str | None
+            Directory where the previous database was saved.
+        - mlip_type: str
+            Choose one specific MLIP type to be fitted: 'GAP' | 'J-ACE' | 'P-ACE' | 'NEQUIP' | 'M3GNET' | 'MACE'.
+            Default is 'GAP'.
+        - ref_energy_name: str
+            Reference energy name. Default is 'REF_energy'.
+        - ref_force_name: str
+            Reference force name. Default is 'REF_forces'.
+        - ref_virial_name: str
+            Reference virial name. Default is 'REF_virial'.
+        - auto_delta: bool
+            If true, apply automatic determination of delta for GAP terms. Default is False.
+        - num_processes_fit: int
+            Number of processes used for fitting. Default is 1.
+        - device_for_fitting: str
+            Device to be used for model fitting, either "cpu" or "cuda".
+        - **fit_kwargs:
+            Additional keyword arguments for the MLIP fitting process.
+        - scalar_pressure_method: str
+            Method for adding external pressures.
+            Acceptable options are:
+            - 'exp': Applies pressure using an exponential distribution.
+            - 'uniform': Applies pressure using a uniform distribution.
+        - scalar_exp_pressure: float
+            Scalar exponential pressure. Default is 100.
+        - scalar_pressure_exponential_width: float
+            Width for scalar pressure exponential. Default is 0.2.
+        - scalar_pressure_low: float
+            Low limit for scalar pressure. Default is 0.
+        - scalar_pressure_high: float
+            High limit for scalar pressure. Default is 50.
+        - max_steps: int
+            Maximum number of steps for relaxation. Default is 200.
+        - force_tol: float
+            Force residual tolerance for relaxation. Default is 0.05.
+        - stress_tol: float
+            Stress residual tolerance for relaxation. Default is 0.05.
+        - hookean_repul: bool
+            If true, apply Hookean repulsion. Default is False.
+        - hookean_paras: dict[tuple[int, int], tuple[float, float]] | None
+            Parameters for Hookean repulsion as a dictionary of tuples. Default is None.
+        - keep_symmetry: bool
+            If true, preserve symmetry during relaxation. Default is False.
+        - write_traj: bool
+            If true, write trajectory of RSS. Default is True.
+        - num_processes_rss: int
+            Number of processes used for running RSS. Default is 1.
+        - device_for_rss: str
+            Specify device to use "cuda" or "cpu" for running RSS. Default is "cpu".
+        - stop_criterion: float
+            Convergence criterion for stopping RSS iterations. Default is 0.01.
+        - max_iteration_number: int
+            Maximum number of RSS iterations to perform. Default is 25.
+        - num_groups: int
+            Number of structure groups, used for assigning tasks across multiple nodes.
+            For example, if there are 10,000 trajectories to relax and 'num_groups=10',
+            the trajectories will be divided into 10 groups and 10 independent jobs will be created,
+            with each job handling 1,000 trajectories.
+        - initial_kt: float
+            Initial temperature (in eV) for Boltzmann sampling. Default is 0.3.
+        - current_iter_index: int
+            Index for the current RSS iteration. Default is 1.
 
         Output
         ------
diff --git a/autoplex/data/common/flows.py b/autoplex/data/common/flows.py
index 48212169..e5550cc1 100644
--- a/autoplex/data/common/flows.py
+++ b/autoplex/data/common/flows.py
@@ -17,7 +17,10 @@
     GAPRelaxMaker,
 )
 from atomate2.vasp.jobs.core import StaticMaker
-from atomate2.vasp.powerups import update_user_incar_settings
+from atomate2.vasp.powerups import (
+    update_user_incar_settings,
+    update_user_potcar_settings,
+)
 from atomate2.vasp.sets.core import StaticSetGenerator
 from custodian.vasp.handlers import (
     FrozenJobErrorHandler,
@@ -41,9 +44,12 @@
     get_supercell_job,
     plot_force_distribution,
 )
-from autoplex.data.common.utils import ElementCollection
+from autoplex.data.common.utils import (
+    ElementCollection,
+    flatten,
+)
 
-__all__ = ["GenerateTrainingDataForTesting", "DFTStaticMaker"]
+__all__ = ["GenerateTrainingDataForTesting", "DFTStaticLabelling"]
 
 logging.basicConfig(level=logging.DEBUG, format="[%(levelname)s] %(message)s")
 
diff --git a/autoplex/data/common/utils.py b/autoplex/data/common/utils.py
index 887e9530..f3b51964 100644
--- a/autoplex/data/common/utils.py
+++ b/autoplex/data/common/utils.py
@@ -79,15 +79,16 @@ def flatten(atoms_object: Atoms | Iterable, recursive: bool = False) -> list[Ato
 def rms_dict(x_ref: np.ndarray | list, x_pred: np.ndarray | list) -> dict:
     """Compute RMSE and standard deviation of predictions with reference data.
 
-    x_ref and x_pred should be of same shape.
+    Adapted and adjusted from libatoms GAP tutorial page
+    https://libatoms.github.io/GAP/gap_fitting_tutorial.html#make-simple-plots-of-the-energies-and-forces-on-the-EMT-and-GAP-datas
 
     Parameters
     ----------
-    ----------1·
-    x_ref : np.ndarray.
+    x_ref: np.ndarray.
         list of reference data.
     x_pred: np.ndarray.
         list of prediction.
+    Note that x_ref and x_pred should be of the same shape.
 
     Returns
     -------
diff --git a/autoplex/data/rss/flows.py b/autoplex/data/rss/flows.py
index 1ea06f8d..93c53909 100644
--- a/autoplex/data/rss/flows.py
+++ b/autoplex/data/rss/flows.py
@@ -108,4 +108,3 @@ def make(self):
             replace=Flow(job_list),
             output=final_structures,
         )
-    
\ No newline at end of file
diff --git a/autoplex/fitting/common/utils.py b/autoplex/fitting/common/utils.py
index a115b584..c5ffb089 100644
--- a/autoplex/fitting/common/utils.py
+++ b/autoplex/fitting/common/utils.py
@@ -53,6 +53,7 @@
 
 current_dir = Path(__file__).absolute().parent
 GAP_DEFAULTS_FILE_PATH = current_dir / "gap-defaults.json"
+MLIP_PHONON_DEFAULTS_FILE_PATH = current_dir / "gap-defaults.json"
 MLIP_DEFAULTS_FILE_PATH = current_dir / "mlip-defaults.json"
 
 
@@ -276,8 +277,8 @@ def jace_fitting(
         Additional keyword arguments for ACE fitting with keys same as
         those in mlip-defaults.json.
 
-    Tunable hyperparameters
-    -----------------------
+    Keyword Arguments
+    -----------------
     order: int
         Order of ACE.
     totaldegree: int
@@ -459,8 +460,8 @@ def nequip_fitting(
         Additional keyword arguments for NequIP fitting with keys same as
         those in mlip-defaults.json.
 
-    Tunable hyperparameters
-    -----------------------
+    Keyword Arguments
+    -----------------
     r_max: float
         Cutoff radius in length units
     num_layers: int
@@ -734,8 +735,8 @@ def m3gnet_fitting(
         Additional keyword arguments for M3GNet fitting with keys same as
         those in mlip-defaults.json.
 
-    Tunable hyperparameters
-    -----------------------
+    Keyword Arguments
+    -----------------
     exp_name: str
         Name of the experiment, used for saving model checkpoints and logs.
     results_dir: str
@@ -1096,8 +1097,8 @@ def mace_fitting(
         Additional keyword arguments for MACE fitting with keys same as
         those in mlip-defaults.json.
 
-    Tunable hyperparameters
-    -----------------------
+    Keyword Arguments
+    -----------------
     model: str
         type of model to be trained.
     config_type_weights: str