Source code for pyiron_contrib.atomistics.atomistics.job.trainingcontainer

# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

"""
Store structures together with energies and forces for potential fitting applications.

Basic usage:

>>> pr = Project("training")
>>> container = pr.create.job.TrainingContainer("small_structures")

Let's make a structure and invent some forces

>>> structure = pr.create.structure.ase_bulk("Fe")
>>> forces = numpy.array([-1, 1, -1])
>>> container.add_structure(structure, energy=-1.234, forces=forces, identifier="Fe_bcc")

If you have a lot of precomputed structures you may also add them in bulk from a pandas DataFrame

>>> df = pandas.DataFrame({ "name": "Fe_bcc", "atoms": structure, "energy": -1.234, "forces": forces })
>>> container.include_dataset(df)

You can retrieve the full database with :method:`~.TrainingContainer.to_pandas()` like this

>>> container.to_pandas()
name    atoms   energy  forces  number_of_atoms
Fe_bcc  ...
"""

from typing import Callable, Dict, Any, Optional

from warnings import catch_warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ase.atoms import Atoms as ASEAtoms

from pyiron_contrib.atomistics.atomistics.job.structurestorage import StructureStorage
from pyiron_atomistics.atomistics.structure.atoms import Atoms
from pyiron_atomistics.atomistics.structure.has_structure import HasStructure
from pyiron_atomistics.atomistics.structure.neighbors import NeighborsTrajectory
from pyiron_base import GenericJob, DataContainer, deprecate


[docs]class TrainingContainer(GenericJob, HasStructure):
    """
    Stores ASE structures with energies and forces.
    """

    def __init__(self, project, job_name):
        super().__init__(project=project, job_name=job_name)
        self.__name__ = "TrainingContainer"
        self.__hdf_version__ = "0.3.0"
        self._container = TrainingStorage()

        self.input = DataContainer(
            {"save_neighbors": True, "num_neighbors": 12},
            table_name="parameters"
        )

[docs]    def include_job(self, job, iteration_step=-1):
        """
        Add structure, energy and forces from job.

        Args:
            job (:class:`.AtomisticGenericJob`): job to take structure from
            iteration_step (int, optional): if job has multiple steps, this
            selects which to add
        """
        self._container.include_job(job, iteration_step)

[docs]    def include_structure(
        self,
        structure,
        energy=None,
        name=None,
        **properties
    ):
        """
        Add new structure to structure list and save energy and forces with it.

        For consistency with the rest of pyiron, energy should be in units of eV
        and forces in eV/A, but no conversion is performed.

        Args:
            structure_or_job (:class:`~.Atoms`): structure to add
            energy (float): energy of the whole structure
            forces (Nx3 array of float, optional): per atom forces, where N is
                the number of atoms in the structure
            stress (6 array of float, optional): per structure stresses in voigt
                notation
            name (str, optional): name describing the structure
        """
        self._container.include_structure(structure, name=name, energy=energy,
                                          **properties)

[docs]    def add_structure(
        self, structure, energy, forces=None, stress=None, identifier=None, **arrays
    ):
        """
        Add new structure to structure list and save energy and forces with it.

        For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion
        is performed.

        Args:
            structure_or_job (:class:`~.Atoms`): structure to add
            energy (float): energy of the whole structure
            forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure
            stress (6 array of float, optional): per structure stresses in voigt notation
            name (str, optional): name describing the structure
        """
        self._container.add_structure(
            structure, energy, identifier=identifier, forces=forces, stress=stress, **arrays
        )

[docs]    def include_dataset(self, dataset):
        """
        Add a pandas DataFrame to the saved structures.

        The dataframe should have the following columns:
            - name: human readable name of the structure
            - atoms(:class:`ase.Atoms`): the atomic structure
            - energy(float): energy of the whole structure
            - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure
            - stress (6 array of float): per structure stress in voigt notation
        """
        self._container.include_dataset(dataset)

    def _get_structure(self, frame=-1, wrap_atoms=True):
        return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms)

    def _number_of_structures(self):
        return self._container.number_of_structures

[docs]    def get_neighbors(self, num_neighbors=None):
        """
        Calculate and add neighbor information in each structure.

        If input.save_neighbors is True the data is automatically added to the internal storage and will be saved
        together with the normal structure data.

        Args:
            num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input

        Returns:
            NeighborsTrajectory: neighbor information
        """
        if num_neighbors is None:
            num_neighbors = self.input.num_neighbors
        n = NeighborsTrajectory(
            has_structure=self,
            store=self._container if self.input.save_neighbors else None,
            num_neighbors=num_neighbors,
        )
        n.compute_neighbors()
        return n

[docs]    def get_elements(self):
        """
        Return a list of chemical elements in the training set.

        Returns:
            :class:`list`: list of unique elements in the training set as strings of their standard abbreviations
        """
        return self._container.get_elements()

[docs]    def to_pandas(self):
        """
        Export list of structure to pandas table for external fitting codes.

        The table contains the following columns:
            - 'name': human-readable name of the structure
            - 'ase_atoms': the structure as a :class:`.Atoms` object
            - 'energy': the energy of the full structure
            - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3
            - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6
            - 'number_of_atoms': the number of atoms in the structure, N

        Returns:
            :class:`pandas.DataFrame`: collected structures
        """
        return self._container.to_pandas()

[docs]    def to_list(self, filter_function=None):
        """
        Returns the data as lists of pyiron structures, energies, forces, and the number of atoms

        Args:
            filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it

        Returns:
            tuple: list of structures, energies, forces, and the number of atoms
        """
        return self._container.to_list(filter_function)

[docs]    def to_dict(self):
        return self._container.to_dict()

[docs]    def write_input(self):
        pass

[docs]    def collect_output(self):
        pass

[docs]    def run_static(self):
        if self.input.save_neighbors:
            self.get_neighbors()
        self.status.finished = True

[docs]    def run_if_interactive(self):
        if self.input.save_neighbors:
            self.get_neighbors()
        self.to_hdf()
        self.status.finished = True

[docs]    def to_hdf(self, hdf=None, group_name=None):
        super().to_hdf(hdf=hdf, group_name=group_name)
        self.input.to_hdf(self.project_hdf5)
        self._container.to_hdf(self.project_hdf5, "structures")

[docs]    def from_hdf(self, hdf=None, group_name=None):
        super().from_hdf(hdf=hdf, group_name=group_name)
        hdf_version = self.project_hdf5.get("HDF_VERSION", "0.1.0")
        if hdf_version == "0.1.0":
            table = pd.read_hdf(
                self.project_hdf5.file_name, self.name + "/output/structure_table"
            )
            self.include_dataset(table)
        else:
            self._container = TrainingStorage()
            self._container.from_hdf(self.project_hdf5, "structures")
            if hdf_version == "0.3.0":
                self.input.from_hdf(self.project_hdf5, "parameters")

[docs]    def sample(
        self, name: str, selector: Callable[[StructureStorage, int], bool],
        delete_existing_job: bool = False
    ) -> "TrainingContainer":
        """
        Create a new TrainingContainer with structures filtered by selector.

        `self` must have status `finished`.  `selector` is passed the underlying :class:`StructureStorage` of this
        container and the index of the structure and return a boolean whether to include the structure in the new
        container or not.  The new container is saved and run.

        Args:
            name (str): name of the new TrainingContainer
            selector (Callable[[StructureStorage, int], bool]): callable that selects structure to include
            delete_existing_job (bool): if job with name exist, remove it first

        Returns:
            :class:`.TrainingContainer`: new container with selected structures

        Raises:
            ValueError: if a job with the given `name` already exists.
        """
        if not self.status.finished:
            raise ValueError(f"Job must be finished, not '{self.status}'!")
        cont = self.project.create.job.TrainingContainer(name, delete_existing_job=delete_existing_job)
        if not cont.status.initialized:
            raise ValueError(f"Job '{name}' already exists with status: {cont.status}!")
        cont._container = self._container.sample(selector)
        cont.run()
        return cont

    @property
    def plot(self):
        """
        :class:`.TrainingPlots`: plotting interface
        """
        return TrainingPlots(self._container)

[docs]    def iter(self, *arrays, wrap_atoms=True):
        """
        Iterate over all structures in this object and all arrays that are defined

        Args:
            wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to
                               :meth:`.get_structure()`
            *arrays (str): name of arrays that should be iterated over

        Yields:
            :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays
        """
        yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms)


[docs]class TrainingPlots:
    """
    Simple interface to plot various properties of the structures inside the given :class:`.TrainingContainer`.
    """

    __slots__ = "_train"

    def __init__(self, train):
        self._train = train

    def _calc_spacegroups(self, symprec=1e-3):
        """
        Calculate space groups of all structures.

        Args:
            symprec (float): symmetry precision given to spglib

        Returns:
            DataFrame: contains columns 'crystal_system' (str) and 'space_group' (int) for each structure
        """

        def get_crystal_system(num):
            if num in range(1, 3):
                return "triclinic"
            elif num in range(3, 16):
                return "monoclinic"
            elif num in range(16, 75):
                return "orthorombic"
            elif num in range(75, 143):
                return "trigonal"
            elif num in range(143, 168):
                return "tetragonal"
            elif num in range(168, 195):
                return "hexagonal"
            elif num in range(195, 230):
                return "cubic"

        def extract(s):
            spg = s.get_symmetry(symprec=symprec).spacegroup["Number"]
            return {"space_group": spg, "crystal_system": get_crystal_system(spg)}

        return pd.DataFrame(map(extract, self._train.iter_structures()))

[docs]    def cell(self, angle_in_degrees=True):
        """
        Plot histograms of cell parameters.

        Plotted are atomic volume, density, cell vector lengths and cell vector angles in separate subplots all on a
        log-scale.

        Args:
            angle_in_degrees (bool): whether unit for angles is degree or radians

        Returns:
            `DataFrame`: contains the plotted information in the columns:
                            - a: length of first vector
                            - b: length of second vector
                            - c: length of third vector
                            - alpha: angle between first and second vector
                            - beta: angle between second and third vector
                            - gamma: angle between third and first vector
                            - V: volume of the cell
                            - N: number of atoms in the cell
        """
        N = self._train.get_array("length")
        C = self._train.get_array("cell")

        def get_angle(cell, idx=0):
            return np.arccos(
                np.dot(cell[idx], cell[(idx + 1) % 3])
                / np.linalg.norm(cell[idx])
                / np.linalg.norm(cell[(idx + 1) % 3])
            )

        def extract(n, c):
            return {
                "a": np.linalg.norm(c[0]),
                "b": np.linalg.norm(c[1]),
                "c": np.linalg.norm(c[2]),
                "alpha": get_angle(c, 0),
                "beta": get_angle(c, 1),
                "gamma": get_angle(c, 2),
            }

        df = pd.DataFrame([extract(n, c) for n, c in zip(N, C)])
        df["V"] = np.linalg.det(C)
        df["N"] = N
        if angle_in_degrees:
            df["alpha"] = np.rad2deg(df["alpha"])
            df["beta"] = np.rad2deg(df["beta"])
            df["gamma"] = np.rad2deg(df["gamma"])

        plt.subplot(1, 4, 1)
        plt.title("Atomic Volume")
        plt.hist(df.V / df.N, bins=20, log=True)
        plt.xlabel(r"$V$ [$\AA^3$]")

        plt.subplot(1, 4, 2)
        plt.title("Density")
        plt.hist(df.N / df.V, bins=20, log=True)
        plt.xlabel(r"$\rho$ [$\AA^{-3}$]")

        plt.subplot(1, 4, 3)
        plt.title("Lattice Vector Lengths")
        plt.hist([df.a, df.b, df.c], log=True)
        plt.xlabel(r"$a,b,c$ [$\AA$]")

        plt.subplot(1, 4, 4)
        plt.title("Lattice Vector Angles")
        plt.hist([df.alpha, df.beta, df.gamma], log=True)
        if angle_in_degrees:
            label = r"$\alpha,\beta,\gamma$ [°]"
        else:
            label = r"$\alpha,\beta,\gamma$ [rad]"
        plt.xlabel(label)

        return df

[docs]    def spacegroups(self, symprec=1e-3):
        """
        Plot histograms of space groups and crystal systems.

        Spacegroups and crystal systems are plotted in separate subplots.

        Args:
            symprec (float): precision of the symmetry search (passed to spglib)

        Returns:
            DataFrame: contains two columns "space_group", "crystal_system"
                       for each structure in `train`
        """

        df = self._calc_spacegroups(symprec=symprec)
        plt.subplot(1, 2, 1)
        plt.hist(df.space_group, bins=230)
        plt.xlabel("Space Group")

        plt.subplot(1, 2, 2)
        l, h = np.unique(df.crystal_system, return_counts=True)
        sort_key = {
            "triclinic": 1,
            "monoclinic": 3,
            "orthorombic": 16,
            "trigonal": 75,
            "tetragonal": 143,
            "hexagonal": 168,
            "cubic": 195,
        }
        I = np.argsort([sort_key[ll] for ll in l])
        plt.bar(l[I], h[I])
        plt.xlabel("Crystal System")
        plt.xticks(rotation=35)
        return df

[docs]    def energy_volume(self, crystal_systems=False):
        """
        Plot volume vs. energy.

        Volume and energy are normalized per atom before plotting.

        Args:
            crystal_systems (bool): if True, plot & label structures of different crystal systems separately.

        Returns:
            DataFrame: contains atomic energy and volumes in the columns 'E' and 'V'; if `crystal_systems` is given,
                       also contain space groups and crystal systems of each structure
        """

        N = self._train.get_array("length")
        E = self._train.get_array("energy") / N
        C = self._train.get_array("cell")
        V = np.linalg.det(C) / N

        df = pd.DataFrame({"V": V, "E": E})

        if crystal_systems:
            spg = self._calc_spacegroups()
            df = df.join(spg)
            for cs, dd in df.groupby("crystal_system"):
                plt.scatter(dd.V, dd.E, label=cs)
            plt.legend()
        else:
            plt.scatter(df.V, df.E)
        plt.xlabel(r"Atomic Volume [$\AA^3$]")
        plt.ylabel(r"Atomic Energy [eV]")

        return df

[docs]    def coordination(self, num_shells=4, log=True):
        """
        Plot histogram of coordination in neighbor shells.

        Computes one histogram of the number of neighbors in each neighbor shell up to `num_shells` and then plots them
        together.

        Args:
            num_shells (int): maximum shell to plot
            log (float): plot histogram values on a log scale
        """
        if not self._train.has_array("shells"):
            raise ValueError(
                "TrainingContainer contains no neighbor information, call TrainingContainer.get_neighbors first!"
            )
        shells = self._train.get_array("shells")
        shell_index = (
            shells[np.newaxis, :, :]
            == np.arange(1, num_shells + 1)[:, np.newaxis, np.newaxis]
        )
        neigh_count = shell_index.sum(axis=-1)
        ticks = np.arange(neigh_count.min(), neigh_count.max() + 1)
        plt.hist(
            neigh_count.T,
            bins=ticks - 0.5,
            log=True,
            label=[f"{i}." for i in range(1, num_shells + 1)],
        )
        plt.xticks(ticks)
        plt.xlabel("Number of Neighbors")
        plt.legend(title="Shell")
        plt.title("Neighbor Coordination in Shells")

[docs]    def shell_distances(self, num_shells=4):
        """
        Plot a violin plot of the neighbor distances in shells up to `num_shells`.

        Args:
            num_shells (int): maximum shell to plot
        """
        if not self._train.has_array("shells") or not self._train.has_array(
            "distances"
        ):
            raise ValueError(
                "TrainingContainer contains no neighbor information, call TrainingContainer.get_neighbors first!"
            )
        dists = self._train.get_array("distances")
        R = dists.flatten()
        shells = self._train.get_array("shells")
        S = shells.ravel()
        d = pd.DataFrame(
            {"distance": R[S < num_shells + 1], "shells": S[S < num_shells + 1]}
        )
        sns.violinplot(y=d.shells, x=d.distance, scale="width", orient="h")
        plt.xlabel(r"Distance [$\AA$]")
        plt.ylabel("Shell")

[docs]    def forces(self, axis: Optional[int] = None):
        """
        Plot a histogram of all forces.

        Args:
            axis (int, optional): plot only forces along this axis, if not given plot all forces
        """
        f = self._train.get_array("forces")
        if axis is not None:
            f = f[:, axis]
        else:
            f = f.ravel()
        plt.hist(f, bins=20)
        plt.xlabel(r"Force [eV/$\mathrm{\AA}$]")


[docs]class TrainingStorage(StructureStorage):
    def __init__(self):
        super().__init__()
        self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan)
        self._table_cache = None
        self.to_pandas()

[docs]    def to_pandas(self):
        """
        Export list of structure to pandas table for external fitting codes.

        The table contains the following columns:
            - 'name': human-readable name of the structure
            - 'ase_atoms': the structure as a :class:`.Atoms` object
            - 'energy': the energy of the full structure
            - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3
            - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6
            - 'number_of_atoms': the number of atoms in the structure, N

        Returns:
            :class:`pandas.DataFrame`: collected structures
        """
        if self._table_cache is None or len(self._table_cache) != len(self):
            self._table_cache = pd.DataFrame(
                {
                    "name": [self.get_array("identifier", i) for i in range(len(self))],
                    "atoms": [self.get_structure(i) for i in range(len(self))],
                    "energy": [self.get_array("energy", i) for i in range(len(self))],
                }
            )
            if self.has_array("forces"):
                self._table_cache["forces"] = [self.get_array("forces", i) for i in range(len(self))]
            if self.has_array("stress"):
                self._table_cache["stress"] = [self.get_array("stress", i) for i in range(len(self))]
            self._table_cache["number_of_atoms"] = [
                len(s) for s in self._table_cache.atoms
            ]
        return self._table_cache

[docs]    def include_job(self, job, iteration_step=-1):
        """
        Add structure, energy and forces from job.

        Args:
            job (:class:`.AtomisticGenericJob`): job to take structure from
            iteration_step (int, optional): if job has multiple steps, this selects which to add
        """
        energy = job.output.energy_pot[iteration_step]
        ff = job.output.forces
        if ff is not None:
            forces = ff[iteration_step]
        else:
            forces = None
        # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in
        # output/generic/stresses
        pp = job["output/generic/stresses"]
        if pp is None:
            pp = job.output.pressures
        if pp is not None:
            stress = pp[iteration_step]
        else:
            stress = None
        if stress is not None:
            stress = np.asarray(stress)
            if stress.shape == (3, 3):
                stress = np.array(
                    [
                        stress[0, 0],
                        stress[1, 1],
                        stress[2, 2],
                        stress[1, 2],
                        stress[0, 2],
                        stress[0, 1],
                    ]
                )
        self.include_structure(
            job.get_structure(iteration_step=iteration_step),
            energy=energy,
            forces=forces,
            stress=stress,
            name=job.name,
        )

[docs]    @deprecate("Use add_structure instead")
    def include_structure(
        self,
        structure,
        energy,
        name=None,
        **properties
    ):
        """
        Add new structure to structure list and save energy and forces with it.

        For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion
        is performed.

        Args:
            structure_or_job (:class:`~.Atoms`): structure to add
            energy (float): energy of the whole structure
            forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure
            stress (6 array of float, optional): per structure stresses in voigt notation
            name (str, optional): name describing the structure
        """
        self.add_structure(structure, identifier=name, energy=energy,
                           **properties)

[docs]    def add_structure(
        self,
        structure: Atoms,
        energy,
        identifier=None,
        **arrays
    ) -> None:
        if "forces" in arrays and not self.has_array("forces"):
            self.add_array("forces", shape=(3,), dtype=np.float64, per="element",
                           fill=np.nan)
        if "stress" in arrays and not self.has_array("stress"):
            # save stress in voigt notation
            self.add_array("stress", shape=(6,), dtype=np.float64, per="chunk",
                           fill=np.nan)
        super().add_structure(structure, identifier=identifier, energy=energy,
                              **arrays)

[docs]    def include_dataset(self, dataset):
        """
        Add a pandas DataFrame to the saved structures.

        The dataframe should have the following columns:
            - name: human readable name of the structure
            - atoms(:class:`ase.Atoms`): the atomic structure
            - energy(float): energy of the whole structure
            - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure
            - charges (Nx3 array of floats):
            - stress (6 array of float): per structure stress in voigt notation
        """
        if (
            "name" not in dataset.columns
            or "atoms" not in dataset.columns
            or "energy" not in dataset.columns
        ):
            raise ValueError(
                "At least columns 'name', 'atoms' and 'energy' must be present in dataset!"
            )
        for row in dataset.itertuples(index=False):
            kwargs = {}
            if hasattr(row, "forces"):
                kwargs["forces"] = row.forces
            if hasattr(row, "stress"):
                kwargs["stress"] = row.stress
            self.add_structure(
                row.atoms, energy=row.energy, identifier=row.name, **kwargs
            )

[docs]    def to_list(self, filter_function=None):
        """
        Returns the data as lists of pyiron structures, energies, forces, and the number of atoms

        Args:
            filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it

        Returns:
            tuple: list of structures, energies, forces, and the number of atoms
        """
        data_table = self.to_pandas()
        if filter_function is not None:
            data_table = filter_function(data_table)
        structure_list = data_table.atoms.to_list()
        energy_list = data_table.energy.to_list()
        if "forces" not in data_table.columns:
            raise ValueError("no forces defined in storage; call to_dict() instead.")
        force_list = data_table.forces.to_list()
        num_atoms_list = data_table.number_of_atoms.to_list()

        return (structure_list, energy_list, force_list, num_atoms_list)

[docs]    def to_dict(self) -> Dict[str, Any]:
        """Return a dictionary of all structures and training properties."""
        dict_arrays = {}

        # Get structure information.
        dict_arrays['structure'] = list(self.iter_structures())

        # Some arrays are only for internal usage or structure information that
        # was already saved in dict['structure'].
        internal_arrays = ['start_index', 'length', 'cell', 'pbc', 'positions',
                           'symbols']
        for array in self.list_arrays():
            # Skip internal arrays.
            if array in internal_arrays:
                continue

            dict_arrays[array] = self.get_array_ragged(array)
        return dict_arrays

[docs]    def iter(self, *arrays, wrap_atoms=True):
        """
        Iterate over all structures in this object and all arrays that are defined

        Args:
            wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to
                               :meth:`.get_structure()`
            *arrays (str): name of arrays that should be iterated over

        Yields:
            :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays
        """
        array_vals = (self.get_array_ragged(a) for a in arrays)
        yield from zip(self.iter_structures(), *array_vals)

    @property
    def plot(self):
        """
        :class:`.TrainingPlots`: plotting interface
        """
        return TrainingPlots(self)