Source code for pyiron_contrib.atomistics.atomistics.job.trainingcontainer

# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

"""
Store structures together with energies and forces for potential fitting applications.

Basic usage:

>>> pr = Project("training")
>>> container = pr.create.job.TrainingContainer("small_structures")

Let's make a structure and invent some forces

>>> structure = pr.create.structure.ase_bulk("Fe")
>>> forces = numpy.array([-1, 1, -1])
>>> container.add_structure(structure, energy=-1.234, forces=forces, identifier="Fe_bcc")

If you have a lot of precomputed structures you may also add them in bulk from a pandas DataFrame

>>> df = pandas.DataFrame({ "name": "Fe_bcc", "atoms": structure, "energy": -1.234, "forces": forces })
>>> container.include_dataset(df)

You can retrieve the full database with :method:`~.TrainingContainer.to_pandas()` like this

>>> container.to_pandas()
name    atoms   energy  forces  number_of_atoms
Fe_bcc  ...
"""

from typing import Callable, Dict, Any, Optional

from warnings import catch_warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ase.atoms import Atoms as ASEAtoms

from pyiron_contrib.atomistics.atomistics.job.structurestorage import StructureStorage
from pyiron_atomistics.atomistics.structure.atoms import Atoms
from pyiron_atomistics.atomistics.structure.has_structure import HasStructure
from pyiron_atomistics.atomistics.structure.neighbors import NeighborsTrajectory
from pyiron_base import GenericJob, DataContainer, deprecate


[docs]class TrainingContainer(GenericJob, HasStructure): """ Stores ASE structures with energies and forces. """ def __init__(self, project, job_name): super().__init__(project=project, job_name=job_name) self.__name__ = "TrainingContainer" self.__hdf_version__ = "0.3.0" self._container = TrainingStorage() self.input = DataContainer( {"save_neighbors": True, "num_neighbors": 12}, table_name="parameters" )
[docs] def include_job(self, job, iteration_step=-1): """ Add structure, energy and forces from job. Args: job (:class:`.AtomisticGenericJob`): job to take structure from iteration_step (int, optional): if job has multiple steps, this selects which to add """ self._container.include_job(job, iteration_step)
[docs] def include_structure( self, structure, energy=None, name=None, **properties ): """ Add new structure to structure list and save energy and forces with it. For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion is performed. Args: structure_or_job (:class:`~.Atoms`): structure to add energy (float): energy of the whole structure forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure stress (6 array of float, optional): per structure stresses in voigt notation name (str, optional): name describing the structure """ self._container.include_structure(structure, name=name, energy=energy, **properties)
[docs] def add_structure( self, structure, energy, forces=None, stress=None, identifier=None, **arrays ): """ Add new structure to structure list and save energy and forces with it. For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion is performed. Args: structure_or_job (:class:`~.Atoms`): structure to add energy (float): energy of the whole structure forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure stress (6 array of float, optional): per structure stresses in voigt notation name (str, optional): name describing the structure """ self._container.add_structure( structure, energy, identifier=identifier, forces=forces, stress=stress, **arrays )
[docs] def include_dataset(self, dataset): """ Add a pandas DataFrame to the saved structures. The dataframe should have the following columns: - name: human readable name of the structure - atoms(:class:`ase.Atoms`): the atomic structure - energy(float): energy of the whole structure - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure - stress (6 array of float): per structure stress in voigt notation """ self._container.include_dataset(dataset)
def _get_structure(self, frame=-1, wrap_atoms=True): return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms) def _number_of_structures(self): return self._container.number_of_structures
[docs] def get_neighbors(self, num_neighbors=None): """ Calculate and add neighbor information in each structure. If input.save_neighbors is True the data is automatically added to the internal storage and will be saved together with the normal structure data. Args: num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input Returns: NeighborsTrajectory: neighbor information """ if num_neighbors is None: num_neighbors = self.input.num_neighbors n = NeighborsTrajectory( has_structure=self, store=self._container if self.input.save_neighbors else None, num_neighbors=num_neighbors, ) n.compute_neighbors() return n
[docs] def get_elements(self): """ Return a list of chemical elements in the training set. Returns: :class:`list`: list of unique elements in the training set as strings of their standard abbreviations """ return self._container.get_elements()
[docs] def to_pandas(self): """ Export list of structure to pandas table for external fitting codes. The table contains the following columns: - 'name': human-readable name of the structure - 'ase_atoms': the structure as a :class:`.Atoms` object - 'energy': the energy of the full structure - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 - 'number_of_atoms': the number of atoms in the structure, N Returns: :class:`pandas.DataFrame`: collected structures """ return self._container.to_pandas()
[docs] def to_list(self, filter_function=None): """ Returns the data as lists of pyiron structures, energies, forces, and the number of atoms Args: filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it Returns: tuple: list of structures, energies, forces, and the number of atoms """ return self._container.to_list(filter_function)
[docs] def to_dict(self): return self._container.to_dict()
[docs] def write_input(self): pass
[docs] def collect_output(self): pass
[docs] def run_static(self): if self.input.save_neighbors: self.get_neighbors() self.status.finished = True
[docs] def run_if_interactive(self): if self.input.save_neighbors: self.get_neighbors() self.to_hdf() self.status.finished = True
[docs] def to_hdf(self, hdf=None, group_name=None): super().to_hdf(hdf=hdf, group_name=group_name) self.input.to_hdf(self.project_hdf5) self._container.to_hdf(self.project_hdf5, "structures")
[docs] def from_hdf(self, hdf=None, group_name=None): super().from_hdf(hdf=hdf, group_name=group_name) hdf_version = self.project_hdf5.get("HDF_VERSION", "0.1.0") if hdf_version == "0.1.0": table = pd.read_hdf( self.project_hdf5.file_name, self.name + "/output/structure_table" ) self.include_dataset(table) else: self._container = TrainingStorage() self._container.from_hdf(self.project_hdf5, "structures") if hdf_version == "0.3.0": self.input.from_hdf(self.project_hdf5, "parameters")
[docs] def sample( self, name: str, selector: Callable[[StructureStorage, int], bool], delete_existing_job: bool = False ) -> "TrainingContainer": """ Create a new TrainingContainer with structures filtered by selector. `self` must have status `finished`. `selector` is passed the underlying :class:`StructureStorage` of this container and the index of the structure and return a boolean whether to include the structure in the new container or not. The new container is saved and run. Args: name (str): name of the new TrainingContainer selector (Callable[[StructureStorage, int], bool]): callable that selects structure to include delete_existing_job (bool): if job with name exist, remove it first Returns: :class:`.TrainingContainer`: new container with selected structures Raises: ValueError: if a job with the given `name` already exists. """ if not self.status.finished: raise ValueError(f"Job must be finished, not '{self.status}'!") cont = self.project.create.job.TrainingContainer(name, delete_existing_job=delete_existing_job) if not cont.status.initialized: raise ValueError(f"Job '{name}' already exists with status: {cont.status}!") cont._container = self._container.sample(selector) cont.run() return cont
@property def plot(self): """ :class:`.TrainingPlots`: plotting interface """ return TrainingPlots(self._container)
[docs] def iter(self, *arrays, wrap_atoms=True): """ Iterate over all structures in this object and all arrays that are defined Args: wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to :meth:`.get_structure()` *arrays (str): name of arrays that should be iterated over Yields: :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays """ yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms)
[docs]class TrainingPlots: """ Simple interface to plot various properties of the structures inside the given :class:`.TrainingContainer`. """ __slots__ = "_train" def __init__(self, train): self._train = train def _calc_spacegroups(self, symprec=1e-3): """ Calculate space groups of all structures. Args: symprec (float): symmetry precision given to spglib Returns: DataFrame: contains columns 'crystal_system' (str) and 'space_group' (int) for each structure """ def get_crystal_system(num): if num in range(1, 3): return "triclinic" elif num in range(3, 16): return "monoclinic" elif num in range(16, 75): return "orthorombic" elif num in range(75, 143): return "trigonal" elif num in range(143, 168): return "tetragonal" elif num in range(168, 195): return "hexagonal" elif num in range(195, 230): return "cubic" def extract(s): spg = s.get_symmetry(symprec=symprec).spacegroup["Number"] return {"space_group": spg, "crystal_system": get_crystal_system(spg)} return pd.DataFrame(map(extract, self._train.iter_structures()))
[docs] def cell(self, angle_in_degrees=True): """ Plot histograms of cell parameters. Plotted are atomic volume, density, cell vector lengths and cell vector angles in separate subplots all on a log-scale. Args: angle_in_degrees (bool): whether unit for angles is degree or radians Returns: `DataFrame`: contains the plotted information in the columns: - a: length of first vector - b: length of second vector - c: length of third vector - alpha: angle between first and second vector - beta: angle between second and third vector - gamma: angle between third and first vector - V: volume of the cell - N: number of atoms in the cell """ N = self._train.get_array("length") C = self._train.get_array("cell") def get_angle(cell, idx=0): return np.arccos( np.dot(cell[idx], cell[(idx + 1) % 3]) / np.linalg.norm(cell[idx]) / np.linalg.norm(cell[(idx + 1) % 3]) ) def extract(n, c): return { "a": np.linalg.norm(c[0]), "b": np.linalg.norm(c[1]), "c": np.linalg.norm(c[2]), "alpha": get_angle(c, 0), "beta": get_angle(c, 1), "gamma": get_angle(c, 2), } df = pd.DataFrame([extract(n, c) for n, c in zip(N, C)]) df["V"] = np.linalg.det(C) df["N"] = N if angle_in_degrees: df["alpha"] = np.rad2deg(df["alpha"]) df["beta"] = np.rad2deg(df["beta"]) df["gamma"] = np.rad2deg(df["gamma"]) plt.subplot(1, 4, 1) plt.title("Atomic Volume") plt.hist(df.V / df.N, bins=20, log=True) plt.xlabel(r"$V$ [$\AA^3$]") plt.subplot(1, 4, 2) plt.title("Density") plt.hist(df.N / df.V, bins=20, log=True) plt.xlabel(r"$\rho$ [$\AA^{-3}$]") plt.subplot(1, 4, 3) plt.title("Lattice Vector Lengths") plt.hist([df.a, df.b, df.c], log=True) plt.xlabel(r"$a,b,c$ [$\AA$]") plt.subplot(1, 4, 4) plt.title("Lattice Vector Angles") plt.hist([df.alpha, df.beta, df.gamma], log=True) if angle_in_degrees: label = r"$\alpha,\beta,\gamma$ [°]" else: label = r"$\alpha,\beta,\gamma$ [rad]" plt.xlabel(label) return df
[docs] def spacegroups(self, symprec=1e-3): """ Plot histograms of space groups and crystal systems. Spacegroups and crystal systems are plotted in separate subplots. Args: symprec (float): precision of the symmetry search (passed to spglib) Returns: DataFrame: contains two columns "space_group", "crystal_system" for each structure in `train` """ df = self._calc_spacegroups(symprec=symprec) plt.subplot(1, 2, 1) plt.hist(df.space_group, bins=230) plt.xlabel("Space Group") plt.subplot(1, 2, 2) l, h = np.unique(df.crystal_system, return_counts=True) sort_key = { "triclinic": 1, "monoclinic": 3, "orthorombic": 16, "trigonal": 75, "tetragonal": 143, "hexagonal": 168, "cubic": 195, } I = np.argsort([sort_key[ll] for ll in l]) plt.bar(l[I], h[I]) plt.xlabel("Crystal System") plt.xticks(rotation=35) return df
[docs] def energy_volume(self, crystal_systems=False): """ Plot volume vs. energy. Volume and energy are normalized per atom before plotting. Args: crystal_systems (bool): if True, plot & label structures of different crystal systems separately. Returns: DataFrame: contains atomic energy and volumes in the columns 'E' and 'V'; if `crystal_systems` is given, also contain space groups and crystal systems of each structure """ N = self._train.get_array("length") E = self._train.get_array("energy") / N C = self._train.get_array("cell") V = np.linalg.det(C) / N df = pd.DataFrame({"V": V, "E": E}) if crystal_systems: spg = self._calc_spacegroups() df = df.join(spg) for cs, dd in df.groupby("crystal_system"): plt.scatter(dd.V, dd.E, label=cs) plt.legend() else: plt.scatter(df.V, df.E) plt.xlabel(r"Atomic Volume [$\AA^3$]") plt.ylabel(r"Atomic Energy [eV]") return df
[docs] def coordination(self, num_shells=4, log=True): """ Plot histogram of coordination in neighbor shells. Computes one histogram of the number of neighbors in each neighbor shell up to `num_shells` and then plots them together. Args: num_shells (int): maximum shell to plot log (float): plot histogram values on a log scale """ if not self._train.has_array("shells"): raise ValueError( "TrainingContainer contains no neighbor information, call TrainingContainer.get_neighbors first!" ) shells = self._train.get_array("shells") shell_index = ( shells[np.newaxis, :, :] == np.arange(1, num_shells + 1)[:, np.newaxis, np.newaxis] ) neigh_count = shell_index.sum(axis=-1) ticks = np.arange(neigh_count.min(), neigh_count.max() + 1) plt.hist( neigh_count.T, bins=ticks - 0.5, log=True, label=[f"{i}." for i in range(1, num_shells + 1)], ) plt.xticks(ticks) plt.xlabel("Number of Neighbors") plt.legend(title="Shell") plt.title("Neighbor Coordination in Shells")
[docs] def shell_distances(self, num_shells=4): """ Plot a violin plot of the neighbor distances in shells up to `num_shells`. Args: num_shells (int): maximum shell to plot """ if not self._train.has_array("shells") or not self._train.has_array( "distances" ): raise ValueError( "TrainingContainer contains no neighbor information, call TrainingContainer.get_neighbors first!" ) dists = self._train.get_array("distances") R = dists.flatten() shells = self._train.get_array("shells") S = shells.ravel() d = pd.DataFrame( {"distance": R[S < num_shells + 1], "shells": S[S < num_shells + 1]} ) sns.violinplot(y=d.shells, x=d.distance, scale="width", orient="h") plt.xlabel(r"Distance [$\AA$]") plt.ylabel("Shell")
[docs] def forces(self, axis: Optional[int] = None): """ Plot a histogram of all forces. Args: axis (int, optional): plot only forces along this axis, if not given plot all forces """ f = self._train.get_array("forces") if axis is not None: f = f[:, axis] else: f = f.ravel() plt.hist(f, bins=20) plt.xlabel(r"Force [eV/$\mathrm{\AA}$]")
[docs]class TrainingStorage(StructureStorage): def __init__(self): super().__init__() self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan) self._table_cache = None self.to_pandas()
[docs] def to_pandas(self): """ Export list of structure to pandas table for external fitting codes. The table contains the following columns: - 'name': human-readable name of the structure - 'ase_atoms': the structure as a :class:`.Atoms` object - 'energy': the energy of the full structure - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 - 'number_of_atoms': the number of atoms in the structure, N Returns: :class:`pandas.DataFrame`: collected structures """ if self._table_cache is None or len(self._table_cache) != len(self): self._table_cache = pd.DataFrame( { "name": [self.get_array("identifier", i) for i in range(len(self))], "atoms": [self.get_structure(i) for i in range(len(self))], "energy": [self.get_array("energy", i) for i in range(len(self))], } ) if self.has_array("forces"): self._table_cache["forces"] = [self.get_array("forces", i) for i in range(len(self))] if self.has_array("stress"): self._table_cache["stress"] = [self.get_array("stress", i) for i in range(len(self))] self._table_cache["number_of_atoms"] = [ len(s) for s in self._table_cache.atoms ] return self._table_cache
[docs] def include_job(self, job, iteration_step=-1): """ Add structure, energy and forces from job. Args: job (:class:`.AtomisticGenericJob`): job to take structure from iteration_step (int, optional): if job has multiple steps, this selects which to add """ energy = job.output.energy_pot[iteration_step] ff = job.output.forces if ff is not None: forces = ff[iteration_step] else: forces = None # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in # output/generic/stresses pp = job["output/generic/stresses"] if pp is None: pp = job.output.pressures if pp is not None: stress = pp[iteration_step] else: stress = None if stress is not None: stress = np.asarray(stress) if stress.shape == (3, 3): stress = np.array( [ stress[0, 0], stress[1, 1], stress[2, 2], stress[1, 2], stress[0, 2], stress[0, 1], ] ) self.include_structure( job.get_structure(iteration_step=iteration_step), energy=energy, forces=forces, stress=stress, name=job.name, )
[docs] @deprecate("Use add_structure instead") def include_structure( self, structure, energy, name=None, **properties ): """ Add new structure to structure list and save energy and forces with it. For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion is performed. Args: structure_or_job (:class:`~.Atoms`): structure to add energy (float): energy of the whole structure forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure stress (6 array of float, optional): per structure stresses in voigt notation name (str, optional): name describing the structure """ self.add_structure(structure, identifier=name, energy=energy, **properties)
[docs] def add_structure( self, structure: Atoms, energy, identifier=None, **arrays ) -> None: if "forces" in arrays and not self.has_array("forces"): self.add_array("forces", shape=(3,), dtype=np.float64, per="element", fill=np.nan) if "stress" in arrays and not self.has_array("stress"): # save stress in voigt notation self.add_array("stress", shape=(6,), dtype=np.float64, per="chunk", fill=np.nan) super().add_structure(structure, identifier=identifier, energy=energy, **arrays)
[docs] def include_dataset(self, dataset): """ Add a pandas DataFrame to the saved structures. The dataframe should have the following columns: - name: human readable name of the structure - atoms(:class:`ase.Atoms`): the atomic structure - energy(float): energy of the whole structure - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure - charges (Nx3 array of floats): - stress (6 array of float): per structure stress in voigt notation """ if ( "name" not in dataset.columns or "atoms" not in dataset.columns or "energy" not in dataset.columns ): raise ValueError( "At least columns 'name', 'atoms' and 'energy' must be present in dataset!" ) for row in dataset.itertuples(index=False): kwargs = {} if hasattr(row, "forces"): kwargs["forces"] = row.forces if hasattr(row, "stress"): kwargs["stress"] = row.stress self.add_structure( row.atoms, energy=row.energy, identifier=row.name, **kwargs )
[docs] def to_list(self, filter_function=None): """ Returns the data as lists of pyiron structures, energies, forces, and the number of atoms Args: filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it Returns: tuple: list of structures, energies, forces, and the number of atoms """ data_table = self.to_pandas() if filter_function is not None: data_table = filter_function(data_table) structure_list = data_table.atoms.to_list() energy_list = data_table.energy.to_list() if "forces" not in data_table.columns: raise ValueError("no forces defined in storage; call to_dict() instead.") force_list = data_table.forces.to_list() num_atoms_list = data_table.number_of_atoms.to_list() return (structure_list, energy_list, force_list, num_atoms_list)
[docs] def to_dict(self) -> Dict[str, Any]: """Return a dictionary of all structures and training properties.""" dict_arrays = {} # Get structure information. dict_arrays['structure'] = list(self.iter_structures()) # Some arrays are only for internal usage or structure information that # was already saved in dict['structure']. internal_arrays = ['start_index', 'length', 'cell', 'pbc', 'positions', 'symbols'] for array in self.list_arrays(): # Skip internal arrays. if array in internal_arrays: continue dict_arrays[array] = self.get_array_ragged(array) return dict_arrays
[docs] def iter(self, *arrays, wrap_atoms=True): """ Iterate over all structures in this object and all arrays that are defined Args: wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to :meth:`.get_structure()` *arrays (str): name of arrays that should be iterated over Yields: :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays """ array_vals = (self.get_array_ragged(a) for a in arrays) yield from zip(self.iter_structures(), *array_vals)
@property def plot(self): """ :class:`.TrainingPlots`: plotting interface """ return TrainingPlots(self)