"""
A collection of classes representing data stored by a storage backend.
"""
class BaseStoredData:
    """A base representation of cached data to be stored by
    a storage backend.

    The expectation is that stored data will exist in storage
    as two parts:

    1) A JSON serialized representation of this class (or
       a subclass), which contains lightweight information
       such as the state and composition of the system. Any
       larger pieces of data, such as coordinates or
       trajectories, should be referenced by this class as
       a filename.

    2) A directory like structure (either directly a directory,
       or some NetCDF like compressed archive) of ancillary
       files which do not easily lend themselves to be
       serialized within a JSON object, whose files are referenced
       by name by the data object.

    Attributes
    ----------
    substance: Substance
        A description of the composition of the stored system.
    thermodynamic_state: ThermodynamicState
        The state at which the data was collected.
    source_calculation_id: str
        The server id of the calculation which yielded this
        data.
    provenance: dict of str and Any
        A dictionary containing the provenance information about
        how this data was generated.
    force_field_id: str
        The server assigned unique id of the force field parameters
        used to generate the data.
    """

    def __init__(self):
        """Constructs a new BaseStoredData object"""
        self.substance = None
        self.thermodynamic_state = None

        self.source_calculation_id = None
        self.provenance = None

        self.force_field_id = None

    def can_merge(self, other_data):
        """Checks whether this piece of data stores the same
        amount of compatible information (or more) than another
        piece of stored data, and hence whether the two can be
        merged together.

        Parameters
        ----------
        other_data: BaseStoredData
            The other stored data to compare against.

        Returns
        -------
        bool
            Returns `True` if this piece of data stores the same
            amount of information or more than another piece of
            data, or false if it contains less or incompatible data.
        """
        if other_data is None:
            return False

        # An exact type match is required deliberately: a subclass may carry
        # extra information which an instance of a different type would lack.
        if type(self) != type(other_data):
            return False

        if self.thermodynamic_state != other_data.thermodynamic_state:
            return False

        if self.force_field_id != other_data.force_field_id:
            return False

        if self.substance != other_data.substance:
            return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one.

        Parameters
        ----------
        stored_data_1: BaseStoredData
            The first piece of stored data.
        stored_data_2: BaseStoredData
            The second piece of stored data.

        Returns
        -------
        BaseStoredData
            The merged stored data.

        Raises
        ------
        NotImplementedError
            Always - subclasses must provide a concrete merge strategy.
        """
        raise NotImplementedError()

    def __getstate__(self):
        # Lightweight, JSON-friendly snapshot of the stored metadata.
        return {
            "substance": self.substance,
            "thermodynamic_state": self.thermodynamic_state,
            "source_calculation_id": self.source_calculation_id,
            "provenance": self.provenance,
            "force_field_id": self.force_field_id,
        }

    def __setstate__(self, state):
        self.substance = state["substance"]
        self.thermodynamic_state = state["thermodynamic_state"]
        self.source_calculation_id = state["source_calculation_id"]
        self.provenance = state["provenance"]
        self.force_field_id = state["force_field_id"]
class StoredSimulationData(BaseStoredData):
    """A representation of data which has been cached
    from a single previous simulation.

    Notes
    -----
    The ancillary directory which stores larger information such
    as trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- coordinate_file_name.pdb
             |--- trajectory_file_name.dcd
             |--- statistics_file_name.csv

    Attributes
    ----------
    coordinate_file_name: str
        The name of a coordinate file which encodes the topology
        information of the system.
    trajectory_file_name: str
        The name of a .dcd trajectory file containing
        configurations generated by the simulation.
    statistics_file_name: str
        The name of a `StatisticsArray` csv file, containing
        statistics generated by the simulation.
    statistical_inefficiency: float
        The statistical inefficiency of the collected data.
    total_number_of_molecules: int
        The total number of molecules in the system.
    """

    def __init__(self):
        """Constructs a new StoredSimulationData object"""
        super().__init__()

        self.coordinate_file_name = None
        self.trajectory_file_name = None
        self.statistics_file_name = None

        self.statistical_inefficiency = 0.0
        self.total_number_of_molecules = None

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one, by only retaining the data with the longest
        autocorrelation time.

        Parameters
        ----------
        stored_data_1: StoredSimulationData
            The first piece of stored data.
        stored_data_2: StoredSimulationData
            The second piece of stored data.

        Returns
        -------
        StoredSimulationData
            The merged stored data.

        Raises
        ------
        ValueError
            If the two pieces of data fail the `can_merge` check.
        """
        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):
            raise ValueError(
                "The two pieces of data are incompatible and cannot "
                "be merged into one."
            )

        # Keep whichever data set is the least correlated (i.e. has the
        # larger statistical inefficiency); ties favour the first input.
        if (
            stored_data_1.statistical_inefficiency
            < stored_data_2.statistical_inefficiency
        ):
            return stored_data_2

        return stored_data_1

    def __getstate__(self):
        base_state = super().__getstate__()
        base_state.update(
            {
                "coordinate_file_name": self.coordinate_file_name,
                "trajectory_file_name": self.trajectory_file_name,
                "statistics_file_name": self.statistics_file_name,
                "statistical_inefficiency": self.statistical_inefficiency,
                "total_number_of_molecules": self.total_number_of_molecules,
            }
        )
        return base_state

    def __setstate__(self, state):
        super().__setstate__(state)

        self.coordinate_file_name = state["coordinate_file_name"]
        self.trajectory_file_name = state["trajectory_file_name"]
        self.statistics_file_name = state["statistics_file_name"]
        self.statistical_inefficiency = state["statistical_inefficiency"]
        self.total_number_of_molecules = state["total_number_of_molecules"]
class StoredDataCollection(BaseStoredData):
    """A collection of stored `StoredSimulationData` objects, all
    generated at the same state and using the same force field
    parameters.

    The ancillary directory which stores larger information such
    as trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- data_key_1
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_2
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_3
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv

    Attributes
    ----------
    data: dict of str and StoredSimulationData
        A dictionary of stored simulation data objects which
        have been given a unique key.
    """

    def __init__(self):
        """Constructs a new StoredDataCollection object"""
        super().__init__()
        self.data = {}

    def can_merge(self, other_data_collection):
        """Checks whether this collection can be merged with
        another: the base metadata must match, and the two
        collections must contain pairwise-mergeable data under
        exactly the same set of keys.

        Parameters
        ----------
        other_data_collection: StoredDataCollection
            The other stored data to compare against.

        Returns
        -------
        bool
            `True` if the two collections can be merged.
        """
        if not super().can_merge(other_data_collection):
            return False

        # The collections must share an identical key set...
        if len(self.data) != len(other_data_collection.data):
            return False

        # ...and every keyed entry must itself be mergeable.
        for data_key, self_data in self.data.items():

            if data_key not in other_data_collection.data:
                return False

            if not self_data.can_merge(other_data_collection.data[data_key]):
                return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one, by only retaining the data with the longest
        autocorrelation time.

        Parameters
        ----------
        stored_data_1: StoredDataCollection
            The first piece of stored data.
        stored_data_2: StoredDataCollection
            The second piece of stored data.

        Returns
        -------
        StoredDataCollection
            The merged stored data.

        Raises
        ------
        ValueError
            If the two collections fail the `can_merge` check.
        """
        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):
            raise ValueError(
                "The two pieces of data are incompatible and "
                "cannot be merged into one."
            )

        merged_data = cls()

        # Carry over the shared metadata (can_merge has already verified the
        # two inputs agree on these). Previously only `force_field_id` was
        # copied, leaving the merged collection without its substance or
        # state and so unable to pass later `can_merge` checks.
        merged_data.substance = stored_data_1.substance
        merged_data.thermodynamic_state = stored_data_1.thermodynamic_state
        merged_data.provenance = stored_data_1.provenance
        merged_data.force_field_id = stored_data_1.force_field_id
        # `source_calculation_id` is left as None: the merged data originates
        # from two calculations, so no single id applies.

        for data_key in stored_data_1.data:

            merged_data.data[data_key] = stored_data_1.data[data_key].merge(
                stored_data_1.data[data_key], stored_data_2.data[data_key]
            )

        return merged_data

    def __getstate__(self):
        state = super().__getstate__()
        state.update({"data": self.data})
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        self.data = state["data"]