"""
A collection of classes representing data stored by a storage backend.
"""
class BaseStoredData:
    """A base representation of cached data to be stored by
    a storage backend.

    The expectation is that stored data will exist in storage
    as two parts:

    1) A JSON serialized representation of this class (or
       a subclass), which contains lightweight information
       such as the state and composition of the system. Any
       larger pieces of data, such as coordinates or
       trajectories, should be referenced by this class as
       a filename.

    2) A directory like structure (either directly a directory,
       or some NetCDF like compressed archive) of ancillary
       files which do not easily lend themselves to be
       serialized within a JSON object, whose files are referenced
       by name by the data object.

    Attributes
    ----------
    substance: Substance
        A description of the composition of the stored system.
    thermodynamic_state: ThermodynamicState
        The state at which the data was collected.
    source_calculation_id: str
        The server id of the calculation which yielded this
        data.
    provenance: dict of str and Any
        A dictionary containing the provenance information about
        how this data was generated.
    force_field_id: str
        The server assigned unique id of the force field parameters
        used to generate the data.
    """

    def __init__(self):
        """Constructs a new BaseStoredData object"""
        self.substance = None
        self.thermodynamic_state = None

        self.source_calculation_id = None
        self.provenance = None

        self.force_field_id = None

    def can_merge(self, other_data):
        """Checks whether this piece of data stores the same
        amount of compatible information (or more) than another
        piece of stored data, and hence whether the two can be
        merged together.

        Parameters
        ----------
        other_data: BaseStoredData
            The other stored data to compare against.

        Returns
        -------
        bool
            Returns `True` if this piece of data stores the same
            amount of information or more than another piece of
            data, or false if it contains less or incompatible data.
        """
        if other_data is None:
            return False

        # An exact type match is required deliberately: a subclass may carry
        # extra information which an instance of a different type would lack.
        if type(self) != type(other_data):
            return False

        if self.thermodynamic_state != other_data.thermodynamic_state:
            return False

        if self.force_field_id != other_data.force_field_id:
            return False

        if self.substance != other_data.substance:
            return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one.

        Parameters
        ----------
        stored_data_1: BaseStoredData
            The first piece of stored data.
        stored_data_2: BaseStoredData
            The second piece of stored data.

        Returns
        -------
        BaseStoredData
            The merged stored data.

        Raises
        ------
        NotImplementedError
            Always - subclasses must provide a concrete merge strategy.
        """
        raise NotImplementedError()

    def __getstate__(self):
        # Lightweight, JSON-friendly snapshot of the stored metadata.
        return {
            "substance": self.substance,
            "thermodynamic_state": self.thermodynamic_state,
            "source_calculation_id": self.source_calculation_id,
            "provenance": self.provenance,
            "force_field_id": self.force_field_id,
        }

    def __setstate__(self, state):
        self.substance = state["substance"]
        self.thermodynamic_state = state["thermodynamic_state"]
        self.source_calculation_id = state["source_calculation_id"]
        self.provenance = state["provenance"]
        self.force_field_id = state["force_field_id"]
class StoredSimulationData(BaseStoredData):
    """A representation of data which has been cached
    from a single previous simulation.

    Notes
    -----
    The ancillary directory which stores larger information such
    as trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- coordinate_file_name.pdb
             |--- trajectory_file_name.dcd
             |--- statistics_file_name.csv

    Attributes
    ----------
    coordinate_file_name: str
        The name of a coordinate file which encodes the topology
        information of the system.
    trajectory_file_name: str
        The name of a .dcd trajectory file containing
        configurations generated by the simulation.
    statistics_file_name: str
        The name of a `StatisticsArray` csv file, containing
        statistics generated by the simulation.
    statistical_inefficiency: float
        The statistical inefficiency of the collected data.
    total_number_of_molecules: int
        The total number of molecules in the system.
    """

    def __init__(self):
        """Constructs a new StoredSimulationData object"""
        super().__init__()

        self.coordinate_file_name = None
        self.trajectory_file_name = None
        self.statistics_file_name = None

        self.statistical_inefficiency = 0.0
        self.total_number_of_molecules = None

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one, by only retaining the data with the longest
        autocorrelation time.

        Parameters
        ----------
        stored_data_1: StoredSimulationData
            The first piece of stored data.
        stored_data_2: StoredSimulationData
            The second piece of stored data.

        Returns
        -------
        StoredSimulationData
            The merged stored data.

        Raises
        ------
        ValueError
            If the two pieces of data fail the `can_merge` check.
        """
        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):
            raise ValueError(
                "The two pieces of data are incompatible and cannot "
                "be merged into one."
            )

        # Keep whichever data set is the least correlated (i.e. has the
        # larger statistical inefficiency); ties favour the first input.
        if (
            stored_data_1.statistical_inefficiency
            < stored_data_2.statistical_inefficiency
        ):
            return stored_data_2

        return stored_data_1

    def __getstate__(self):
        base_state = super().__getstate__()
        base_state.update(
            {
                "coordinate_file_name": self.coordinate_file_name,
                "trajectory_file_name": self.trajectory_file_name,
                "statistics_file_name": self.statistics_file_name,
                "statistical_inefficiency": self.statistical_inefficiency,
                "total_number_of_molecules": self.total_number_of_molecules,
            }
        )
        return base_state

    def __setstate__(self, state):
        super().__setstate__(state)

        self.coordinate_file_name = state["coordinate_file_name"]
        self.trajectory_file_name = state["trajectory_file_name"]
        self.statistics_file_name = state["statistics_file_name"]
        self.statistical_inefficiency = state["statistical_inefficiency"]
        self.total_number_of_molecules = state["total_number_of_molecules"]
class StoredDataCollection(BaseStoredData):
    """A collection of stored `StoredSimulationData` objects, all
    generated at the same state and using the same force field
    parameters.

    The ancillary directory which stores larger information such
    as trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- data_key_1
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_2
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_3
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv

    Attributes
    ----------
    data: dict of str and StoredSimulationData
        A dictionary of stored simulation data objects which
        have been given a unique key.
    """

    def __init__(self):
        """Constructs a new StoredDataCollection object"""
        super().__init__()
        self.data = {}

    def can_merge(self, other_data_collection):
        """Checks whether this collection can be merged with
        another: the base metadata must match, and the two
        collections must contain pairwise-mergeable data under
        exactly the same set of keys.

        Parameters
        ----------
        other_data_collection: StoredDataCollection
            The other stored data to compare against.

        Returns
        -------
        bool
            `True` if the two collections can be merged.
        """
        if not super().can_merge(other_data_collection):
            return False

        # The collections must share an identical key set...
        if len(self.data) != len(other_data_collection.data):
            return False

        # ...and every keyed entry must itself be mergeable.
        for data_key, self_data in self.data.items():

            if data_key not in other_data_collection.data:
                return False

            if not self_data.can_merge(other_data_collection.data[data_key]):
                return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data
        into one, by only retaining the data with the longest
        autocorrelation time.

        Parameters
        ----------
        stored_data_1: StoredDataCollection
            The first piece of stored data.
        stored_data_2: StoredDataCollection
            The second piece of stored data.

        Returns
        -------
        StoredDataCollection
            The merged stored data.

        Raises
        ------
        ValueError
            If the two collections fail the `can_merge` check.
        """
        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):
            raise ValueError(
                "The two pieces of data are incompatible and "
                "cannot be merged into one."
            )

        merged_data = cls()

        # Carry over the shared metadata (can_merge has already verified the
        # two inputs agree on these). Previously only `force_field_id` was
        # copied, leaving the merged collection without its substance or
        # state and so unable to pass later `can_merge` checks.
        merged_data.substance = stored_data_1.substance
        merged_data.thermodynamic_state = stored_data_1.thermodynamic_state
        merged_data.provenance = stored_data_1.provenance
        merged_data.force_field_id = stored_data_1.force_field_id
        # `source_calculation_id` is left as None: the merged data originates
        # from two calculations, so no single id applies.

        for data_key in stored_data_1.data:

            merged_data.data[data_key] = stored_data_1.data[data_key].merge(
                stored_data_1.data[data_key], stored_data_2.data[data_key]
            )

        return merged_data

    def __getstate__(self):
        state = super().__getstate__()
        state.update({"data": self.data})
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        self.data = state["data"]