Source code for propertyestimator.storage.dataclasses

"""
A collection of classes representing data stored by a storage backend.
"""


class BaseStoredData:
    """A base representation of cached data to be stored by a storage backend.

    The expectation is that stored data will exist in storage as two parts:

        1) A JSON serialized representation of this class (or a subclass),
           which contains lightweight information such as the state and
           composition of the system. Any larger pieces of data, such as
           coordinates or trajectories, should be referenced by this class
           as a filename.

        2) A directory like structure (either directly a directory, or some
           NetCDF like compressed archive) of ancillary files which do not
           easily lend themselves to be serialized within a JSON object,
           whose files are referenced by name by the data object.

    Attributes
    ----------
    substance: Substance
        A description of the composition of the stored system.
    thermodynamic_state: ThermodynamicState
        The state at which the data was collected.
    source_calculation_id: str
        The server id of the calculation which yielded this data.
    provenance: dict of str and Any
        A dictionary containing the provenance information about how
        this data was generated.
    force_field_id: str
        The server assigned unique id of the force field parameters used
        to generate the data.
    """

    def __init__(self):
        """Constructs a new BaseStoredData object with every attribute
        initialised to `None`."""
        self.substance = None
        self.thermodynamic_state = None

        self.source_calculation_id = None
        self.provenance = None

        self.force_field_id = None

    def can_merge(self, other_data):
        """Checks whether this piece of data stores the same amount of
        compatible information (or more) than another piece of stored data,
        and hence whether the two can be merged together.

        Two pieces of data are considered compatible when they are of exactly
        the same type, and their thermodynamic state, force field id and
        substance all compare equal. The source calculation id and the
        provenance are not taken into account.

        Parameters
        ----------
        other_data: BaseStoredData
            The other stored data to compare against.

        Returns
        -------
        bool
            Returns `True` if this piece of data stores the same amount of
            information or more than another piece of data, or false if it
            contains less or incompatible data.
        """
        if other_data is None:
            return False

        # An exact type match is required - a subclass is deliberately *not*
        # treated as compatible with its parent class.
        if type(self) is not type(other_data):
            return False

        if self.thermodynamic_state != other_data.thermodynamic_state:
            return False

        if self.force_field_id != other_data.force_field_id:
            return False

        if self.substance != other_data.substance:
            return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data into one.

        This base implementation is abstract and must be overridden by
        subclasses.

        Parameters
        ----------
        stored_data_1: BaseStoredData
            The first piece of stored data.
        stored_data_2: BaseStoredData
            The second piece of stored data.

        Returns
        -------
        BaseStoredData
            The merged stored data.

        Raises
        ------
        NotImplementedError
            Always, as the base class does not define how to merge data.
        """
        raise NotImplementedError()

    def __getstate__(self):
        """Returns the attributes of this object as a plain dictionary."""
        return {
            'substance': self.substance,
            'thermodynamic_state': self.thermodynamic_state,
            'source_calculation_id': self.source_calculation_id,
            'provenance': self.provenance,
            'force_field_id': self.force_field_id,
        }

    def __setstate__(self, state):
        """Restores the attributes of this object from a dictionary of the
        form produced by `__getstate__`. Every key must be present."""
        self.substance = state['substance']
        self.thermodynamic_state = state['thermodynamic_state']

        self.source_calculation_id = state['source_calculation_id']
        self.provenance = state['provenance']

        self.force_field_id = state['force_field_id']
class StoredSimulationData(BaseStoredData):
    """A representation of data which has been cached from a single
    previous simulation.

    Notes
    -----
    The ancillary directory which stores larger information such as
    trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- coordinate_file_name.pdb
             |--- trajectory_file_name.dcd
             |--- statistics_file_name.csv

    Attributes
    ----------
    coordinate_file_name: str
        The name of a coordinate file which encodes the topology
        information of the system.
    trajectory_file_name: str
        The name of a .dcd trajectory file containing configurations
        generated by the simulation.
    statistics_file_name: str
        The name of a `StatisticsArray` csv file, containing statistics
        generated by the simulation.
    statistical_inefficiency: float
        The statistical inefficiency of the collected data.
    total_number_of_molecules: int
        The total number of molecules in the system.
    """

    def __init__(self):
        """Constructs a new StoredSimulationData object. All file names and
        the molecule count start as `None`; the statistical inefficiency
        starts at 0.0."""
        super().__init__()

        self.coordinate_file_name = None
        self.trajectory_file_name = None

        self.statistics_file_name = None
        self.statistical_inefficiency = 0.0

        self.total_number_of_molecules = None

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data into one, by
        only retaining the data with the longest autocorrelation time.

        No new object is created - one of the two inputs is returned as is.
        The input with the larger `statistical_inefficiency` is kept, and
        `stored_data_1` is kept when the two are equal.

        Parameters
        ----------
        stored_data_1: StoredSimulationData
            The first piece of stored data.
        stored_data_2: StoredSimulationData
            The second piece of stored data.

        Returns
        -------
        StoredSimulationData
            The merged stored data.

        Raises
        ------
        ValueError
            If `stored_data_1.can_merge(stored_data_2)` is false.
        """

        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):

            raise ValueError('The two pieces of data are incompatible and cannot '
                             'be merged into one.')

        if stored_data_1.statistical_inefficiency < stored_data_2.statistical_inefficiency:
            return stored_data_2

        return stored_data_1

    def __getstate__(self):
        """Returns the base class state extended with the simulation
        specific attributes."""
        base_state = super().__getstate__()

        base_state.update({
            'coordinate_file_name': self.coordinate_file_name,
            'trajectory_file_name': self.trajectory_file_name,

            'statistics_file_name': self.statistics_file_name,
            'statistical_inefficiency': self.statistical_inefficiency,

            'total_number_of_molecules': self.total_number_of_molecules,
        })

        return base_state

    def __setstate__(self, state):
        """Restores both the base class attributes and the simulation
        specific attributes from `state`. Every key must be present."""
        super().__setstate__(state)

        self.coordinate_file_name = state['coordinate_file_name']
        self.trajectory_file_name = state['trajectory_file_name']

        self.statistics_file_name = state['statistics_file_name']
        self.statistical_inefficiency = state['statistical_inefficiency']

        self.total_number_of_molecules = state['total_number_of_molecules']
class StoredDataCollection(BaseStoredData):
    """A collection of stored `StoredSimulationData` objects, all
    generated at the same state and using the same force field
    parameters.

    The ancillary directory which stores larger information such as
    trajectories should be of the form:

    .. code-block::

        |--- data_object.json
        |--- data_directory
             |--- data_key_1
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_2
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv
             |--- data_key_3
                  |--- coordinate_file_name.pdb
                  |--- trajectory_file_name.dcd
                  |--- statistics_file_name.csv

    Attributes
    ----------
    data: dict of str and StoredSimulationData
        A dictionary of stored simulation data objects which have been
        given a unique key.
    """

    def __init__(self):
        """Constructs a new, empty StoredDataCollection object."""
        super().__init__()
        self.data = {}

    def can_merge(self, other_data_collection):
        """Checks whether this collection can be merged with another one.

        On top of the checks performed by the base class, both collections
        must hold exactly the same set of data keys, and the data stored
        under each key in this collection must itself report that it can
        be merged with the data stored under the same key in the other
        collection.

        Parameters
        ----------
        other_data_collection: StoredDataCollection
            The other stored data to compare against.

        Returns
        -------
        bool
            `True` if the two collections can be merged, `False` otherwise.
        """
        if not super().can_merge(other_data_collection):
            return False

        other_entries = other_data_collection.data

        # Equal sizes plus every key of `self` being present in the other
        # collection implies that the two key sets are identical.
        if len(self.data) != len(other_entries):
            return False

        for data_key, self_data in self.data.items():

            if data_key not in other_entries:
                return False

            if not self_data.can_merge(other_entries[data_key]):
                return False

        return True

    @classmethod
    def merge(cls, stored_data_1, stored_data_2):
        """Collapse two pieces of compatible stored data into one, by
        only retaining the data with the longest autocorrelation time.

        A new collection is created. For each data key, the entry is the
        result of merging the corresponding entries of the two inputs using
        the `merge` method of the entry taken from `stored_data_1`.

        Parameters
        ----------
        stored_data_1: StoredDataCollection
            The first piece of stored data.
        stored_data_2: StoredDataCollection
            The second piece of stored data.

        Returns
        -------
        StoredDataCollection
            The merged stored data.

        Raises
        ------
        ValueError
            If `stored_data_1.can_merge(stored_data_2)` is false.
        """

        # Make sure the two objects can actually be merged.
        if not stored_data_1.can_merge(stored_data_2):

            raise ValueError('The two pieces of data are incompatible and '
                             'cannot be merged into one.')

        merged_data = cls()

        # `can_merge` has verified that these three attributes compare equal
        # on both inputs, so carry all of them over. Leaving the substance or
        # state unset would make the merged collection incompatible with the
        # very collections it was built from.
        merged_data.substance = stored_data_1.substance
        merged_data.thermodynamic_state = stored_data_1.thermodynamic_state
        merged_data.force_field_id = stored_data_1.force_field_id

        # `source_calculation_id` and `provenance` are not compared by
        # `can_merge` and may differ between the inputs, so they are left at
        # their defaults on the collection itself.

        for data_key, data_1 in stored_data_1.data.items():
            merged_data.data[data_key] = data_1.merge(data_1, stored_data_2.data[data_key])

        return merged_data

    def __getstate__(self):
        """Returns the base class state extended with the `data`
        dictionary."""
        state = super().__getstate__()
        state.update({'data': self.data})

        return state

    def __setstate__(self, state):
        """Restores the base class attributes and the `data` dictionary
        from `state`. Every key must be present."""
        super().__setstate__(state)
        self.data = state['data']