"""Defines the base API for the property estimator storage backend."""

import hashlib
import uuid
from os import path

from import StoredSimulationData
from import BaseStoredData

class PropertyEstimatorStorage:
    """An abstract base representation of how the property estimator will
    interact with and store simulation data.

    Notes
    -----
    Any inheriting class must provide an implementation for the
    `_retrieve_object` and `_has_object` methods, as well as for the
    `retrieve_simulation_data_by_id` and `retrieve_simulation_data`
    methods, all of which raise `NotImplementedError` here.

    The `_store_object` method of this class only records the storage key
    of the object; it does not persist the object itself.
    """

    def __init__(self):
        """Constructs a new PropertyEstimatorStorage object.

        Notes
        -----
        The constructor immediately loads the stored object keys, the force
        field id map and the simulation data map, each of which goes through
        `_retrieve_object` / `_has_object`. Those raise `NotImplementedError`
        in this class, so any state those implementations rely on must be set
        up by the subclass *before* it calls this constructor.
        """

        # The keys of every object tracked by the storage system.
        self._stored_object_keys = set()
        self._stored_object_keys_file = 'internal_object_keys'

        # Store a map between the unique id of a force field,
        # and its hash value for easy comparison of force fields.
        self._force_field_id_map = {}
        self._force_field_id_map_file = 'internal_force_field_map'

        # A map between a substance identifier and the list of keys of the
        # simulation data which has been stored for that substance.
        self._simulation_data_by_substance = {}
        self._simulation_data_by_substance_file = 'internal_simulation_data_map'

        self._load_stored_object_keys()
        self._load_force_field_hashes()
        self._load_simulation_data_map()

    def _load_stored_object_keys(self):
        """Load the unique key of each object stored in the storage system.

        Keys which are listed in the stored key file but for which
        `_has_object` returns False are dropped.
        """
        stored_object_keys = self._retrieve_object(self._stored_object_keys_file)

        if stored_object_keys is None:
            stored_object_keys = set()

        # Only keep the keys which still exist in the storage system.
        self._stored_object_keys.update(
            unique_key for unique_key in stored_object_keys
            if self._has_object(unique_key)
        )

        # Store a fresh copy of the key set so that only entries
        # that exist in the system are actually referenced.
        self._save_stored_object_keys()

    def _save_stored_object_keys(self):
        """Save the unique key of each of the objects stored in the
        storage system.
        """
        self._store_object(self._stored_object_keys_file, self._stored_object_keys)

    def _store_object(self, storage_key, object_to_store):
        """Store an object in the estimators storage system.

        This base implementation only registers `storage_key` in the set of
        tracked keys (and saves that set); it does not write
        `object_to_store` anywhere.

        Parameters
        ----------
        storage_key: str
            A unique key that describes where the object will be stored in
            the storage system.
        object_to_store: Any
            The object to store. The object must be pickle serializable.

        Raises
        ------
        ValueError
            If `object_to_store` is None.
        """
        if object_to_store is None:
            raise ValueError('The object to store cannot be None.')

        # The key is already tracked, so there is nothing to update. This
        # early exit is also what terminates the mutual recursion with
        # `_save_stored_object_keys`, which itself calls `_store_object`.
        if storage_key in self._stored_object_keys:
            return

        self._stored_object_keys.add(storage_key)
        self._save_stored_object_keys()

    def _retrieve_object(self, storage_key):
        """Retrieves a stored object from the estimators storage system.

        Parameters
        ----------
        storage_key: str
            A unique key that describes where the stored object can be found
            within the storage system.

        Returns
        -------
        Any, optional
            The stored object if the object key is found, otherwise None.
        """
        raise NotImplementedError()

    def _has_object(self, storage_key):
        """Check whether an object with the specified key exists in the
        storage system.

        Parameters
        ----------
        storage_key: str
            A unique key that describes where the stored object can be found
            within the storage system.

        Returns
        -------
        bool
            True if the object is within the storage system.
        """
        raise NotImplementedError()

    @staticmethod
    def _force_field_storage_key(unique_id):
        """Returns the storage key under which the force field with the
        given unique id is stored.

        Parameters
        ----------
        unique_id: str
            The unique id of the force field.

        Returns
        -------
        str
            The storage key of the force field.
        """
        return 'force_field_{}'.format(unique_id)

    def _load_force_field_hashes(self):
        """Load the unique id and hash key of each of the force fields which
        have been stored in the storage system.

        Entries whose force field object no longer exists (according to
        `_has_object`) are dropped.
        """
        force_field_id_map = self._retrieve_object(self._force_field_id_map_file)

        if force_field_id_map is None:
            force_field_id_map = {}

        for unique_id, hash_string in force_field_id_map.items():

            if not self._has_object(self._force_field_storage_key(unique_id)):
                # The force field object does not exist, so skip the entry.
                continue

            self._force_field_id_map[unique_id] = hash_string

        # Store a fresh copy of the hashes so that only force fields that
        # exist are actually referenced.
        self._save_force_field_hashes()

    def _save_force_field_hashes(self):
        """Save the unique id and force field hash key dictionary."""
        self._store_object(self._force_field_id_map_file, self._force_field_id_map)

    @staticmethod
    def _force_field_to_hash(force_field):
        """Converts a ForceField object to a hash string.

        The hash is the SHA-256 hex digest of the string representation of
        the force field, with its cosmetic attributes discarded.

        Parameters
        ----------
        force_field: ForceField
            The force field to hash.

        Returns
        -------
        str
            The hash key of the force field.
        """
        force_field_string = force_field.to_string(discard_cosmetic_attributes=True)
        return hashlib.sha256(force_field_string.encode()).hexdigest()

    def has_force_field(self, force_field):
        """Checks whether the force field has been previously stored in the
        storage system.

        Parameters
        ----------
        force_field: ForceField
            The force field to check for.

        Returns
        -------
        str, optional
            None if the force field has not been cached, otherwise the
            unique id of the cached force field.
        """
        hash_string = self._force_field_to_hash(force_field)

        for unique_id, existing_hash in self._force_field_id_map.items():

            if hash_string != existing_hash:
                continue

            if not self._has_object(self._force_field_storage_key(unique_id)):
                # For some reason the force field got deleted, so keep
                # looking for another entry with the same hash.
                continue

            return unique_id

        return None

    def retrieve_force_field(self, unique_id):
        """Retrieves a force field from storage.

        Parameters
        ----------
        unique_id: str
            The unique id of the force field to retrieve

        Returns
        -------
        openforcefield.typing.engines.smirnoff.ForceField
            The force field stored in the storage system with the given id.

        Raises
        ------
        KeyError
            If no force field with the given id exists in the storage system.
        """
        # Imported locally, as in the original implementation, so that the
        # toolkit is only loaded when a force field is actually requested.
        from openforcefield.typing.engines.smirnoff import ForceField

        serialized_force_field = self._retrieve_object(
            self._force_field_storage_key(unique_id)
        )

        if serialized_force_field is None:

            raise KeyError(f'The force field with id {unique_id} does not exist '
                           f'in the storage system.')

        return ForceField(serialized_force_field)

    def store_force_field(self, force_field):
        """Store the force field in the storage system.

        Notes
        -----
        A new unique id is generated on every call; this method does not
        check whether an equivalent force field has already been stored
        (see `has_force_field`).

        Parameters
        ----------
        force_field: openforcefield.typing.engines.smirnoff.ForceField
            The force field to store.

        Returns
        -------
        str
            The unique id of the stored force field.
        """
        unique_id = str(uuid.uuid4())

        # Be extra cautious and make sure there wasn't
        # an id collision.
        while unique_id in self._force_field_id_map:
            unique_id = str(uuid.uuid4())

        hash_string = self._force_field_to_hash(force_field)

        # We make sure to strip the cosmetic attributes from the stored FF as
        # these should not affect the science of the FF, and aren't currently
        # consumed by the estimator.
        self._store_object(self._force_field_storage_key(unique_id),
                           force_field.to_string(discard_cosmetic_attributes=True))

        # Record the hash of the force field for easy access. The id is
        # guaranteed to be new at this point, so the map always changes and
        # always needs to be saved.
        self._force_field_id_map[unique_id] = hash_string
        self._save_force_field_hashes()

        return unique_id

    def _load_simulation_data_map(self):
        """Load the dictionary which tracks which stored simulation data was
        calculated for a specific substance.

        Data keys for which no data object can be retrieved, or whose data
        directory does not exist, are dropped.
        """
        stored_data_map = self._retrieve_object(self._simulation_data_by_substance_file)

        if stored_data_map is None:
            stored_data_map = {}

        for substance_id, unique_ids in stored_data_map.items():

            existing_ids = []
            self._simulation_data_by_substance[substance_id] = existing_ids

            for unique_id in unique_ids:

                data_object, data_directory = self.retrieve_simulation_data_by_id(unique_id)

                if (data_object is None or
                        data_directory is None or
                        not path.isdir(data_directory)):

                    # The stored data does not exist, so skip the entry.
                    continue

                existing_ids.append(unique_id)

        # Store a fresh copy of the map so that only simulation data that
        # exists is actually referenced.
        self._save_simulation_data_map()

    def _save_simulation_data_map(self):
        """Save the unique id and simulation data key by substance
        dictionary.
        """
        self._store_object(self._simulation_data_by_substance_file,
                           self._simulation_data_by_substance)

    def retrieve_simulation_data_by_id(self, unique_id):
        """Attempts to retrieve a stored piece of simulation data from its
        unique id.

        Parameters
        ----------
        unique_id: str
            The unique id assigned to the data.

        Returns
        -------
        BaseStoredData
            The stored data object.
        str
            The path to the data's corresponding directory.
        """
        raise NotImplementedError()

    def retrieve_simulation_data(self, substance, include_component_data=True,
                                 data_class=StoredSimulationData):
        """Retrieves any data that has been stored for a given substance.

        Parameters
        ----------
        substance: Substance
            The substance to check for.
        include_component_data: bool
            If the substance is a mixture which has multiple components and
            `include_component_data` is True, data will be returned for both
            the mixed system, and for the individual components, otherwise
            only data for the mixed system will be returned.
        data_class: subclass of BaseStoredData
            The type of data to retrieve.

        Returns
        -------
        dict of str and tuple of BaseStoredData and str
            A dictionary of the stored data objects and their corresponding
            directory paths partitioned by substance id.
        """
        raise NotImplementedError()

    def store_simulation_data(self, data_object, data_directory):
        """Store the simulation data.

        Notes
        -----
        If the storage system already contains equivalent information (i.e
        data stored for the same substance, thermodynamic state and parameter
        set) then the data will be merged according to the data objects
        `merge` method.

        Parameters
        ----------
        data_object: BaseStoredData
            The data object being stored.
        data_directory: str
            The directory which stores files associated with the data object
            such as trajectory files.

        Returns
        -------
        str
            The unique id of the stored data.

        Raises
        ------
        ValueError
            If `data_directory` is not an existing directory, if
            `data_object` does not inherit from `BaseStoredData`, or if
            `data_object` has no substance.
        """
        # NOTE(review): `data_directory` is validated here but is not
        # otherwise used by this base implementation.
        if not path.isdir(data_directory):

            raise ValueError(f'The {data_directory} data directory either could'
                             f' not be found or is invalid.')

        if not isinstance(data_object, BaseStoredData):
            raise ValueError('The data object must inherit from the `BaseStoredData` class.')

        if data_object.substance is None:
            raise ValueError('The data object must have a valid substance.')

        substance_id = data_object.substance.identifier

        existing_data_key = None
        data_to_store = None

        # Check if any existing stored data is compatible with the
        # new data we are trying to store.
        for stored_data_key in self._simulation_data_by_substance.get(substance_id, []):

            stored_data = self._retrieve_object(stored_data_key)

            # `_retrieve_object` returns None when the key cannot be found,
            # in which case there is nothing to merge with.
            if stored_data is None or not stored_data.can_merge(data_object):
                continue

            # NOTE(review): `merge` is passed both objects explicitly, so it
            # is presumably a class or static method - confirm against
            # `BaseStoredData`.
            data_to_store = stored_data.merge(stored_data, data_object)
            existing_data_key = stored_data_key

            break

        if existing_data_key is None:

            # No compatible data was found, so store the new data under a
            # freshly generated key.
            existing_data_key = "{}_{}".format(substance_id, uuid.uuid4())
            data_to_store = data_object

        self._store_object(existing_data_key, data_to_store)

        # Store the unique id assigned to the data in the master
        # list of ids if not already present.
        substance_data_keys = self._simulation_data_by_substance.setdefault(substance_id, [])

        if existing_data_key not in substance_data_keys:

            substance_data_keys.append(existing_data_key)
            self._save_simulation_data_map()

        return existing_data_key