"""
An API for defining, storing, and loading collections of physical property data.
"""
from collections import defaultdict
import pandas
from simtk.openmm.app import element
from propertyestimator import unit
from propertyestimator.properties import MeasurementSource, CalculationSource
from propertyestimator.substances import Substance
from propertyestimator.utils import create_molecule_from_smiles
from propertyestimator.utils.serialization import TypedBaseModel
class PhysicalPropertyDataSet(TypedBaseModel):
    """
    An object for storing and curating data sets of both measured and
    estimated physical properties. This class defines a number of convenience
    functions for filtering out unwanted properties, and for generating
    general statistics (such as the number of properties per substance)
    about the set.
    """

    def __init__(self):
        """
        Constructs a new, empty PhysicalPropertyDataSet object.
        """
        # Maps a substance identifier onto the list of properties
        # which are stored for that substance.
        self._properties = {}
        self._sources = []

    @property
    def properties(self):
        """
        dict of str and list of PhysicalProperty: A list of all of the properties
        within this set, partitioned by substance identifier.

        TODO: Add a link to Substance.identifier when have access to sphinx docs.
        TODO: Investigate why PhysicalProperty is not cross-linking.

        See Also
        --------
        Substance.identifier()
        """
        return self._properties

    @property
    def sources(self):
        """list of Source: The list of sources from which the properties were gathered"""
        return self._sources

    @property
    def number_of_properties(self):
        """int: The number of properties in the data set."""
        return sum(len(properties) for properties in self._properties.values())

    def merge(self, data_set):
        """Merge another data set into the current one.

        The properties of `data_set` are appended to the per-substance lists
        of this set, and its sources are appended to this set's sources. No
        de-duplication is performed.

        Parameters
        ----------
        data_set : PhysicalPropertyDataSet
            The secondary data set to merge into this one. If `None`,
            this method does nothing.
        """
        if data_set is None:
            return

        # TODO: Do we need to check whether merging the same data set here?
        for substance_hash, properties in data_set.properties.items():
            self._properties.setdefault(substance_hash, []).extend(properties)

        self._sources.extend(data_set.sources)

    def filter_by_function(self, filter_function):
        """Filter the data set using a given filter function.

        Only those properties for which `filter_function` returns a truthy
        value are retained. Substances which are left without any properties
        are removed from the set entirely.

        Parameters
        ----------
        filter_function : callable
            A function which takes a single property as its argument and
            returns whether that property should be retained.
        """
        # This works for now - if we wish to be able to undo a filter then
        # a 'filtered' list needs to be maintained separately to the main list.
        filtered_properties = {}

        for substance_id, substance_properties in self._properties.items():

            retained = [
                physical_property
                for physical_property in substance_properties
                if filter_function(physical_property)
            ]

            if retained:
                filtered_properties[substance_id] = retained

        self._properties = filtered_properties

    def filter_by_property_types(self, *property_type):
        """Filter the data set based on the type of property (e.g Density).

        Parameters
        ----------
        property_type : PropertyType or str
            The type of property which should be retained, given either
            as the class itself or as the name of the class.

        Examples
        --------
        Filter the dataset to only contain densities and static dielectric constants

        >>> # Load in the data set of properties which will be used for comparisons
        >>> from propertyestimator.datasets import ThermoMLDataSet
        >>> data_set = ThermoMLDataSet.from_doi('10.1016/j.jct.2016.10.001')
        >>>
        >>> # Filter the dataset to only include densities and dielectric constants.
        >>> from propertyestimator.properties import Density, DielectricConstant
        >>> data_set.filter_by_property_types(Density, DielectricConstant)

        or

        >>> data_set.filter_by_property_types('Density', 'DielectricConstant')
        """
        # Normalise everything to class names so that strings and
        # classes can be mixed freely by the caller.
        property_types = [
            type_to_retain if isinstance(type_to_retain, str) else type_to_retain.__name__
            for type_to_retain in property_type
        ]

        def filter_function(x):
            return type(x).__name__ in property_types

        self.filter_by_function(filter_function)

    def filter_by_phases(self, phases):
        """Filter the data set based on the phase of the property (e.g liquid).

        A property is retained when the bitwise AND of its phase with
        `phases` is truthy.

        Parameters
        ----------
        phases : PropertyPhase
            The phase of property which should be retained.

        Examples
        --------
        Filter the dataset to only include liquid properties.

        >>> # Load in the data set of properties which will be used for comparisons
        >>> from propertyestimator.datasets import ThermoMLDataSet
        >>> data_set = ThermoMLDataSet.from_doi('10.1016/j.jct.2016.10.001')
        >>>
        >>> from propertyestimator.properties import PropertyPhase
        >>> data_set.filter_by_phases(PropertyPhase.Liquid)
        """
        def filter_function(x):
            return x.phase & phases

        self.filter_by_function(filter_function)

    def filter_by_temperature(self, min_temperature, max_temperature):
        """Filter the data set based on a minimum and maximum temperature.

        Both bounds are inclusive.

        Parameters
        ----------
        min_temperature : unit.Quantity
            The minimum temperature.
        max_temperature : unit.Quantity
            The maximum temperature.

        Examples
        --------
        Filter the dataset to only include properties measured between 130-260 K.

        >>> # Load in the data set of properties which will be used for comparisons
        >>> from propertyestimator.datasets import ThermoMLDataSet
        >>> data_set = ThermoMLDataSet.from_doi('10.1016/j.jct.2016.10.001')
        >>>
        >>> from propertyestimator import unit
        >>> data_set.filter_by_temperature(min_temperature=130*unit.kelvin, max_temperature=260*unit.kelvin)
        """
        def filter_function(x):
            return min_temperature <= x.thermodynamic_state.temperature <= max_temperature

        self.filter_by_function(filter_function)

    def filter_by_pressure(self, min_pressure, max_pressure):
        """Filter the data set based on a minimum and maximum pressure.

        Both bounds are inclusive. Properties whose thermodynamic state has
        no pressure defined (i.e. it is `None`) are always retained.

        Parameters
        ----------
        min_pressure : unit.Quantity
            The minimum pressure.
        max_pressure : unit.Quantity
            The maximum pressure.

        Examples
        --------
        Filter the dataset to only include properties measured between 70-150 kPa.

        >>> # Load in the data set of properties which will be used for comparisons
        >>> from propertyestimator.datasets import ThermoMLDataSet
        >>> data_set = ThermoMLDataSet.from_doi('10.1016/j.jct.2016.10.001')
        >>>
        >>> from propertyestimator import unit
        >>> data_set.filter_by_pressure(min_pressure=70*unit.kilopascal, max_pressure=150*unit.kilopascal)
        """
        def filter_function(x):

            if x.thermodynamic_state.pressure is None:
                return True

            return min_pressure <= x.thermodynamic_state.pressure <= max_pressure

        self.filter_by_function(filter_function)

    def filter_by_components(self, number_of_components):
        """Filter the data set based on the number of components present
        in the substance for which each property was measured.

        Parameters
        ----------
        number_of_components : int
            The allowed number of components in the mixture.

        Examples
        --------
        Filter the dataset to only include pure substance properties.

        >>> # Load in the data set of properties which will be used for comparisons
        >>> from propertyestimator.datasets import ThermoMLDataSet
        >>> data_set = ThermoMLDataSet.from_doi('10.1016/j.jct.2016.10.001')
        >>>
        >>> data_set.filter_by_components(number_of_components=1)
        """
        def filter_function(x):
            return x.substance.number_of_components == number_of_components

        self.filter_by_function(filter_function)

    def filter_by_elements(self, *allowed_elements):
        """Filters out those properties which were estimated for
        compounds which contain elements outside of those defined
        in `allowed_elements`.

        Parameters
        ----------
        allowed_elements: str
            The symbols (e.g. C, H, Cl) of the elements to
            retain.
        """
        def filter_function(physical_property):

            substance = physical_property.substance

            for component in substance.components:

                oe_molecule = create_molecule_from_smiles(component.smiles, 0)

                # Reject the property as soon as a single atom of a
                # disallowed element is found in any component.
                contains_disallowed_element = any(
                    element.Element.getByAtomicNumber(atom.GetAtomicNum()).symbol
                    not in allowed_elements
                    for atom in oe_molecule.GetAtoms()
                )

                if contains_disallowed_element:
                    return False

            return True

        self.filter_by_function(filter_function)

    def filter_by_smiles(self, *allowed_smiles):
        """Filters out those properties which were estimated for
        compounds which do not appear in the allowed `smiles` list.

        A property is only retained if every component of its substance
        has a smiles pattern contained in `allowed_smiles`.

        Parameters
        ----------
        allowed_smiles: str
            The smiles identifiers of the compounds to keep
            after filtering.
        """
        def filter_function(physical_property):

            return all(
                component.smiles in allowed_smiles
                for component in physical_property.substance.components
            )

        self.filter_by_function(filter_function)

    def to_pandas(self):
        """Converts a `PhysicalPropertyDataSet` to a `pandas.DataFrame` object
        with columns of

            - 'Temperature'
            - 'Pressure'
            - 'Phase'
            - 'Number Of Components'
            - 'Component 1'
            - 'Mole Fraction 1'
            - ...
            - 'Component N'
            - 'Mole Fraction N'
            - '<Property 1> Value'
            - '<Property 1> Uncertainty'
            - ...
            - '<Property N> Value'
            - '<Property N> Uncertainty'
            - `'Source'`

        where 'Component X' is a column containing the smiles representation of component X.

        Properties which were measured for the same substance at the same
        thermodynamic state are merged into a single row; where such properties
        disagree on a shared column (e.g. 'Source') the last one stored wins.
        The property columns are ordered alphabetically by property type name.

        Returns
        -------
        pandas.DataFrame or None
            The created data frame, or `None` if the data set does not
            contain any properties.

        Raises
        ------
        ValueError
            If the set contains substance entries but none of them has
            one or more components.
        """
        # Determine the maximum number of components for any
        # given measurements, and the types of property present.
        maximum_number_of_components = 0
        all_property_types = set()

        for substance_properties in self._properties.values():

            if not substance_properties:
                continue

            # All properties stored under the same identifier are taken to
            # share a substance, so only the first one is inspected.
            substance = substance_properties[0].substance

            maximum_number_of_components = max(
                maximum_number_of_components, substance.number_of_components
            )

            for physical_property in substance_properties:
                all_property_types.add(type(physical_property))

        # Make sure the maximum number of components is not zero.
        if maximum_number_of_components <= 0 and len(self._properties) > 0:

            raise ValueError('The data set did not contain any substances with '
                             'one or more components.')

        data_rows = []

        # Extract the data from the data set.
        for substance_properties in self._properties.values():

            # Rows are keyed by state so that different property types measured
            # at the same state end up side by side in one row.
            data_points_by_state = defaultdict(dict)

            for physical_property in substance_properties:

                property_name = type(physical_property).__name__

                # Extract the measured state.
                temperature = physical_property.thermodynamic_state.temperature.to(unit.kelvin)
                pressure = None

                if physical_property.thermodynamic_state.pressure is not None:
                    pressure = physical_property.thermodynamic_state.pressure.to(unit.kilopascal)

                phase = physical_property.phase

                # Extract the component data.
                number_of_components = physical_property.substance.number_of_components
                components = []

                for component in physical_property.substance.components:

                    amount = next(iter(physical_property.substance.get_amounts(component)))
                    assert isinstance(amount, Substance.MoleFraction)

                    components.append((component.smiles, amount.value))

                # Extract the value data as a string.
                value = None if physical_property.value is None else str(physical_property.value)
                uncertainty = None if physical_property.uncertainty is None else str(physical_property.uncertainty)

                # Extract the data source.
                source = None

                if isinstance(physical_property.source, MeasurementSource):

                    source = physical_property.source.reference

                    # Fall back to the DOI when no reference is available.
                    if source is None:
                        source = physical_property.source.doi

                elif isinstance(physical_property.source, CalculationSource):
                    source = physical_property.source.fidelity

                # Create the data row.
                # NOTE: an undefined pressure is stored as the string 'None'.
                data_row = {
                    'Temperature': str(temperature),
                    'Pressure': str(pressure),
                    'Phase': phase,
                    'Number Of Components': number_of_components
                }

                for index, (smiles, mole_fraction) in enumerate(components):
                    data_row[f'Component {index + 1}'] = smiles
                    data_row[f'Mole Fraction {index + 1}'] = mole_fraction

                data_row[f'{property_name} Value'] = value
                data_row[f'{property_name} Uncertainty'] = uncertainty

                data_row['Source'] = source

                data_points_by_state[physical_property.thermodynamic_state].update(data_row)

            data_rows.extend(data_points_by_state.values())

        # Set up the column headers.
        if len(data_rows) == 0:
            return None

        data_columns = [
            'Temperature',
            'Pressure',
            'Phase',
            'Number Of Components',
        ]

        for index in range(maximum_number_of_components):
            data_columns.append(f'Component {index + 1}')
            data_columns.append(f'Mole Fraction {index + 1}')

        # Sort by name so that the column order is reproducible; iterating
        # over the set of types directly has no guaranteed order.
        for property_type in sorted(all_property_types, key=lambda x: x.__name__):
            data_columns.append(f'{property_type.__name__} Value')
            data_columns.append(f'{property_type.__name__} Uncertainty')

        # Any key which is not listed in `columns` is dropped by pandas, so
        # the documented 'Source' column must be requested explicitly.
        data_columns.append('Source')

        data_frame = pandas.DataFrame(data_rows, columns=data_columns)
        return data_frame

    def __getstate__(self):
        """Returns the state of this object as a dictionary containing its
        'properties' and 'sources'."""
        return {
            'properties': self._properties,
            'sources': self._sources
        }

    def __setstate__(self, state):
        """Restores this object from a dictionary produced by `__getstate__`."""
        self._properties = state['properties']
        self._sources = state['sources']