"""The module contains curation components for converting one type of property (e.g.
density) into another (e.g excess molar volume)"""
import functools
import logging
from typing import Union
import pandas
from pydantic import Field, conint
from typing_extensions import Literal
from openff.evaluator.datasets.curation.components import (
CurationComponent,
CurationComponentSchema,
)
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class ConvertExcessDensityDataSchema(CurationComponentSchema):
    # Settings for the component which inter-converts binary mass densities and
    # excess molar volumes. Both precisions are non-negative integers
    # (``conint(ge=0)``) which are handed to ``round`` before pure and binary
    # state points are compared.
    #
    # NOTE: documented with comments rather than a class docstring so that the
    # class ``__doc__`` (and anything generated from it) is left untouched.

    # Literal tag naming this schema - presumably used to tell schema types
    # apart when (de)serialising; confirm against ``CurationComponentSchema``.
    type: Literal["ConvertExcessDensityDataSchema"] = "ConvertExcessDensityDataSchema"

    # Decimal places retained when matching temperatures (default 2).
    temperature_precision: conint(ge=0) = Field(
        default=2,
        description=(
            "The number of decimal places to compare temperatures (K) to within "
            "when attempting to identify compatible pure and binary data."
        ),
    )

    # Decimal places retained when matching pressures (default 1).
    pressure_precision: conint(ge=0) = Field(
        default=1,
        description=(
            "The number of decimal places to compare pressures (kPa) to within "
            "when attempting to identify compatible pure and binary data."
        ),
    )
class ConvertExcessDensityData(CurationComponent):
    """A component for converting binary mass density data to excess molar volume
    data and vice versa where pure density data measured for the components is
    available.

    Notes
    -----
    This component may result in duplicate data points being generated. It is
    recommended to apply the de-duplication filter after this component has been
    applied.
    """

    @classmethod
    @functools.lru_cache(500)
    def _molecular_weight(cls, smiles: str) -> float:
        """Returns the molecular weight of the molecule described by a SMILES
        pattern.

        Up to 500 results are cached so that a given SMILES pattern does not
        need to be re-parsed for every row of a data frame.

        Parameters
        ----------
        smiles
            The SMILES representation of the molecule.

        Returns
        -------
            The sum of the masses of every atom in the molecule.
        """
        # Imported inside the function so that the toolkit is only needed once
        # a conversion is actually performed.
        from openff.toolkit.topology import Molecule

        molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

        # Atom.mass is guaranteed to be in Daltons
        return sum(atom.mass.m for atom in molecule.atoms)

    @classmethod
    def _mole_fraction_weighted_masses(cls, data_set: pandas.DataFrame) -> tuple:
        """Returns the molecular weight of each component of a binary data
        point multiplied by the mole fraction of that component.

        Parameters
        ----------
        data_set
            A data frame containing `Component 1`, `Component 2`,
            `Mole Fraction 1` and `Mole Fraction 2` columns.

        Returns
        -------
            A tuple of two series, the first for component 1 and the second for
            component 2.
        """
        m_1 = data_set["Component 1"].apply(cls._molecular_weight)
        m_2 = data_set["Component 2"].apply(cls._molecular_weight)

        return m_1 * data_set["Mole Fraction 1"], m_2 * data_set["Mole Fraction 2"]

    @staticmethod
    def _combine_sources(data_set: pandas.DataFrame) -> pandas.Series:
        """Joins the source of a binary data point and the sources of the two
        pure data points it was paired with into a single ' + ' separated
        string per row."""
        return data_set[["Source", "Source_x", "Source_y"]].agg(" + ".join, axis=1)

    @classmethod
    def _find_overlapping_data_points(
        cls,
        pure_data_set: pandas.DataFrame,
        binary_data_set: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
    ) -> pandas.DataFrame:
        """Finds those binary data points for which there also exists pure
        data points for each component in the binary system.

        Parameters
        ----------
        pure_data_set
            The pure data set.
        binary_data_set
            The binary data set.
        schema
            The schema for this component.

        Returns
        -------
        pandas.DataFrame
            The data set containing the pure and binary data points
            measured for the same substances at the same state points.
            Columns taken from the pure measurement of the first component
            carry an `_x` suffix and those of the second component a `_y`
            suffix. An empty data frame is returned when either input is empty
            or the pure data set has no `Mole Fraction 1` column.
        """
        if len(pure_data_set) == 0 or len(binary_data_set) == 0:
            return pandas.DataFrame()

        # ``dropna`` returns new frames, so the rounding below is applied to
        # these local frames rather than to the objects passed in.
        pure_data_set = pure_data_set.dropna(axis=1, how="all")
        binary_data_set = binary_data_set.dropna(axis=1, how="all")

        # Round the floats which will be compared.
        precisions = {
            "Temperature (K)": schema.temperature_precision,
            "Pressure (kPa)": schema.pressure_precision,
        }

        for column, precision in precisions.items():
            pure_data_set[column] = pure_data_set[column].round(precision)
            binary_data_set[column] = binary_data_set[column].round(precision)

        # Only consider pure measurements which only have mole fractions defined
        if "Exact Amount 1" in pure_data_set:
            pure_data_set = pure_data_set[pure_data_set["Exact Amount 1"].isna()]

        if "Mole Fraction 1" not in pure_data_set:
            return pandas.DataFrame()

        pure_data_set = pure_data_set[pure_data_set["Mole Fraction 1"].notna()]

        # Retain only the minimally informative pure data columns.
        data_columns = [
            "Temperature (K)",
            "Pressure (kPa)",
            "Phase",
            "Component 1",
            "Density Value (g / ml)",
            "Source",
        ]

        if "Density Uncertainty (g / ml)" in pure_data_set:
            data_columns.append("Density Uncertainty (g / ml)")

        pure_data_set = pure_data_set[data_columns]

        # Pair every pure measurement with every pure measurement made at the
        # same state. The non-key columns of the two sides are distinguished by
        # the default `_x` / `_y` suffixes.
        pure_pairs = pandas.merge(
            pure_data_set,
            pure_data_set,
            how="inner",
            on=["Temperature (K)", "Pressure (kPa)", "Phase"],
        )

        state_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

        # Keep only the binary points whose two components match a pair of
        # pure measurements made at the same state.
        overlapping_set = pandas.merge(
            binary_data_set,
            pure_pairs,
            how="inner",
            left_on=state_columns + ["Component 1", "Component 2"],
            right_on=state_columns + ["Component 1_x", "Component 1_y"],
            suffixes=("", ""),
        )

        return overlapping_set

    @classmethod
    def _convert_density_to_v_excess(
        cls, density_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both binary mass densities
        and pure mass densities into one which contains excess molar volume
        measurements.

        The excess molar volume is computed as
        ``(x_1 M_1 + x_2 M_2) / rho_binary - x_1 M_1 / rho_1 - x_2 M_2 / rho_2``.

        Parameters
        ----------
        density_data_set
            The data frame containing both pure and binary
            density measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the excess molar volume measurements.
        """
        m_1_x_1, m_2_x_2 = cls._mole_fraction_weighted_masses(density_data_set)

        v_excess = (
            (m_1_x_1 + m_2_x_2) / density_data_set["Density Value (g / ml)"]
            - m_1_x_1 / density_data_set["Density Value (g / ml)_x"]
            - m_2_x_2 / density_data_set["Density Value (g / ml)_y"]
        )

        source = cls._combine_sources(density_data_set)

        # Strip the pure data columns and the quantities which are replaced.
        columns_to_drop = [
            column for column in density_data_set if column.endswith(("_x", "_y"))
        ]
        columns_to_drop.extend(["Density Value (g / ml)", "Source"])

        if "Density Uncertainty (g / ml)" in density_data_set:
            columns_to_drop.append("Density Uncertainty (g / ml)")

        # Add the new values to a new data frame.
        v_excess_data_set = density_data_set.drop(columns=columns_to_drop).copy()

        v_excess_data_set.insert(
            v_excess_data_set.shape[1],
            "ExcessMolarVolume Value (cm ** 3 / mol)",
            v_excess,
        )
        v_excess_data_set.insert(v_excess_data_set.shape[1], "Source", source)

        return v_excess_data_set

    @classmethod
    def _convert_v_excess_to_density(
        cls, v_excess_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both excess molar volumes
        and pure mass densities into one which contains binary mass density
        measurements.

        The binary density is computed as
        ``(x_1 M_1 + x_2 M_2) / (V_excess + x_1 M_1 / rho_1 + x_2 M_2 / rho_2)``.

        Parameters
        ----------
        v_excess_data_set
            The data frame containing both pure density and excess molar
            volume measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the binary mass density measurements.
        """
        m_1_x_1, m_2_x_2 = cls._mole_fraction_weighted_masses(v_excess_data_set)

        v_excess = v_excess_data_set["ExcessMolarVolume Value (cm ** 3 / mol)"]

        denominator = (
            v_excess
            + m_1_x_1 / v_excess_data_set["Density Value (g / ml)_x"]
            + m_2_x_2 / v_excess_data_set["Density Value (g / ml)_y"]
        )
        rho_binary = (m_1_x_1 + m_2_x_2) / denominator

        source = cls._combine_sources(v_excess_data_set)

        # Strip the pure data columns and the quantities which are replaced.
        columns_to_drop = [
            column for column in v_excess_data_set if column.endswith(("_x", "_y"))
        ]
        columns_to_drop.extend(["ExcessMolarVolume Value (cm ** 3 / mol)", "Source"])

        if "ExcessMolarVolume Uncertainty (cm ** 3 / mol)" in v_excess_data_set:
            columns_to_drop.append("ExcessMolarVolume Uncertainty (cm ** 3 / mol)")

        # Add the new values to a new data frame. Each new column is inserted
        # immediately before the last remaining column.
        density_data_set = v_excess_data_set.drop(columns=columns_to_drop).copy()

        density_data_set.insert(
            density_data_set.shape[1] - 1, "Density Value (g / ml)", rho_binary
        )
        density_data_set.insert(density_data_set.shape[1] - 1, "Source", source)

        return density_data_set

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Appends to a data frame the excess molar volumes computed from its
        binary densities, and the binary densities computed from its excess
        molar volumes, wherever matching pure densities are available.

        Parameters
        ----------
        data_frame
            The data frame to convert. It is returned unchanged when it is
            empty, has no density column, has no pure densities, or when no
            binary point overlaps with pure data for both of its components.
        schema
            The schema for this component.
        n_processes
            Not used by this component.

        Returns
        -------
            The original rows followed by any converted rows, re-indexed from
            zero.
        """
        if len(data_frame) == 0:
            return data_frame

        # Check to make sure the data frame contains at least a
        # density column which may store pure densities.
        if "Density Value (g / ml)" not in data_frame:
            return data_frame

        has_density = data_frame["Density Value (g / ml)"].notna()
        is_pure = data_frame["N Components"] == 1
        is_binary = data_frame["N Components"] == 2

        # Separate out the data sets of interest
        pure_density_data = data_frame[has_density & is_pure]
        pure_density_data = pure_density_data.dropna(axis=1, how="all")

        # Exit early if no pure densities can be found.
        if len(pure_density_data) == 0:
            return data_frame

        # Add the pure data to the binary data sets to make conversion easier.
        binary_density_data = data_frame[has_density & is_binary]
        binary_density_data = binary_density_data.dropna(axis=1, how="all")
        binary_density_data = cls._find_overlapping_data_points(
            pure_density_data, binary_density_data, schema
        )

        v_excess_data = pandas.DataFrame()

        if "ExcessMolarVolume Value (cm ** 3 / mol)" in data_frame:
            has_v_excess = data_frame["ExcessMolarVolume Value (cm ** 3 / mol)"].notna()

            v_excess_data = data_frame[has_v_excess & is_binary]
            v_excess_data = v_excess_data.dropna(axis=1, how="all")
            v_excess_data = cls._find_overlapping_data_points(
                pure_density_data, v_excess_data, schema
            )

        # Inter-convert the two sets
        converted_frames = []

        if len(binary_density_data) > 0:
            converted_frames.append(
                cls._convert_density_to_v_excess(binary_density_data)
            )
        if len(v_excess_data) > 0:
            converted_frames.append(cls._convert_v_excess_to_density(v_excess_data))

        # Nothing could be converted, so hand back the input untouched.
        if len(converted_frames) == 0:
            return data_frame

        return pandas.concat(
            [data_frame, *converted_frames],
            ignore_index=True,
            sort=False,
        )
# Union of the conversion component schemas; only one schema is listed here.
ConversionComponentSchema = Union[ConvertExcessDensityDataSchema]