Source code for openff.evaluator.datasets.curation.components.conversion

"""The module contains curation components for converting one type of property (e.g.
density) into another (e.g. excess molar volume)"""

import functools
import logging
from typing import Union

import pandas
from pydantic import Field, conint
from typing_extensions import Literal

from openff.evaluator.datasets.curation.components import (
    CurationComponent,
    CurationComponentSchema,
)

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class ConvertExcessDensityDataSchema(CurationComponentSchema):
    """The settings of the `ConvertExcessDensityData` component, namely the
    precisions to which temperatures and pressures are compared when pairing up
    pure and binary data points.
    """

    type: Literal["ConvertExcessDensityDataSchema"] = "ConvertExcessDensityDataSchema"

    temperature_precision: conint(ge=0) = Field(
        default=2,
        description=(
            "The number of decimal places to compare temperatures (K) to within "
            "when attempting to identify compatible pure and binary data."
        ),
    )
    pressure_precision: conint(ge=0) = Field(
        default=1,
        description=(
            "The number of decimal places to compare pressures (kPa) to within "
            "when attempting to identify compatible pure and binary data."
        ),
    )

class ConvertExcessDensityData(CurationComponent):
    """A component for converting binary mass density data to excess molar volume
    data and vice versa where pure density data measured for the components is
    available.

    Notes
    -----
    This protocol may result in duplicate data points being generated. It is
    recommended to apply the de-duplication filter after this component has been
    applied.
    """

    @classmethod
    @functools.lru_cache(500)
    def _molecular_weight(cls, smiles: str):
        """Returns the molecular weight of the molecule described by a SMILES
        pattern, computed as the sum of the masses of its atoms.

        Results are cached (up to 500 entries) so that a SMILES pattern which has
        been seen recently does not need to be parsed again.

        Parameters
        ----------
        smiles
            The SMILES pattern of the molecule. Undefined stereochemistry is
            permitted.

        Returns
        -------
            The sum of the atomic masses of the molecule.
        """
        # Imported lazily so that the toolkit is only required when a conversion
        # is actually performed.
        from openff.toolkit.topology import Molecule

        molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

        # Atom.mass is guaranteed to be in Daltons
        molecular_weight = sum(atom.mass.m for atom in molecule.atoms)

        return molecular_weight

    @classmethod
    def _round_state_points(
        cls, data_set: pandas.DataFrame, schema: ConvertExcessDensityDataSchema
    ) -> pandas.DataFrame:
        """Returns a new data frame in which the temperature and pressure columns
        have been rounded to the precisions defined by the schema.

        ``DataFrame.assign`` is used rather than in-place column assignment so that
        the frame which was passed in (which may have been derived from a slice of
        a larger frame) is never written to.

        Parameters
        ----------
        data_set
            The data set to round. It must contain the "Temperature (K)" and
            "Pressure (kPa)" columns.
        schema
            The schema for this component.

        Returns
        -------
        pandas.DataFrame
            A copy of the data set with rounded temperatures and pressures.
        """
        return data_set.assign(
            **{
                "Temperature (K)": data_set["Temperature (K)"].round(
                    schema.temperature_precision
                ),
                "Pressure (kPa)": data_set["Pressure (kPa)"].round(
                    schema.pressure_precision
                ),
            }
        )

    @classmethod
    def _find_overlapping_data_points(
        cls,
        pure_data_set: pandas.DataFrame,
        binary_data_set: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
    ) -> pandas.DataFrame:
        """Finds those binary data points for which there also exists pure data
        points for each component in the binary system.

        Parameters
        ----------
        pure_data_set
            The pure data set.
        binary_data_set
            The binary data set.
        schema
            The schema for this component.

        Returns
        -------
        pandas.DataFrame
            The data set containing the pure and binary data points measured for
            the same substances at the same state points. The columns which
            originate from the pure data measured for the first and second
            components carry the suffixes ``_x`` and ``_y`` respectively. An empty
            data frame is returned when no pairing is possible.
        """

        if len(pure_data_set) == 0 or len(binary_data_set) == 0:
            return pandas.DataFrame()

        # Round the floats which will be compared.
        pure_data_set = cls._round_state_points(
            pure_data_set.dropna(axis=1, how="all"), schema
        )
        binary_data_set = cls._round_state_points(
            binary_data_set.dropna(axis=1, how="all"), schema
        )

        # Only consider pure measurements which only have mole fractions defined
        if "Exact Amount 1" in pure_data_set:
            pure_data_set = pure_data_set[pure_data_set["Exact Amount 1"].isna()]
        if "Mole Fraction 1" not in pure_data_set:
            return pandas.DataFrame()

        pure_data_set = pure_data_set[pure_data_set["Mole Fraction 1"].notna()]

        # Retain only the minimally informative pure data columns.
        data_columns = [
            "Temperature (K)",
            "Pressure (kPa)",
            "Phase",
            "Component 1",
            "Density Value (g / ml)",
            "Source",
        ]

        if "Density Uncertainty (g / ml)" in pure_data_set:
            data_columns.append("Density Uncertainty (g / ml)")

        pure_data_set = pure_data_set[data_columns]

        # Pair every pure measurement with every pure measurement made at the same
        # state point. The left hand copy gains the `_x` suffix and the right hand
        # copy the `_y` suffix.
        pure_data_set = pandas.merge(
            pure_data_set,
            pure_data_set,
            how="inner",
            on=["Temperature (K)", "Pressure (kPa)", "Phase"],
        )

        # Match the `_x` pure component against the first binary component and the
        # `_y` pure component against the second. The pure columns are already
        # suffixed, so no further suffixes are requested.
        overlapping_set = pandas.merge(
            binary_data_set,
            pure_data_set,
            how="inner",
            left_on=[
                "Temperature (K)",
                "Pressure (kPa)",
                "Phase",
                "Component 1",
                "Component 2",
            ],
            right_on=[
                "Temperature (K)",
                "Pressure (kPa)",
                "Phase",
                "Component 1_x",
                "Component 1_y",
            ],
            suffixes=("", ""),
        )

        return overlapping_set

    @classmethod
    def _convert_density_to_v_excess(
        cls, density_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both binary mass densities and
        pure mass densities into one which contains excess molar volume
        measurements.

        The excess molar volume is computed as

            (x_1 M_1 + x_2 M_2) / rho_binary - x_1 M_1 / rho_1 - x_2 M_2 / rho_2

        where ``x_i`` is a mole fraction, ``M_i`` a molecular weight and ``rho`` a
        mass density.

        Parameters
        ----------
        density_data_set
            The data frame containing both pure and binary density measurements.
            This should be generated using the `_find_overlapping_data_points`
            function.

        Returns
        -------
            A data frame which contains the excess molar volume measurements.
        """

        m_1 = density_data_set["Component 1"].apply(cls._molecular_weight)
        m_1_x_1 = m_1 * density_data_set["Mole Fraction 1"]

        m_2 = density_data_set["Component 2"].apply(cls._molecular_weight)
        m_2_x_2 = m_2 * density_data_set["Mole Fraction 2"]

        v_excess = (
            (m_1_x_1 + m_2_x_2) / density_data_set["Density Value (g / ml)"]
            - m_1_x_1 / density_data_set["Density Value (g / ml)_x"]
            - m_2_x_2 / density_data_set["Density Value (g / ml)_y"]
        )

        # The converted value is derived from three measurements, so all three of
        # their sources are recorded.
        source = density_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Add the new values to a new data frame.
        columns_to_drop = [
            x for x in density_data_set if x.endswith("_x") or x.endswith("_y")
        ]
        columns_to_drop.append("Density Value (g / ml)")
        columns_to_drop.append("Source")

        # The binary density uncertainty is dropped along with the value which it
        # describes.
        if "Density Uncertainty (g / ml)" in density_data_set:
            columns_to_drop.append("Density Uncertainty (g / ml)")

        v_excess_data_set = density_data_set.drop(columns=columns_to_drop).copy()

        v_excess_data_set.insert(
            v_excess_data_set.shape[1],
            "ExcessMolarVolume Value (cm ** 3 / mol)",
            v_excess,
        )
        v_excess_data_set.insert(v_excess_data_set.shape[1], "Source", source)

        return v_excess_data_set

    @classmethod
    def _convert_v_excess_to_density(
        cls, v_excess_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Converts a pandas data frame containing both excess molar volumes and
        pure mass densities into one which contains binary mass density
        measurements.

        The binary mass density is computed as

            (x_1 M_1 + x_2 M_2) / (V_excess + x_1 M_1 / rho_1 + x_2 M_2 / rho_2)

        where ``x_i`` is a mole fraction, ``M_i`` a molecular weight and ``rho_i``
        the mass density of a pure component.

        Parameters
        ----------
        v_excess_data_set
            The data frame containing both pure density and excess molar volume
            measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the binary mass density measurements.
        """

        m_1 = v_excess_data_set["Component 1"].apply(cls._molecular_weight)
        m_1_x_1 = m_1 * v_excess_data_set["Mole Fraction 1"]

        m_2 = v_excess_data_set["Component 2"].apply(cls._molecular_weight)
        m_2_x_2 = m_2 * v_excess_data_set["Mole Fraction 2"]

        v_excess = v_excess_data_set["ExcessMolarVolume Value (cm ** 3 / mol)"]

        denominator = (
            v_excess
            + m_1_x_1 / v_excess_data_set["Density Value (g / ml)_x"]
            + m_2_x_2 / v_excess_data_set["Density Value (g / ml)_y"]
        )

        rho_binary = (m_1_x_1 + m_2_x_2) / denominator

        # The converted value is derived from three measurements, so all three of
        # their sources are recorded.
        source = v_excess_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Add the new values to a new data frame.
        columns_to_drop = [
            x for x in v_excess_data_set if x.endswith("_x") or x.endswith("_y")
        ]
        columns_to_drop.append("ExcessMolarVolume Value (cm ** 3 / mol)")
        columns_to_drop.append("Source")

        # The excess molar volume uncertainty is dropped along with the value which
        # it describes.
        if "ExcessMolarVolume Uncertainty (cm ** 3 / mol)" in v_excess_data_set:
            columns_to_drop.append("ExcessMolarVolume Uncertainty (cm ** 3 / mol)")

        density_data_set = v_excess_data_set.drop(columns=columns_to_drop).copy()

        # Note that each new column is inserted immediately before the column which
        # is last at the time of insertion, rather than being appended.
        density_data_set.insert(
            density_data_set.shape[1] - 1, "Density Value (g / ml)", rho_binary
        )
        density_data_set.insert(density_data_set.shape[1] - 1, "Source", source)

        return density_data_set

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Appends to a data frame the excess molar volumes which can be derived
        from its binary densities, and the binary densities which can be derived
        from its excess molar volumes, wherever pure densities are available for
        both components at the same state point.

        Parameters
        ----------
        data_frame
            The data frame to apply the component to.
        schema
            The schema for this component.
        n_processes
            Not used by this component.

        Returns
        -------
        pandas.DataFrame
            The original rows followed by any converted rows. The data frame which
            was passed in is returned as-is when no conversion is possible.
        """

        if len(data_frame) == 0:
            return data_frame

        # Check to make sure the data frame contains at least a
        # density column which may store pure densities.
        if "Density Value (g / ml)" not in data_frame:
            return data_frame

        # Separate out the data sets of interest
        pure_density_data = data_frame[
            (data_frame["Density Value (g / ml)"].notna())
            & (data_frame["N Components"] == 1)
        ]
        pure_density_data = pure_density_data.dropna(axis=1, how="all")

        # Exit early if no pure densities can be found.
        if len(pure_density_data) == 0:
            return data_frame

        # Add the pure data to the binary data sets to make conversion easier.
        binary_density_data = data_frame[
            (data_frame["Density Value (g / ml)"].notna())
            & (data_frame["N Components"] == 2)
        ]
        binary_density_data = binary_density_data.dropna(axis=1, how="all")
        binary_density_data = cls._find_overlapping_data_points(
            pure_density_data, binary_density_data, schema
        )

        v_excess_data = pandas.DataFrame()

        if "ExcessMolarVolume Value (cm ** 3 / mol)" in data_frame:
            v_excess_data = data_frame[
                (data_frame["ExcessMolarVolume Value (cm ** 3 / mol)"].notna())
                & (data_frame["N Components"] == 2)
            ]
            v_excess_data = v_excess_data.dropna(axis=1, how="all")
            v_excess_data = cls._find_overlapping_data_points(
                pure_density_data, v_excess_data, schema
            )

        if len(binary_density_data) == 0 and len(v_excess_data) == 0:
            return data_frame

        # Inter-convert the two sets. At least one of them is non-empty at this
        # point, so there is always converted data to append to the original.
        data_to_concat = [data_frame]

        if len(binary_density_data) > 0:
            data_to_concat.append(
                cls._convert_density_to_v_excess(binary_density_data)
            )

        if len(v_excess_data) > 0:
            data_to_concat.append(cls._convert_v_excess_to_density(v_excess_data))

        return pandas.concat(data_to_concat, ignore_index=True, sort=False)

# Type alias grouping the schemas of the conversion components. At present the
# union has a single member, `ConvertExcessDensityDataSchema`.
ConversionComponentSchema = Union[ConvertExcessDensityDataSchema]