Source code for openff.evaluator.datasets.curation.components.conversion

"""The module contains curation components for converting one type of property (e.g.
density) into another (e.g excess molar volume)"""

import functools
import logging
from typing import Union

import pandas
from pydantic import Field, conint
from typing_extensions import Literal

from openff.evaluator.datasets.curation.components import (
    CurationComponent,
    CurationComponentSchema,
)

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class ConvertExcessDensityDataSchema(CurationComponentSchema):
    """The schema accepted by the ``ConvertExcessDensityData`` component.

    The two precision fields set the number of decimal places which temperatures
    and pressures are rounded to before pure and binary data points are matched
    on their state point.
    """

    # Discriminator which identifies this schema by name.
    type: Literal["ConvertExcessDensityDataSchema"] = "ConvertExcessDensityDataSchema"

    # Non-negative number of decimal places, defaulting to two.
    temperature_precision: conint(ge=0) = Field(
        2,
        description="The number of decimal places to compare temperatures (K) to "
        "within when attempting to identify compatible pure and binary data.",
    )
    # Non-negative number of decimal places, defaulting to one.
    pressure_precision: conint(ge=0) = Field(
        1,
        description="The number of decimal places to compare pressures (kPa) to "
        "within when attempting to identify compatible pure and binary data.",
    )


class ConvertExcessDensityData(CurationComponent):
    """A component for converting binary mass density data to excess molar volume
    data and vice versa where pure density data measured for the components is
    available.

    Notes
    -----
    This protocol may result in duplicate data points being generated. It is
    recommended to apply the de-duplication filter after this component has been
    applied.
    """

    @classmethod
    @functools.lru_cache(500)
    def _molecular_weight(cls, smiles):
        """Returns the molecular weight of the molecule described by a SMILES
        pattern.

        Results are memoised by ``functools.lru_cache`` (at most 500 entries) so
        that each distinct pattern is only parsed once.

        Parameters
        ----------
        smiles
            The SMILES pattern of the molecule of interest. Undefined
            stereochemistry is permitted when the pattern is parsed.

        Returns
        -------
            The sum of the mass magnitudes of every atom in the molecule.
        """
        # Imported lazily so the toolkit is only loaded when a weight is needed.
        from openff.toolkit.topology import Molecule

        atoms = Molecule.from_smiles(smiles, allow_undefined_stereo=True).atoms

        total_mass = 0

        for atom in atoms:
            # Atom.mass is guaranteed to be in Daltons
            total_mass += atom.mass.m

        return total_mass

    @classmethod
    def _find_overlapping_data_points(
        cls,
        pure_data_set: pandas.DataFrame,
        binary_data_set: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
    ):
        """Pairs each binary data point with the pure density data points of both
        of its components which were measured at the same state point.

        Temperatures and pressures are rounded to the precisions defined by the
        schema before being compared, and the rounded values are the ones which
        appear in the returned data frame.

        Parameters
        ----------
        pure_data_set
            The pure data set.
        binary_data_set
            The binary data set.
        schema
            The schema for this component.

        Returns
        -------
        pandas.DataFrame
            The binary data points joined with the matching pure data. Pure
            columns carry a ``_x`` suffix for the first component of the binary
            system and a ``_y`` suffix for the second. An empty data frame is
            returned when either input is empty, or when the pure data set
            has no "Mole Fraction 1" column.
        """

        if len(pure_data_set) == 0 or len(binary_data_set) == 0:
            return pandas.DataFrame()

        # The columns which define the state point of a measurement.
        state_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

        # The number of decimal places each float state variable is compared to.
        precisions = {
            "Temperature (K)": schema.temperature_precision,
            "Pressure (kPa)": schema.pressure_precision,
        }

        pure = pure_data_set.dropna(axis=1, how="all")
        binary = binary_data_set.dropna(axis=1, how="all")

        # Round the floats which will be compared.
        for frame in (pure, binary):
            for column, n_decimals in precisions.items():
                frame[column] = frame[column].round(n_decimals)

        # Only consider pure measurements which only have mole fractions defined
        if "Exact Amount 1" in pure:
            pure = pure[pure["Exact Amount 1"].isna()]

        if "Mole Fraction 1" not in pure:
            return pandas.DataFrame()

        pure = pure[pure["Mole Fraction 1"].notna()]

        # Retain only the minimally informative pure data columns.
        retained_columns = [
            *state_columns,
            "Component 1",
            "Density Value (g / ml)",
            "Source",
        ]

        if "Density Uncertainty (g / ml)" in pure:
            retained_columns.append("Density Uncertainty (g / ml)")

        pure = pure[retained_columns]

        # Join the pure data with itself to enumerate every pair of pure
        # measurements made at the same state point. The default merge suffixes
        # label the first member of each pair with `_x` and the second with `_y`.
        pure_pairs = pure.merge(pure, how="inner", on=state_columns)

        # Keep the binary points whose two components match a pure pair. The
        # empty suffixes leave the binary column names untouched.
        return binary.merge(
            pure_pairs,
            how="inner",
            left_on=[*state_columns, "Component 1", "Component 2"],
            right_on=[*state_columns, "Component 1_x", "Component 1_y"],
            suffixes=("", ""),
        )

    @classmethod
    def _convert_density_to_v_excess(
        cls, density_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Builds a data frame of excess molar volumes from binary mass densities
        which have been paired with the pure densities of both components.

        Each excess molar volume is evaluated as::

            V_E = (x_1 M_1 + x_2 M_2) / rho - x_1 M_1 / rho_1 - x_2 M_2 / rho_2

        where ``x_i`` and ``M_i`` are the mole fraction and molecular weight of
        component ``i``, ``rho`` is the binary density, and ``rho_1`` and
        ``rho_2`` are the pure densities stored in the ``_x`` and ``_y`` suffixed
        columns respectively.

        Parameters
        ----------
        density_data_set
            The data frame containing both pure and binary
            density measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the excess molar volume measurements.
            The density, density uncertainty and ``_x`` / ``_y`` suffixed columns
            of the input are removed, and the excess molar volume and a combined
            source are appended as the final two columns.
        """

        # The mole fraction weighted molecular weight of each component.
        weighted_mass_1 = (
            density_data_set["Component 1"].apply(cls._molecular_weight)
            * density_data_set["Mole Fraction 1"]
        )
        weighted_mass_2 = (
            density_data_set["Component 2"].apply(cls._molecular_weight)
            * density_data_set["Mole Fraction 2"]
        )

        mixture_volume = (weighted_mass_1 + weighted_mass_2) / density_data_set[
            "Density Value (g / ml)"
        ]
        pure_volume_1 = weighted_mass_1 / density_data_set["Density Value (g / ml)_x"]
        pure_volume_2 = weighted_mass_2 / density_data_set["Density Value (g / ml)_y"]

        v_excess = mixture_volume - pure_volume_1 - pure_volume_2

        # Record the binary and both pure sources which the new value derives from.
        combined_source = density_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Strip the pure data columns and the columns describing the binary
        # density, leaving only those which describe the measured system.
        obsolete_columns = [
            column for column in density_data_set if column.endswith(("_x", "_y"))
        ]
        obsolete_columns.extend(["Density Value (g / ml)", "Source"])

        if "Density Uncertainty (g / ml)" in density_data_set:
            obsolete_columns.append("Density Uncertainty (g / ml)")

        v_excess_data_set = density_data_set.drop(columns=obsolete_columns).copy()

        # Add the new values to the new data frame, each as the last column.
        for header, values in (
            ("ExcessMolarVolume Value (cm ** 3 / mol)", v_excess),
            ("Source", combined_source),
        ):
            v_excess_data_set.insert(len(v_excess_data_set.columns), header, values)

        return v_excess_data_set

    @classmethod
    def _convert_v_excess_to_density(
        cls, v_excess_data_set: pandas.DataFrame
    ) -> pandas.DataFrame:
        """Builds a data frame of binary mass densities from excess molar volumes
        which have been paired with the pure densities of both components.

        Each binary density is evaluated as::

            rho = (x_1 M_1 + x_2 M_2) / (V_E + x_1 M_1 / rho_1 + x_2 M_2 / rho_2)

        where ``x_i`` and ``M_i`` are the mole fraction and molecular weight of
        component ``i``, ``V_E`` is the excess molar volume, and ``rho_1`` and
        ``rho_2`` are the pure densities stored in the ``_x`` and ``_y`` suffixed
        columns respectively.

        Parameters
        ----------
        v_excess_data_set
            The data frame containing both pure density and excess molar
            volume measurements. This should be generated using the
            `_find_overlapping_data_points` function.

        Returns
        -------
            A data frame which contains the binary mass density measurements.
            The excess molar volume, its uncertainty and the ``_x`` / ``_y``
            suffixed columns of the input are removed, and the density and a
            combined source are inserted immediately before the last of the
            remaining columns.
        """

        # The mole fraction weighted molecular weight of each component.
        weighted_mass_1 = (
            v_excess_data_set["Component 1"].apply(cls._molecular_weight)
            * v_excess_data_set["Mole Fraction 1"]
        )
        weighted_mass_2 = (
            v_excess_data_set["Component 2"].apply(cls._molecular_weight)
            * v_excess_data_set["Mole Fraction 2"]
        )

        v_excess = v_excess_data_set["ExcessMolarVolume Value (cm ** 3 / mol)"]

        pure_volume_1 = weighted_mass_1 / v_excess_data_set["Density Value (g / ml)_x"]
        pure_volume_2 = weighted_mass_2 / v_excess_data_set["Density Value (g / ml)_y"]

        # The molar volume of the mixture is the ideal volume plus the excess.
        mixture_volume = v_excess + pure_volume_1 + pure_volume_2

        rho_binary = (weighted_mass_1 + weighted_mass_2) / mixture_volume

        # Record the binary and both pure sources which the new value derives from.
        combined_source = v_excess_data_set[["Source", "Source_x", "Source_y"]].agg(
            " + ".join, axis=1
        )

        # Strip the pure data columns and the columns describing the excess
        # molar volume, leaving only those which describe the measured system.
        obsolete_columns = [
            column for column in v_excess_data_set if column.endswith(("_x", "_y"))
        ]
        obsolete_columns.extend(["ExcessMolarVolume Value (cm ** 3 / mol)", "Source"])

        if "ExcessMolarVolume Uncertainty (cm ** 3 / mol)" in v_excess_data_set:
            obsolete_columns.append("ExcessMolarVolume Uncertainty (cm ** 3 / mol)")

        density_data_set = v_excess_data_set.drop(columns=obsolete_columns).copy()

        # Add the new values to the new data frame. The position is re-evaluated
        # for each insertion so that each lands just before the final column.
        for header, values in (
            ("Density Value (g / ml)", rho_binary),
            ("Source", combined_source),
        ):
            density_data_set.insert(len(density_data_set.columns) - 1, header, values)

        return density_data_set

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ConvertExcessDensityDataSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Appends to a data frame the excess molar volumes computed from its
        binary mass densities, and the binary mass densities computed from its
        excess molar volumes.

        A binary data point is only converted when pure density data points for
        both of its components are found at the same state point.

        Parameters
        ----------
        data_frame
            The data frame to convert. Beyond the density column it is expected to
            contain an "N Components" column.
        schema
            The schema for this component, which defines the precision that state
            points are compared to.
        n_processes
            Not used by this component.

        Returns
        -------
            The original data frame with any converted data points appended to
            the end. The input is returned unmodified when it is empty, contains
            no pure density data, or contains nothing which can be converted.
        """
        density_header = "Density Value (g / ml)"
        v_excess_header = "ExcessMolarVolume Value (cm ** 3 / mol)"

        # Both conversions require pure densities, so there is nothing to do if
        # the frame is empty or has no column which could store them.
        if len(data_frame) == 0 or density_header not in data_frame:
            return data_frame

        def _select(value_header: str, n_components: int) -> pandas.DataFrame:
            """Returns the rows which define a value for a property of systems
            with a given number of components, without any all-empty columns."""
            rows = data_frame[
                (data_frame[value_header].notna())
                & (data_frame["N Components"] == n_components)
            ]
            return rows.dropna(axis=1, how="all")

        # Separate out the data sets of interest
        pure_density_data = _select(density_header, 1)

        # Exit early if no pure densities can be found.
        if len(pure_density_data) == 0:
            return data_frame

        # Add the pure data to the binary data sets to make conversion easier.
        binary_density_data = cls._find_overlapping_data_points(
            pure_density_data, _select(density_header, 2), schema
        )

        v_excess_data = pandas.DataFrame()

        if v_excess_header in data_frame:
            v_excess_data = cls._find_overlapping_data_points(
                pure_density_data, _select(v_excess_header, 2), schema
            )

        # Inter-convert the two sets
        data_to_concat = [data_frame]

        if len(binary_density_data) > 0:
            data_to_concat.append(cls._convert_density_to_v_excess(binary_density_data))

        if len(v_excess_data) > 0:
            data_to_concat.append(cls._convert_v_excess_to_density(v_excess_data))

        # Nothing could be converted, so hand back the input untouched.
        if len(data_to_concat) == 1:
            return data_frame

        return pandas.concat(data_to_concat, ignore_index=True, sort=False)


# The union of every conversion component schema defined in this module. Only one
# schema is currently defined, and ``typing.Union`` with a single argument
# evaluates to that argument itself.
ConversionComponentSchema = Union[ConvertExcessDensityDataSchema]