Source code for propertyestimator.datasets.thermoml

"""
An API for importing a ThermoML archive.
"""
import copy
import logging
import re
import traceback
from enum import Enum, unique
from urllib.error import HTTPError
from urllib.request import urlopen
from xml.etree import ElementTree

import numpy as np

from propertyestimator import unit
from propertyestimator.properties import MeasurementSource, PropertyPhase
from propertyestimator.substances import Substance
from propertyestimator.thermodynamics import ThermodynamicState
from propertyestimator.utils.openmm import openmm_quantity_to_pint

from .datasets import PhysicalPropertyDataSet
from .plugins import registered_thermoml_properties


def unit_from_thermoml_string(full_string):
    """Extract the unit from a ThermoML property name.

    Parameters
    ----------
    full_string: str
        The string to convert to a Unit object.

    Returns
    ----------
    unit.Unit
        The parsed unit.
    """

    full_string_split = full_string.split(",")

    # Extract the unit portion of the string
    unit_string = full_string_split[1] if len(full_string_split) > 1 else ""
    # Convert symbols like dm3 to dm**3
    unit_string = re.sub(r"([a-z])([0-9]+)", r"\1**\2", unit_string.strip())

    return unit.Unit(unit_string)


def phase_from_thermoml_string(string):
    """Converts a ThermoML string to a PropertyPhase

        Parameters
        ----------
        string: str
            The string to convert to a PropertyPhase

        Returns
        ----------
        PropertyPhase
            The converted PropertyPhase
        """
    phase_string = string.lower().strip()
    phase = PropertyPhase.Undefined

    if phase_string == "liquid" or phase_string.find("solution") >= 0:
        phase = PropertyPhase.Liquid
    elif phase_string.find("crystal") >= 0 and not phase_string.find("liquid") >= 0:
        phase = PropertyPhase.Solid
    elif phase_string.find("gas") >= 0:
        phase = PropertyPhase.Gas

    return phase


@unique
class _ConstraintType(Enum):
    """An enumeration of the supported types of ThermoML constraint
    types.
    """

    Undefined = "Undefined"
    Temperature = "Temperature, K"
    Pressure = "Pressure, kPa"
    ComponentMoleFraction = "Mole fraction"
    ComponentMassFraction = "Mass fraction"
    ComponentMolality = "Molality, mol/kg"
    SolventMoleFraction = "Solvent: Mole fraction"
    SolventMassFraction = "Solvent: Mass fraction"
    SolventMolality = "Solvent: Molality, mol/kg"

    @staticmethod
    def from_node(node):
        """Converts either a ConstraintType or VariableType xml node to a _ConstraintType.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to convert.

        Returns
        ----------
        _ConstraintType
            The converted constraint type.
        """

        try:
            constraint_type = _ConstraintType(node.text)
        except (KeyError, ValueError):
            constraint_type = _ConstraintType.Undefined

        if constraint_type == _ConstraintType.Undefined:
            logging.debug(f"{node.tag}->{node.text} is an unsupported constraint type.")

        return constraint_type

    def is_composition_constraint(self):
        """Checks whether the purpose of this constraint is
        to constrain the substance composition.

        Returns
        -------
        bool
            True if the constraint type is either a

            - `_ConstraintType.ComponentMoleFraction`
            - `_ConstraintType.ComponentMassFraction`
            - `_ConstraintType.ComponentMolality`
            - `_ConstraintType.SolventMoleFraction`
            - `_ConstraintType.SolventMassFraction`
            - `_ConstraintType.SolventMolality`
        """
        return (
            self == _ConstraintType.ComponentMoleFraction
            or self == _ConstraintType.ComponentMassFraction
            or self == _ConstraintType.ComponentMolality
            or self == _ConstraintType.SolventMoleFraction
            or self == _ConstraintType.SolventMassFraction
            or self == _ConstraintType.SolventMolality
        )


class _Constraint:
    """A wrapper around a ThermoML `Constraint` node. A constraint
    in ThermoML encompasses such constructs as temperature, pressure
    or composition at which a measurement was recorded.
    """

    def __init__(self):

        self.type = _ConstraintType.Undefined
        self.value = 0.0

        self.solvents = []

        # Describes which compound the variable acts upon.
        self.compound_index = None

    @classmethod
    def from_node(cls, constraint_node, namespace):
        """Creates a _Constraint from an xml node.

        Parameters
        ----------
        constraint_node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        ----------
        _Constraint, optional
            The extracted constraint if the constraint type is supported,
            otherwise `None`.
        """
        # Extract the xml nodes.
        type_node = constraint_node.find(".//ThermoML:ConstraintType/*", namespace)
        value_node = constraint_node.find("./ThermoML:nConstraintValue", namespace)

        solvent_index_nodes = constraint_node.find(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )
        compound_index_node = constraint_node.find(
            "./ThermoML:ConstraintID/ThermoML:RegNum/*", namespace
        )

        value = float(value_node.text)
        # Determine what the default unit for this variable should be.
        unit_type = unit_from_thermoml_string(type_node.text)

        return_value = cls()

        return_value.type = _ConstraintType.from_node(type_node)
        return_value.value = value * unit_type

        if compound_index_node is not None:
            return_value.compound_index = int(compound_index_node.text)

        if solvent_index_nodes is not None:
            for solvent_index_node in solvent_index_nodes:
                return_value.solvents.append(int(solvent_index_node))

        return None if return_value.type is _ConstraintType.Undefined else return_value

    @classmethod
    def from_variable(cls, variable, value):
        """Creates a _Constraint from an existing
        `_VariableDefinition` variable definition.

        Parameters
        ----------
        variable: _VariableDefinition
            The variable to convert.
        value: propertyestimator.unit.Quantity
            The value of the constant.

        Returns
        ----------
        _Constraint
            The created constraint.
        """
        return_value = cls()

        return_value.type = variable.type
        return_value.compound_index = variable.compound_index

        return_value.solvents.extend(variable.solvents)

        return_value.value = value

        return return_value


class _VariableDefinition:
    """A wrapper around a ThermoML Variable node. A variable in
    ThermoML is essentially just the definition of a `Constraint`
    (the constraint type, the expected units, etc.) whose value is
    defined inside of another ThermoML node.
    """

    def __init__(self):

        self.index = -1
        self.type = _ConstraintType.Undefined

        self.solvents = []
        self.default_unit = None

        # Describes which compound the variable acts upon.
        self.compound_index = None

    @classmethod
    def from_node(cls, variable_node, namespace):
        """Creates a `_VariableDefinition` from an xml node.

        Parameters
        ----------
        variable_node: xml.etree.Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        ----------
        _VariableDefinition
            The created variable definition.
        """
        # Extract the xml nodes.
        type_node = variable_node.find(".//ThermoML:VariableType/*", namespace)
        index_node = variable_node.find("ThermoML:nVarNumber", namespace)

        solvent_index_nodes = variable_node.find(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )
        compound_index_node = variable_node.find(
            "./ThermoML:VariableID/ThermoML:RegNum/*", namespace
        )

        return_value = cls()

        return_value.default_unit = unit_from_thermoml_string(type_node.text)

        return_value.index = int(index_node.text)
        return_value.type = _ConstraintType.from_node(type_node)

        if compound_index_node is not None:
            return_value.compound_index = int(compound_index_node.text)

        if solvent_index_nodes is not None:
            for solvent_index_node in solvent_index_nodes:
                return_value.solvents.append(int(solvent_index_node))

        return None if return_value.type is _ConstraintType.Undefined else return_value


class _PropertyUncertainty:
    """A wrapper around a ThermoML PropUncertainty node.
    """

    # Reduce code redundancy by reusing this class for
    # both property and combined uncertainties.
    prefix = ""

    def __init__(self):

        self.index = -1
        self.coverage_factor = None

    @classmethod
    def from_xml(cls, node, namespace):
        """Creates a _PropertyUncertainty from an xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        ----------
        _Compound
            The created property uncertainty.
        """

        coverage_factor_node = node.find(
            f"ThermoML:n{cls.prefix}CoverageFactor", namespace
        )
        confidence_node = node.find(
            f"ThermoML:n{cls.prefix}UncertLevOfConfid", namespace
        )

        # As defined by https://www.nist.gov/pml/nist-technical-note-1297/nist-tn-1297-7-reporting-uncertainty
        if coverage_factor_node is not None:
            coverage_factor = float(coverage_factor_node.text)
        elif confidence_node is not None and confidence_node.text == "95":
            coverage_factor = 2
        else:
            return None

        index_node = node.find(f"ThermoML:n{cls.prefix}UncertAssessNum", namespace)
        index = int(index_node.text)

        return_value = cls()

        return_value.coverage_factor = coverage_factor
        return_value.index = index

        return return_value


class _CombinedUncertainty(_PropertyUncertainty):
    """A wrapper around a ThermoML CombPropUncertainty node.
    """

    prefix = "Comb"


class _Compound:
    """A wrapper around a ThermoML Compound node.
    """

    def __init__(self):

        self.smiles = None
        self.index = -1

    @staticmethod
    def smiles_from_inchi_string(inchi_string):
        """Attempts to create a SMILES pattern from an inchi string.

        Parameters
        ----------
        inchi_string: str
            The InChI string to convert.

        Returns
        ----------
        str, optional
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.
        """
        from openeye import oechem

        if inchi_string is None:
            raise ValueError("The InChI string cannot be `None`.")

        temp_molecule = oechem.OEMol()

        if oechem.OEParseInChI(temp_molecule, inchi_string) is False:
            raise ValueError("All InChI strings in ThermoML files must be valid.")

        return oechem.OEMolToSmiles(temp_molecule)

    @staticmethod
    def smiles_from_thermoml_smiles_string(thermoml_string):
        """Attempts to create a SMILES pattern from a thermoml smiles string.

        Parameters
        ----------
        thermoml_string: str
            The string to convert.

        Returns
        ----------
        str, optional
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.
        """
        from openeye import oechem

        if thermoml_string is None:
            raise ValueError("The string cannot be `None`.")

        temp_molecule = oechem.OEMol()

        parse_smiles_options = oechem.OEParseSmilesOptions(quiet=True)

        # Should make sure all smiles are OEChem consistent.
        if (
            oechem.OEParseSmiles(temp_molecule, thermoml_string, parse_smiles_options)
            is False
        ):
            raise ValueError("All SMILES strings in ThermoML files must be valid.")

        return oechem.OEMolToSmiles(temp_molecule)

    @staticmethod
    def smiles_from_common_name(common_name):
        """Attempts to create a SMILES pattern from a common molecular name using
        the `oechem.OEParseIUPACName` method.

        Parameters
        ----------
        common_name: str
            The common name to convert.
        Returns
        ----------
        str, None
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.
        """
        from openeye import oechem
        from openeye import oeiupac

        if common_name is None:
            return None

        temp_molecule = oechem.OEMol()
        smiles = None

        if oeiupac.OEParseIUPACName(temp_molecule, common_name) is True:
            smiles = oechem.OEMolToSmiles(temp_molecule)

        return smiles

    @classmethod
    def from_xml_node(cls, node, namespace):
        """Creates a _Compound from an xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        ----------
        _Compound
            The created compound wrapper.
        """
        # Gather up all possible identifiers
        inchi_identifier_nodes = node.findall("ThermoML:sStandardInChI", namespace)
        smiles_identifier_nodes = node.findall("ThermoML:sSmiles", namespace)
        common_identifier_nodes = node.findall("ThermoML:sCommonName", namespace)

        if (
            len(inchi_identifier_nodes) > 0
            and inchi_identifier_nodes[0].text is not None
        ):
            # Convert InChI key to a smiles pattern.
            smiles = cls.smiles_from_inchi_string(inchi_identifier_nodes[0].text)

        elif (
            len(smiles_identifier_nodes) > 0
            and smiles_identifier_nodes[0].text is not None
        ):
            # Standardise the smiles pattern using OE.
            smiles = cls.smiles_from_thermoml_smiles_string(
                smiles_identifier_nodes[0].text
            )

        elif (
            len(common_identifier_nodes) > 0
            and common_identifier_nodes[0].text is not None
        ):
            # Convert the common name to a smiles pattern.
            smiles = cls.smiles_from_common_name(common_identifier_nodes[0].text)

        else:

            logging.debug(
                "A ThermoML:Compound node does not have a valid InChI identifier, "
                "a valid SMILES pattern, or an understandable common name."
            )

            return None

        index_node = node.find("./ThermoML:RegNum/*", namespace)

        if index_node is None:
            raise ValueError("A ThermoML:Compound does not have an index defined.")

        compound_index = int(index_node.text)

        return_value = cls()

        return_value.smiles = smiles
        return_value.index = compound_index

        return return_value


class _Property:
    """A wrapper around a ThermoML Property node.
    """

    class SoluteStandardState(Enum):
        """Describes the standard state of a solute.
        """

        Undefined = ("Undefined",)
        InfiniteDilutionSolute = ("Infinite dilution solute",)
        PureCompound = ("Pure compound",)
        PureLiquidSolute = ("Pure liquid solute",)
        StandardMolality = ("Standard molality (1 mol/kg) solute",)

        @staticmethod
        def from_node(node):
            """Converts an `eStandardState` node a `_Property.SoluteStandardState`.

            Parameters
            ----------
            node: xml.etree.Element
                The xml node to convert.

            Returns
            ----------
            _Property.SoluteStandardState
                The converted state type.
            """

            try:
                standard_state = _Property.SoluteStandardState(node.text)
            except (KeyError, ValueError):
                standard_state = _Property.SoluteStandardState.Undefined

            if standard_state == _ConstraintType.Undefined:

                logging.debug(
                    f"{node.tag}->{node.text} is an unsupported "
                    f"solute standard state type."
                )

            return standard_state

    def __init__(self, base_type):

        self.thermodynamic_state = None
        self.phase = PropertyPhase.Undefined

        self.substance = None

        self.value = None
        self.uncertainty = None

        self.source = None

        self.index = None

        self.type = base_type

        self.solute_standard_state = _Property.SoluteStandardState.Undefined
        self.solvents = []

        self.target_compound_index = None

        self.property_uncertainty_definitions = {}
        self.combined_uncertainty_definitions = {}

        self.default_unit = None

        self.target_compound_index = None

    @staticmethod
    def extract_uncertainty_definitions(
        node,
        namespace,
        property_uncertainty_definitions,
        combined_uncertainty_definitions,
    ):

        """Extract any property or combined uncertainties from a property xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.
        property_uncertainty_definitions: list(_PropertyUncertainty)
            A list of the extracted property uncertainties.
        combined_uncertainty_definitions: list(_PropertyUncertainty)
            A list of the extracted combined property uncertainties.
        """

        property_nodes = node.findall("ThermoML:CombinedUncertainty", namespace)

        for property_node in property_nodes:

            if property_node is None:
                continue

            uncertainty_definition = _CombinedUncertainty.from_xml(
                property_node, namespace
            )

            if uncertainty_definition is None:
                continue

            combined_uncertainty_definitions[
                uncertainty_definition.index
            ] = uncertainty_definition

        property_nodes = node.findall("ThermoML:PropUncertainty", namespace)

        for property_node in property_nodes:

            if property_node is None:
                continue

            uncertainty_definition = _PropertyUncertainty.from_xml(
                property_node, namespace
            )

            if uncertainty_definition is None:
                continue

            property_uncertainty_definitions[
                uncertainty_definition.index
            ] = uncertainty_definition

    @classmethod
    def from_xml_node(cls, node, namespace, parent_phases):
        """Creates a _Property from an xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.
        parent_phases: PropertyPhase
            The phases specfied in the parent PureOrMixtureData node.

        Returns
        ----------
        _Compound
            The created property.
        """

        # Gather up all possible identifiers
        index_node = node.find("ThermoML:nPropNumber", namespace)

        property_index = int(index_node.text)

        phase_node = node.find("./ThermoML:PropPhaseID//ThermoML:ePropPhase", namespace)
        phase = PropertyPhase.Undefined | parent_phases

        if phase_node is not None:
            phase |= phase_from_thermoml_string(phase_node.text)

        reference_phase_node = node.find(
            "./ThermoML:RefPhaseID//ThermoML:eRefPhase", namespace
        )

        if reference_phase_node is not None:
            phase |= phase_from_thermoml_string(reference_phase_node.text)

        if phase == PropertyPhase.Undefined:

            logging.debug(
                f"A property was measured in an unsupported phase "
                f"({phase_node.text}) and will be skipped."
            )

            return None

        property_group_node = node.find(
            "./ThermoML:Property-MethodID//ThermoML:PropertyGroup//*", namespace
        )

        property_name_node = property_group_node.find("./ThermoML:ePropName", namespace)
        method_name_node = property_group_node.find("./ThermoML:eMethodName", namespace)

        if method_name_node is None:
            method_name_node = property_group_node.find(
                "./ThermoML:sMethodName", namespace
            )

        if method_name_node is None or property_name_node is None:
            raise RuntimeError("A property does not have a name / method entry.")

        if property_name_node.text not in registered_thermoml_properties:

            logging.debug(
                f"An unsupported property was found "
                f"({property_name_node.text}) and will be skipped."
            )

            return None

        registered_plugin = registered_thermoml_properties[property_name_node.text]

        if (registered_plugin.supported_phases & phase) != phase:

            logging.debug(
                f"The {property_name_node.text} property is currently only supported "
                f"when measured in the {str(registered_plugin.supported_phases)} phase, "
                f"and not the {str(phase)} phase."
            )

            return None

        return_value = cls(registered_plugin.class_type)

        return_value.index = property_index
        return_value.phase = phase

        return_value.default_unit = unit_from_thermoml_string(property_name_node.text)

        return_value.type = registered_plugin.class_type
        return_value.method_name = method_name_node.text

        property_uncertainty_definitions = {}
        combined_uncertainty_definitions = {}

        cls.extract_uncertainty_definitions(
            node,
            namespace,
            property_uncertainty_definitions,
            combined_uncertainty_definitions,
        )

        return_value.combined_uncertainty_definitions = combined_uncertainty_definitions
        return_value.property_uncertainty_definitions = property_uncertainty_definitions

        solvent_index_nodes = node.findall(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )

        if solvent_index_nodes is not None:
            for solvent_index_node in solvent_index_nodes:
                return_value.solvents.append(int(solvent_index_node.text))

        # The solute standard state appears to describe which a solute should
        # be present in only trace amounts. It only seems to be relevant for
        # activity based properties.
        standard_state_node = node.find("./ThermoML:eStandardState", namespace)

        if standard_state_node is not None:
            return_value.solute_standard_state = _Property.SoluteStandardState.from_node(
                standard_state_node
            )

        # Property->Property-MethodID->RegNum describes which compound is referred
        # to if the property is based on one of the compounds e.g. the activity
        # coefficient of compound 2.
        target_compound_node = node.find(
            "./ThermoML:Property-MethodID/ThermoML:RegNum/ThermoML:nOrgNum", namespace
        )

        if target_compound_node is not None:
            return_value.target_compound_index = int(target_compound_node.text)

        return return_value

    def set_value(self, value, uncertainty):
        """Set the value and uncertainty of this property, adding units if necessary.

        Parameters
        ----------
        value: float or unit.Quantity
            The value of the property
        uncertainty: float or unit.Quantity, optional
            The uncertainty in the value.
        """
        value_quantity = value

        if not isinstance(value_quantity, unit.Quantity):
            value_quantity = value * self.default_unit

        self.value = value_quantity

        if uncertainty is not None:

            uncertainty_quantity = uncertainty

            if not isinstance(uncertainty_quantity, unit.Quantity):
                uncertainty_quantity = uncertainty * self.default_unit

            self.uncertainty = uncertainty_quantity


class _PureOrMixtureData:
    """A wrapper around a ThermoML PureOrMixtureData node.
    """

    @staticmethod
    def extract_compound_indices(node, namespace, compounds):
        """Extract a list of the compound indices which a given `PureOrMixtureData`
        node depends upon.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            The compounds which were able to be parsed from the
            parent archive file, with keys of their assigned
            indices.
        """

        component_nodes = node.findall("ThermoML:Component", namespace)
        compound_indices = []

        # Figure out which compounds are going to be associated with
        # the property entries.
        for component_node in component_nodes:

            index_node = component_node.find("./ThermoML:RegNum/*", namespace)

            compound_index = int(index_node.text)

            if compound_index not in compounds:

                logging.debug(
                    "A PureOrMixtureData entry depends on an "
                    "unsupported compound and has been ignored"
                )

                return None

            if compound_index in compound_indices:

                raise ValueError(
                    "A ThermoML:PureOrMixtureData states its dependency on the "
                    "same compound twice."
                )

            compound_indices.append(compound_index)

        return compound_indices

    @staticmethod
    def extract_property_definitions(node, namespace, parent_phases):
        """Extract those property definitions defined by a PureOrMixtureData
        node. The extracted definitions are not extracted, as these a defined
        elsewhere in the archive file.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        parent_phases: PropertyPhase
            The phases specified by the parent PureOrMixtureData node.

        Returns
        ----------
        dict of int and _Property
            The extracted property definitions with keys of their
            assigned indices.
        """

        property_nodes = node.findall("ThermoML:Property", namespace)
        properties = {}

        for property_node in property_nodes:

            property_definition = _Property.from_xml_node(
                property_node, namespace, parent_phases
            )

            if property_definition is None:
                continue

            if property_definition.index in properties:

                raise ValueError(
                    "A ThermoML data set contains two " "properties with the same index"
                )

            properties[property_definition.index] = property_definition

        return properties

    @staticmethod
    def validate_constraint(constraint, compounds):
        """Validates a constraint object - this may be either
        a full `_Constraint` or just a `_VariableDefinition`.

        Parameters
        ----------
        constraint: _Constraint or _VariableDefinition
            The constraint to validate.
        compounds: dict of int and _Compound
            A dictionary of the compounds the parent PureOrMixtureData was
            measured for.

        Returns
        -------
        bool
            True if the constraint is valid, False otherwise.
        """

        if constraint is None or constraint.type is _ConstraintType.Undefined:

            logging.debug("An unsupported constraint has been ignored.")
            return False

        if (
            constraint.compound_index is not None
            and constraint.compound_index not in compounds
        ):

            logging.debug(
                "A constraint exists upon a non-existent compound and will be ignored."
            )
            return False

        if (
            constraint.type.is_composition_constraint()
            and constraint.compound_index is None
        ):

            logging.debug(
                "An unsupported constraint has been ignored - composition constraints"
                "need to have a corresponding compound_index."
            )
            return False

        return True

    @staticmethod
    def extract_global_constraints(node, namespace, compounds):
        """Extract the constraints which should be applied to all of
        the properties defined in a `PureOrMixtureData` node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was
            measured for.

        Returns
        ----------
        dict of int and _Constraint, optional
            The extracted constraints if all could be parsed,
            otherwise `None`.
        """

        constraint_nodes = node.findall("ThermoML:Constraint", namespace)
        constraints = []

        for constraint_node in constraint_nodes:

            constraint = _Constraint.from_node(constraint_node, namespace)

            if not _PureOrMixtureData.validate_constraint(constraint, compounds):
                return None

            constraints.append(constraint)

        return constraints

    @staticmethod
    def extract_variable_definitions(node, namespace, compounds):
        """Extract all of the 'variables' in a PureOrMixtureData node.
        These are simply constraints whose values are defined elsewhere
        in the archive.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was calculated for.

        Returns
        ----------
        dict of int and _VariableDefinition
            The extracted variable definitions if all could be parsed,
            otherwise `None`.
        """
        variable_nodes = node.findall("ThermoML:Variable", namespace)
        variables = {}

        for variable_node in variable_nodes:

            variable = _VariableDefinition.from_node(variable_node, namespace)

            if not _PureOrMixtureData.validate_constraint(variable, compounds):
                return None

            variables[variable.index] = variable

        return variables

    @staticmethod
    def extract_uncertainty(node, namespace, property_definition):
        """Extracts the uncertainties on the measured properties
        contained in this `PureOrMixtureData` node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        property_definition: _Property
            The property to which this uncertainty is attached.

        Returns
        -------
        float, optional
            The uncertainty in the property if it can be parsed or
            if one is present, otherwise `None`.
        """

        # Look for a standard uncertainty..
        uncertainty_node = node.find(".//ThermoML:nCombStdUncertValue", namespace)

        if uncertainty_node is None:
            uncertainty_node = node.find(".//ThermoML:nStdUncertValue", namespace)

        # We have found a std. uncertainty
        if uncertainty_node is not None:
            return float(uncertainty_node.text)

        # Try to calculate uncertainty from a coverage factor if present
        if (
            len(property_definition.combined_uncertainty_definitions) == 0
            and len(property_definition.property_uncertainty_definitions) == 0
        ):

            return None

        combined = len(property_definition.combined_uncertainty_definitions) > 0

        prefix = (
            _CombinedUncertainty.prefix if combined else _PropertyUncertainty.prefix
        )

        if combined:
            index_node = node.find(
                "./ThermoML:CombinedUncertainty/ThermoML:nCombUncertAssessNum",
                namespace,
            )
        else:
            index_node = node.find(
                "./ThermoML:PropUncertainty/ThermoML:nUncertAssessNum", namespace
            )

        expanded_uncertainty_node = node.find(
            ".//ThermoML:n" + prefix + "ExpandUncertValue", namespace
        )

        if index_node is None or expanded_uncertainty_node is None:
            return None

        expanded_uncertainty = float(expanded_uncertainty_node.text)
        index = int(index_node.text)

        if (
            combined
            and index not in property_definition.combined_uncertainty_definitions
        ):
            return None

        if (
            not combined
            and index not in property_definition.property_uncertainty_definitions
        ):
            return None

        divisor = (
            property_definition.combined_uncertainty_definitions[index].coverage_factor
            if combined
            else property_definition.property_uncertainty_definitions[
                index
            ].coverage_factor
        )

        return expanded_uncertainty / divisor

    @staticmethod
    def _smiles_to_molecular_weight(smiles):
        """Calculates the molecular weight of a substance specified
        by a smiles string.

        Parameters
        ----------
        smiles: str
            The smiles string to calculate the molecular weight of.

        Returns
        -------
        propertyestimator.unit.Quantity
            The molecular weight.
        """

        from simtk import unit as simtk_unit
        from openforcefield.topology import Molecule

        try:

            molecule = Molecule.from_smiles(smiles)

        except Exception as e:

            formatted_exception = traceback.format_exception(None, e, e.__traceback__)

            raise ValueError(
                f"The toolkit raised an exception for the "
                f"{smiles} smiles pattern: {formatted_exception}"
            )

        molecular_weight = 0.0 * simtk_unit.dalton

        for atom in molecule.atoms:
            molecular_weight += atom.mass

        return openmm_quantity_to_pint(molecular_weight)

    @staticmethod
    def _solvent_mole_fractions_to_moles(
        solvent_mass, solvent_mole_fractions, solvent_compounds
    ):
        """Converts a set of solvent mole fractions to moles for a
        given mass of solvent.

        Parameters
        ----------
        solvent_mass: propertyestimator.unit.Quantity
            The total mass of the solvent in units compatible with kg.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system.
        solvent_compounds: dict of int and float
            A dictionary of any solvent compounds in the system.

        Returns
        -------
        dict of int and unit.Quantity
            A dictionary of the moles of each solvent compound.
        """
        weighted_molecular_weights = 0.0 * unit.gram / unit.mole
        number_of_moles = {}

        for solvent_index in solvent_compounds:

            solvent_smiles = solvent_compounds[solvent_index].smiles

            solvent_fraction = solvent_mole_fractions[solvent_index]
            solvent_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                solvent_smiles
            )

            weighted_molecular_weights += solvent_weight * solvent_fraction

        total_solvent_moles = solvent_mass / weighted_molecular_weights

        for solvent_index in solvent_compounds:

            moles = solvent_mole_fractions[solvent_index] * total_solvent_moles
            number_of_moles[solvent_index] = moles

        return number_of_moles

    @staticmethod
    def _convert_mole_fractions(constraints, compounds, solvent_mole_fractions=None):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.
        """

        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mole_fractions = {}
        total_mol_fraction = 0.0

        for constraint in constraints:

            mole_fraction = constraint.value

            if isinstance(mole_fraction, unit.Quantity):
                mole_fraction = mole_fraction.to(unit.dimensionless).magnitude

            mole_fractions[constraint.compound_index] = mole_fraction
            total_mol_fraction += mole_fractions[constraint.compound_index]

        if (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ) or (
            number_of_constraints != len(compounds) - 1
            and number_of_constraints != len(compounds)
            and solvent_mole_fractions is None
        ):

            raise ValueError(
                f"The number of mole fraction constraints ({number_of_constraints}) must be one "
                f"less than or equal to the number of compounds being constrained ({len(compounds)}) "
                f"if a solvent list is not present, otherwise there must be an equal number."
            )

        # Handle the case were a single mole fraction constraint is missing.
        if number_of_constraints == len(compounds) - 1:

            for compound_index in compounds:

                if compound_index in mole_fractions:
                    continue

                mole_fractions[compound_index] = 1.0 - total_mol_fraction

        # Recompute the total mole fraction to be safe.
        total_mol_fraction = 0.0

        for compound_index in mole_fractions:
            total_mol_fraction += mole_fractions[compound_index]

        # Account for any solvent present.
        if solvent_mole_fractions is not None:

            # Assume the remainder of the mole fraction is the solvent.
            remaining_mole_fraction = 1.0 - total_mol_fraction

            for solvent_index in solvent_mole_fractions:
                mole_fractions[solvent_index] = (
                    solvent_mole_fractions[solvent_index] * remaining_mole_fraction
                )

        return mole_fractions

    @staticmethod
    def _convert_mass_fractions(
        constraints, compounds, solvent_mole_fractions=None, solvent_compounds=None
    ):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.
        solvent_compounds: dict of int and float
            A dictionary of any explicitly defined solvent compounds in the
            system.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.
        """

        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mass_fractions = {}
        total_mass_fraction = 0.0

        for constraint in constraints:

            mass_fraction = constraint.value

            if isinstance(mass_fraction, unit.Quantity):
                mass_fraction = mass_fraction.to(unit.dimensionless).magnitude

            mass_fractions[constraint.compound_index] = mass_fraction
            total_mass_fraction += mass_fraction

        if (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ) or (
            number_of_constraints != len(compounds) - 1
            and number_of_constraints != len(compounds)
            and solvent_mole_fractions is None
        ):

            raise ValueError(
                f"The number of mass fraction constraints ({number_of_constraints}) must be one "
                f"less than or equal to the number of compounds being constrained ({len(compounds)}) "
                f"if a solvent list is not present, otherwise there must be an equal number."
            )

        # Handle the case were a single mass fraction constraint is missing.
        if number_of_constraints == len(compounds) - 1:

            for compound_index in compounds:

                if compound_index in mass_fractions:
                    continue

                mass_fractions[compound_index] = 1.0 - total_mass_fraction
                if isinstance(mass_fractions[compound_index], unit.Quantity):
                    mass_fractions[compound_index] = (
                        mass_fractions[compound_index].to(unit.dimensionless).magnitude
                    )

        total_mass = 1 * unit.gram
        total_solvent_mass = total_mass

        moles = {}
        total_moles = 0.0 * unit.mole

        for compound_index in compounds:

            compound_smiles = compounds[compound_index].smiles
            compound_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                compound_smiles
            )

            moles[compound_index] = (
                total_mass * mass_fractions[compound_index] / compound_weight
            )
            total_moles += moles[compound_index]

            total_solvent_mass -= total_mass * mass_fractions[compound_index]

        if (
            number_of_constraints == len(compounds)
            and solvent_mole_fractions is not None
        ):

            solvent_moles = _PureOrMixtureData._solvent_mole_fractions_to_moles(
                total_solvent_mass, solvent_mole_fractions, solvent_compounds
            )

            for solvent_index in solvent_moles:

                moles[solvent_index] = solvent_moles[solvent_index]
                total_moles += solvent_moles[solvent_index]

        mole_fractions = {}

        for compound_index in moles:

            mole_fraction = moles[compound_index] / total_moles
            mole_fractions[compound_index] = mole_fraction

        return mole_fractions

    @staticmethod
    def _convert_molality(
        constraints, compounds, solvent_mole_fractions=None, solvent_compounds=None
    ):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.
        solvent_compounds: dict of int and float
            A dictionary of any explicitly defined solvent compounds in the
            system.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.
        """
        number_of_moles = {}
        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mole_fractions = {}

        total_number_of_moles = 0.0 * unit.moles
        total_solvent_mass = 1.0 * unit.kilograms

        for constraint in constraints:

            molality = constraint.value
            moles = molality * total_solvent_mass

            number_of_moles[constraint.compound_index] = moles
            total_number_of_moles += moles

        if (
            number_of_constraints != len(compounds) - 1
            and solvent_mole_fractions is None
        ) or (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ):

            raise ValueError(
                f"The number of molality constraints ({number_of_constraints}) must be one "
                f"less than the number of compounds being constrained ({len(compounds)}) if a "
                f"solvent list is not present, otherwise there must be an equal number."
            )

        if (
            number_of_constraints == len(compounds) - 1
            and solvent_mole_fractions is None
        ):

            # In this case, there is no explicit solvent entry and the last component
            # whose molality has not been constrained is considered to be the 'solvent'
            for compound_index in compounds:

                if compound_index in number_of_moles:
                    continue

                compound_smiles = compounds[compound_index].smiles
                compound_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                    compound_smiles
                )

                moles = total_solvent_mass / compound_weight

                number_of_moles[compound_index] = moles
                total_number_of_moles += moles

        elif (
            number_of_constraints == len(compounds)
            and solvent_mole_fractions is not None
        ):

            solvent_moles = _PureOrMixtureData._solvent_mole_fractions_to_moles(
                total_solvent_mass, solvent_mole_fractions, solvent_compounds
            )

            for solvent_index in solvent_moles:

                number_of_moles[solvent_index] = solvent_moles[solvent_index]
                total_number_of_moles += solvent_moles[solvent_index]

        for compound_index in number_of_moles:

            mole_fraction = number_of_moles[compound_index] / total_number_of_moles
            mole_fractions[compound_index] = mole_fraction

        return mole_fractions

    @staticmethod
    def build_substance(thermoml_property, constraints, compounds):
        """Build a Substance object from the extracted constraints and compounds.

        Parameters
        ----------
        thermoml_property: _Property
            The property to which this mixture belongs.
        constraints: list of _Constraint
            The ThermoML constraints.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was calculated for.

        Returns
        ----------
        Substance
            The constructed substance.
        """

        # TODO: We need to take into account `thermoml_property.target_compound_index` and
        #       `thermoml_property.solute_standard_state` to properly identify infinitely
        #       diluted solutes in the system (if any). Otherwise the solute will be
        #       assigned a mole fraction of zero.

        solvent_constraint_type = _ConstraintType.Undefined
        component_constraint_type = _ConstraintType.Undefined

        solvent_indices = set()

        for solvent_index in thermoml_property.solvents:

            if solvent_index in solvent_indices:
                continue

            solvent_indices.add(solvent_index)

        # Determine which types of solvent and component constraints are
        # being applied.
        for constraint in constraints:

            # Make sure we hunt down solvent indices.
            for solvent_index in constraint.solvents:

                if solvent_index in solvent_indices:
                    continue

                solvent_indices.add(solvent_index)

            # Only composition type restraints apply here, skip
            # the rest.
            if not constraint.type.is_composition_constraint():
                continue

            if (
                constraint.type == _ConstraintType.SolventMassFraction
                or constraint.type == _ConstraintType.SolventMoleFraction
                or constraint.type == _ConstraintType.SolventMolality
            ):

                if solvent_constraint_type == _ConstraintType.Undefined:
                    solvent_constraint_type = constraint.type

                if solvent_constraint_type != constraint.type:

                    logging.debug(
                        f"A property with different types of solvent composition constraints "
                        f"was found - {solvent_constraint_type} vs {constraint.type}). This "
                        f"is likely a bug in the ThermoML file and so this property will be "
                        f"skipped."
                    )

                    return None

            else:

                if component_constraint_type == _ConstraintType.Undefined:
                    component_constraint_type = constraint.type

                if component_constraint_type != constraint.type:

                    logging.debug(
                        f"A property with different types of composition constraints "
                        f"was found - {component_constraint_type} vs {constraint.type}). This "
                        f"is likely a bug in the ThermoML file and so this property will be "
                        f"skipped."
                    )

                    return None

        # If no constraint was applied, this very likely means a pure substance
        # was found.
        if (
            component_constraint_type == _ConstraintType.Undefined
            and solvent_constraint_type == _ConstraintType.Undefined
        ):

            component_constraint_type = _ConstraintType.ComponentMoleFraction

        elif (
            component_constraint_type == _ConstraintType.Undefined
            and solvent_constraint_type != _ConstraintType.Undefined
        ):

            logging.debug(
                f"A property with only solvent composition "
                f"constraints {solvent_constraint_type} was found."
            )

            return None

        solvent_mole_fractions = {}

        solvent_constraints = [
            constraint
            for constraint in constraints
            if constraint.type == solvent_constraint_type
        ]

        solvent_compounds = {}

        for solvent_index in solvent_indices:

            if solvent_index in compounds:

                solvent_compounds[solvent_index] = compounds[solvent_index]
                continue

            logging.debug(
                "The composition of a non-existent solvent was "
                "found. This usually only occurs in cases were "
                "the solvent component could not be understood "
                "by the framework."
            )

            return None

        # Make sure all of the solvents have not been removed.
        if (
            solvent_constraint_type != _ConstraintType.Undefined
            and len(solvent_indices) == 0
        ):

            logging.debug(
                "The composition of a solvent was found, however the "
                "solvent list is empty. This usually only occurs in "
                "cases were the solvent component could not be understood "
                "by the framework."
            )

            return None

        remaining_constraints = [
            constraint
            for constraint in constraints
            if constraint.type == component_constraint_type
        ]

        remaining_compounds = {}

        for compound_index in compounds:

            if compound_index in solvent_indices:
                continue

            remaining_compounds[compound_index] = compounds[compound_index]

        # Determine the mole fractions of the solvent species, if any.
        if solvent_constraint_type == _ConstraintType.SolventMoleFraction:

            solvent_mole_fractions = _PureOrMixtureData._convert_mole_fractions(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.SolventMassFraction:

            solvent_mole_fractions = _PureOrMixtureData._convert_mass_fractions(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.SolventMolality:

            solvent_mole_fractions = _PureOrMixtureData._convert_molality(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.Undefined:

            solvent_mole_fractions = None
            solvent_compounds = None

            remaining_compounds = compounds

        # Determine the mole fractions of the remaining compounds.
        mole_fractions = {}

        if component_constraint_type == _ConstraintType.ComponentMoleFraction:

            mole_fractions = _PureOrMixtureData._convert_mole_fractions(
                remaining_constraints, remaining_compounds, solvent_mole_fractions
            )

        elif component_constraint_type == _ConstraintType.ComponentMassFraction:

            mole_fractions = _PureOrMixtureData._convert_mass_fractions(
                remaining_constraints,
                remaining_compounds,
                solvent_mole_fractions,
                solvent_compounds,
            )

        elif component_constraint_type == _ConstraintType.ComponentMolality:

            mole_fractions = _PureOrMixtureData._convert_molality(
                remaining_constraints,
                remaining_compounds,
                solvent_mole_fractions,
                solvent_compounds,
            )

        if len(mole_fractions) != len(compounds):

            raise ValueError(
                f"The number of mole fractions ({len(mole_fractions)}) does not "
                f"equal the total number of compounds ({len(compounds)})"
            )

        # Make sure we haven't picked up a dimensionless unit be accident.
        for compound_index in mole_fractions:

            if isinstance(mole_fractions[compound_index], unit.Quantity):
                mole_fractions[compound_index] = (
                    mole_fractions[compound_index].to(unit.dimensionless).magnitude
                )

        total_mol_fraction = sum([value for value in mole_fractions.values()])

        if not np.isclose(total_mol_fraction, 1.0):
            raise ValueError(
                f"The total mole fraction {total_mol_fraction} is not equal to 1.0"
            )

        substance = Substance()

        for compound_index in compounds:

            compound = compounds[compound_index]

            if np.isclose(mole_fractions[compound_index], 0.0):
                continue

            substance.add_component(
                component=Substance.Component(smiles=compound.smiles),
                amount=Substance.MoleFraction(mole_fractions[compound_index]),
            )

        return substance

    @staticmethod
    def extract_measured_properties(
        node,
        namespace,
        property_definitions,
        global_constraints,
        variable_definitions,
        compounds,
    ):

        """Extract the measured properties defined by a ThermoML
        PureOrMixtureData node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        property_definitions: dict of int and _Property
            The extracted property definitions.
        global_constraints: dict of int and _Constraint
            The extracted constraints.
        variable_definitions: dict of int and _VariableDefinition
            The extracted variable definitions.
        compounds: dict of int and _Compound
            The extracted compounds.

        Returns
        ----------
        list of _Property
            The extracted measured properties.
        """

        value_nodes = node.findall("ThermoML:NumValues", namespace)

        measured_properties = []

        # Each value_node corresponds to one measure property.
        for value_node in value_nodes:

            constraints = []

            temperature_constraint = None
            pressure_constraint = None

            for global_constraint in global_constraints:

                constraint = copy.deepcopy(global_constraint)
                constraints.append(constraint)

                if constraint.type == _ConstraintType.Temperature:
                    temperature_constraint = constraint
                elif constraint.type == _ConstraintType.Pressure:
                    pressure_constraint = constraint

            # First extract the values of any variable constraints
            variable_nodes = value_node.findall("ThermoML:VariableValue", namespace)

            skip_entry = False

            for variable_node in variable_nodes:

                variable_index = int(
                    variable_node.find("./ThermoML:nVarNumber", namespace).text
                )

                if variable_index not in variable_definitions:

                    # The property was constrained by an unsupported variable and
                    # so will be skipped for now.
                    skip_entry = True
                    break

                variable_definition = variable_definitions[variable_index]

                variable_value = float(
                    variable_node.find("ThermoML:nVarValue", namespace).text
                )
                value_as_quantity = unit.Quantity(
                    variable_value, variable_definition.default_unit
                )

                # Convert the 'variable' into a full constraint entry
                constraint = _Constraint.from_variable(
                    variable_definition, value_as_quantity
                )
                constraints.append(constraint)

                if constraint.type == _ConstraintType.Temperature:
                    temperature_constraint = constraint
                elif constraint.type == _ConstraintType.Pressure:
                    pressure_constraint = constraint

            if skip_entry:
                continue

            # Extract the thermodynamic state that the property was measured at.
            if temperature_constraint is None:

                logging.debug(
                    "A property did not report the temperature it "
                    "was measured at and will be ignored."
                )
                continue

            temperature = temperature_constraint.value
            pressure = (
                None if pressure_constraint is None else pressure_constraint.value
            )

            thermodynamic_state = ThermodynamicState(
                temperature=temperature, pressure=pressure
            )

            # Now extract the actual values of the measured properties, and their
            # uncertainties
            property_nodes = value_node.findall("ThermoML:PropertyValue", namespace)

            for property_node in property_nodes:

                property_index = int(
                    property_node.find("./ThermoML:nPropNumber", namespace).text
                )

                if property_index not in property_definitions:

                    # Most likely the property was dropped earlier due to an unsupported phase / type
                    continue

                property_definition = property_definitions[property_index]

                uncertainty = _PureOrMixtureData.extract_uncertainty(
                    property_node, namespace, property_definition
                )

                measured_property = copy.deepcopy(property_definition)

                measured_property.thermodynamic_state = thermodynamic_state

                property_value_node = property_node.find(
                    ".//ThermoML:nPropValue", namespace
                )

                measured_property.set_value(
                    float(property_value_node.text),
                    None if uncertainty is None else float(uncertainty),
                )

                mixture = _PureOrMixtureData.build_substance(
                    measured_property, constraints, compounds
                )

                if mixture is None:
                    continue

                measured_property.substance = mixture
                measured_properties.append(measured_property)

        # By this point we now have the measured properties and the thermodynamic state
        # they were measured at converted to standardised classes.
        return measured_properties

    @staticmethod
    def from_xml_node(node, namespace, compounds):
        """Extracts all of the data in a ThermoML PureOrMixtureData node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A list of the already extracted `_Compound`'s.

        Returns
        ----------
        list of _Property
            A list of extracted properties.
        """

        # Figure out which compounds are going to be associated with
        # the property entries.
        compound_indices = _PureOrMixtureData.extract_compound_indices(
            node, namespace, compounds
        )

        if compound_indices is None:
            # Most likely this entry depended on a non-parsable compound
            # and will be skipped entirely
            return None

        if len(compound_indices) == 0:

            logging.debug("A PureOrMixtureData entry with no compounds was ignored.")
            return None

        phase_nodes = node.findall("./ThermoML:PhaseID/ThermoML:ePhase", namespace)

        all_phases = None

        for phase_node in phase_nodes:

            phase = phase_from_thermoml_string(phase_node.text)

            if phase == PropertyPhase.Undefined:

                logging.debug(
                    f"A property was measured in an unsupported phase "
                    f"({phase_node.text}) and will be skipped."
                )

                return None

            all_phases = phase if all_phases is None else all_phases | phase

        # Extract property definitions - values come later!
        property_definitions = _PureOrMixtureData.extract_property_definitions(
            node, namespace, all_phases
        )

        if len(property_definitions) == 0:
            return None

        for property_index in property_definitions:

            all_phases |= property_definitions[property_index].phase
            property_definitions[property_index].phase |= all_phases

        # Extract any constraints on the system e.g pressure, temperature
        global_constraints = _PureOrMixtureData.extract_global_constraints(
            node, namespace, compounds
        )

        if global_constraints is None:
            return None

        # Extract any variables set on the system e.g pressure, temperature
        # Only the definition entry and not the value of the variable is extracted
        variable_definitions = _PureOrMixtureData.extract_variable_definitions(
            node, namespace, compounds
        )

        if len(global_constraints) == 0 and len(variable_definitions) == 0:

            logging.debug("A PureOrMixtureData entry with no constraints was ignored.")
            return None

        used_compounds = {}

        for compound_index in compounds:

            if compound_index not in compound_indices:
                continue

            used_compounds[compound_index] = compounds[compound_index]

        measured_properties = _PureOrMixtureData.extract_measured_properties(
            node,
            namespace,
            property_definitions,
            global_constraints,
            variable_definitions,
            used_compounds,
        )

        return measured_properties


[docs]class ThermoMLDataSet(PhysicalPropertyDataSet): """A dataset of physical property measurements created from a ThermoML dataset. Examples -------- For example, we can use the DOI `10.1016/j.jct.2005.03.012` as a key for retrieving the dataset from the ThermoML Archive: >>> dataset = ThermoMLDataSet.from_doi('10.1016/j.jct.2005.03.012') You can also specify multiple ThermoML Archive keys to create a dataset from multiple ThermoML files: >>> thermoml_keys = ['10.1021/acs.jced.5b00365', '10.1021/acs.jced.5b00474'] >>> dataset = ThermoMLDataSet.from_doi(*thermoml_keys) """
[docs] def __init__(self): """Constructs a new ThermoMLDataSet object.""" super().__init__()
[docs] @classmethod def from_doi(cls, *doi_list): """Load a ThermoML data set from a list of DOIs Parameters ---------- doi_list: str The list of DOIs to pull data from Returns ------- ThermoMLDataSet The loaded data set. """ return_value = None for doi in doi_list: # E.g https://trc.nist.gov/ThermoML/10.1016/j.jct.2016.12.009.xml doi_url = f"https://trc.nist.gov/ThermoML/{doi}.xml" data_set = cls._from_url(doi_url, MeasurementSource(doi=doi)) if data_set is None or len(data_set.properties) == 0: continue if return_value is None: return_value = data_set else: return_value.merge(data_set) return return_value
[docs] @classmethod def from_url(cls, *url_list): """Load a ThermoML data set from a list of URLs Parameters ---------- url_list: str The list of URLs to pull data from Returns ------- ThermoMLDataSet The loaded data set. """ return_value = None for url in url_list: data_set = cls._from_url(url) if data_set is None or len(data_set.properties) == 0: continue if return_value is None: return_value = data_set else: return_value.merge(data_set) return return_value
@classmethod def _from_url(cls, url, source=None): """Load a ThermoML data set from a given URL Parameters ---------- url: str The URL to pull data from source: Source, optional An optional source which gives more information (e.g DOIs) for the url. Returns ---------- ThermoMLDataSet The loaded data set. """ if source is None: source = MeasurementSource(reference=url) return_value = None try: with urlopen(url) as response: return_value = cls.from_xml(response.read(), source) except HTTPError: logging.warning(f"No ThermoML file could not be found at {url}") return return_value
[docs] @classmethod def from_file(cls, *file_list): """Load a ThermoML data set from a list of files Parameters ---------- file_list: str The list of files to pull data from Returns ------- ThermoMLDataSet The loaded data set. """ return_value = None counter = 0 for file in file_list: data_set = cls._from_file(file) counter += 1 if data_set is None or len(data_set.properties) == 0: continue if return_value is None: return_value = data_set else: return_value.merge(data_set) return return_value
@classmethod def _from_file(cls, path): """Load a ThermoML data set from a given file Parameters ---------- path: str The file path to pull data from Returns ------- ThermoMLDataSet The loaded data set. """ source = MeasurementSource(reference=path) return_value = None try: with open(path) as file: return_value = ThermoMLDataSet.from_xml(file.read(), source) except FileNotFoundError: logging.warning(f"No ThermoML file could not be found at {path}") return return_value
[docs] @classmethod def from_xml(cls, xml, source): """Load a ThermoML data set from an xml object. Parameters ---------- xml: str The xml string to parse. source: Source The source of the xml object. Returns ------- ThermoMLDataSet The loaded ThermoML data set. """ root_node = ElementTree.fromstring(xml) if root_node is None: logging.warning("The ThermoML XML document could not be parsed.") return None if root_node.tag.find("DataReport") < 0: logging.warning( "The ThermoML XML document does not contain the expected root node." ) return None # Extract the namespace that will prefix all type names namespace_string = re.search(r"{.*\}", root_node.tag).group(0)[1:-1] namespace = {"ThermoML": namespace_string} return_value = ThermoMLDataSet() compounds = {} # Extract the base compounds present in the xml file for node in root_node.findall("ThermoML:Compound", namespace): compound = _Compound.from_xml_node(node, namespace) if compound is None: continue if compound.index in compounds: raise RuntimeError( "A ThermoML data set contains two " "compounds with the same index" ) compounds[compound.index] = compound # Pull out any and all properties in the file. for node in root_node.findall("ThermoML:PureOrMixtureData", namespace): properties = _PureOrMixtureData.from_xml_node(node, namespace, compounds) if properties is None or len(properties) == 0: continue for measured_property in properties: substance_id = measured_property.substance.identifier if substance_id not in return_value._properties: return_value._properties[substance_id] = [] if measured_property.type is None: raise ValueError( "An unexepected property type managed to slip through the cracks." ) final_property = measured_property.type() final_property.value = measured_property.value final_property.uncertainty = measured_property.uncertainty final_property.phase = measured_property.phase final_property.thermodynamic_state = ( measured_property.thermodynamic_state ) final_property.substance = measured_property.substance final_property.source = source return_value._properties[substance_id].append(final_property) return_value._sources.append(source) return return_value