"""Source code for openff.evaluator.datasets.thermoml.thermoml.

An API for importing a ThermoML archive.
"""

import copy
import logging
import re
import traceback
from enum import Enum, unique
from urllib.error import HTTPError
from xml.etree import ElementTree

import numpy as np
import requests
from openff.units import unit

from openff.evaluator.datasets import (
    MeasurementSource,
    PhysicalPropertyDataSet,
    PropertyPhase,
)
from openff.evaluator.substances import Component, MoleFraction, Substance
from openff.evaluator.thermodynamics import ThermodynamicState

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def _unit_from_thermoml_string(full_string):
    """Extract the unit from a ThermoML property name.

    The text between the first and second comma of ``full_string`` is
    treated as the unit (e.g. ``"kPa"`` in ``"Pressure, kPa"``). If the
    string contains no comma an empty unit string is parsed instead.

    Parameters
    ----------
    full_string: str
        The string to convert to a Unit object.

    Returns
    -------
    openff.units.unit.Unit
        The parsed unit.
    """
    full_string_split = full_string.split(",")

    # Extract the unit portion of the string
    unit_string = full_string_split[1] if len(full_string_split) > 1 else ""
    # Convert symbols like dm3 to dm**3
    unit_string = re.sub(r"([a-z])([0-9]+)", r"\1**\2", unit_string.strip())

    return unit.Unit(unit_string)

def _phase_from_thermoml_string(string):
    """Converts a ThermoML string to a PropertyPhase

    The comparison is case-insensitive and ignores surrounding whitespace.

    Parameters
    ----------
    string: str
        The string to convert to a PropertyPhase

    Returns
    -------
    PropertyPhase
        The converted PropertyPhase, or ``PropertyPhase.Undefined`` if the
        string matches none of the recognised phases.
    """
    phase_string = string.lower().strip()

    # The order of the checks matters: anything mentioning a solution is
    # treated as a liquid before the crystal / gas checks are considered.
    if phase_string == "liquid" or "solution" in phase_string:
        return PropertyPhase.Liquid

    # Strings mentioning both 'crystal' and 'liquid' are not treated as solid.
    if "crystal" in phase_string and "liquid" not in phase_string:
        return PropertyPhase.Solid

    if "gas" in phase_string:
        return PropertyPhase.Gas

    return PropertyPhase.Undefined

@unique
class _ConstraintType(Enum):
    """An enumeration of the supported types of ThermoML constraint.

    The value of each member is the text expected inside the corresponding
    ThermoML ``ConstraintType`` / ``VariableType`` child node.
    """

    Undefined = "Undefined"
    Temperature = "Temperature, K"
    Pressure = "Pressure, kPa"
    ComponentMoleFraction = "Mole fraction"
    ComponentMassFraction = "Mass fraction"
    ComponentMolality = "Molality, mol/kg"
    SolventMoleFraction = "Solvent: Mole fraction"
    SolventMassFraction = "Solvent: Mass fraction"
    SolventMolality = "Solvent: Molality, mol/kg"

    @staticmethod
    def from_node(node):
        """Converts either a ConstraintType or VariableType xml node to a _ConstraintType.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to convert.

        Returns
        -------
        _ConstraintType
            The converted constraint type, or `_ConstraintType.Undefined`
            if the text of the node does not match a supported type.
        """
        try:
            constraint_type = _ConstraintType(node.text)
        except (KeyError, ValueError):
            # Unknown text (including a missing text value) is not an error;
            # it simply maps onto the `Undefined` member.
            constraint_type = _ConstraintType.Undefined

        if constraint_type == _ConstraintType.Undefined:
            logging.debug(f"{node.tag}->{node.text} is an unsupported constraint type.")

        return constraint_type

    def is_composition_constraint(self):
        """Checks whether the purpose of this constraint is
        to constrain the substance composition.

        Returns
        -------
        bool
            True if the constraint type is either a

            - `_ConstraintType.ComponentMoleFraction`
            - `_ConstraintType.ComponentMassFraction`
            - `_ConstraintType.ComponentMolality`
            - `_ConstraintType.SolventMoleFraction`
            - `_ConstraintType.SolventMassFraction`
            - `_ConstraintType.SolventMolality`
        """
        return (
            self == _ConstraintType.ComponentMoleFraction
            or self == _ConstraintType.ComponentMassFraction
            or self == _ConstraintType.ComponentMolality
            or self == _ConstraintType.SolventMoleFraction
            or self == _ConstraintType.SolventMassFraction
            or self == _ConstraintType.SolventMolality
        )

class _Constraint:
    """A wrapper around a ThermoML `Constraint` node. A constraint
    in ThermoML encompasses such constructs as temperature, pressure
    or composition at which a measurement was recorded.
    """

    def __repr__(self):
        return f"Constraint(type={self.type}, value={self.value})"

    def __init__(self):
        self.type = _ConstraintType.Undefined
        self.value = 0.0

        self.solvents = []

        # Describes which compound the variable acts upon.
        self.compound_index = None

    @classmethod
    def from_node(cls, constraint_node, namespace):
        """Creates a _Constraint from an xml node.

        Parameters
        ----------
        constraint_node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        -------
        _Constraint, optional
            The extracted constraint if the constraint type is supported,
            otherwise `None`.
        """
        # Extract the xml nodes.
        type_node = constraint_node.find(".//ThermoML:ConstraintType/*", namespace)
        value_node = constraint_node.find("./ThermoML:nConstraintValue", namespace)

        solvent_index_nodes = constraint_node.find(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )
        compound_index_node = constraint_node.find(
            "./ThermoML:ConstraintID/ThermoML:RegNum/*", namespace
        )

        value = float(value_node.text)
        # Determine what the default unit for this variable should be.
        unit_type = _unit_from_thermoml_string(type_node.text)

        return_value = cls()

        return_value.type = _ConstraintType.from_node(type_node)
        return_value.value = value * unit_type

        if compound_index_node is not None:
            return_value.compound_index = int(compound_index_node.text)

        if solvent_index_nodes is not None:
            # NOTE(review): `find` returns a single element, so this loop
            # iterates over that element's *children* rather than over every
            # matching `nOrgNum` node - `findall` may have been intended.
            # The loop body was missing from the extracted source and has
            # been reconstructed; confirm against the upstream file.
            for solvent_index_node in solvent_index_nodes:
                return_value.solvents.append(int(solvent_index_node.text))

        return None if return_value.type is _ConstraintType.Undefined else return_value

    @classmethod
    def from_variable(cls, variable, value):
        """Creates a _Constraint from an existing
        `_VariableDefinition` variable definition.

        Parameters
        ----------
        variable: _VariableDefinition
            The variable to convert.
        value: openff.evaluator.unit.Quantity
            The value of the constant.

        Returns
        -------
        _Constraint
            The created constraint.
        """
        return_value = cls()

        return_value.type = variable.type
        return_value.compound_index = variable.compound_index

        # NOTE(review): this statement was missing from the extracted source
        # and has been reconstructed - it copies across the solvent indices
        # of the variable definition. Confirm against the upstream file.
        return_value.solvents.extend(variable.solvents)

        return_value.value = value

        return return_value

class _VariableDefinition:
    """A wrapper around a ThermoML Variable node. A variable in
    ThermoML is essentially just the definition of a `Constraint`
    (the constraint type, the expected units, etc.) whose value is
    defined inside of another ThermoML node.
    """

    def __init__(self):
        self.index = -1
        self.type = _ConstraintType.Undefined

        self.solvents = []
        self.default_unit = None

        # Describes which compound the variable acts upon.
        self.compound_index = None

    @classmethod
    def from_node(cls, variable_node, namespace):
        """Creates a `_VariableDefinition` from an xml node.

        Parameters
        ----------
        variable_node: xml.etree.Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        -------
        _VariableDefinition, optional
            The created variable definition, or `None` if the type of the
            variable is not supported.
        """
        # Extract the xml nodes.
        type_node = variable_node.find(".//ThermoML:VariableType/*", namespace)
        index_node = variable_node.find("ThermoML:nVarNumber", namespace)

        solvent_index_nodes = variable_node.find(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )
        compound_index_node = variable_node.find(
            "./ThermoML:VariableID/ThermoML:RegNum/*", namespace
        )

        return_value = cls()

        return_value.default_unit = _unit_from_thermoml_string(type_node.text)

        return_value.index = int(index_node.text)
        return_value.type = _ConstraintType.from_node(type_node)

        if compound_index_node is not None:
            return_value.compound_index = int(compound_index_node.text)

        if solvent_index_nodes is not None:
            # NOTE(review): `find` returns a single element, so this loop
            # iterates over that element's *children* rather than over every
            # matching `nOrgNum` node - `findall` may have been intended.
            # The loop body was missing from the extracted source and has
            # been reconstructed; confirm against the upstream file.
            for solvent_index_node in solvent_index_nodes:
                return_value.solvents.append(int(solvent_index_node.text))

        return None if return_value.type is _ConstraintType.Undefined else return_value

class _PropertyUncertainty:
    """A wrapper around a ThermoML PropUncertainty node."""

    # Reduce code redundancy by reusing this class for
    # both property and combined uncertainties.
    prefix = ""

    def __init__(self):
        self.index = -1
        self.coverage_factor = None

    @classmethod
    def from_xml(cls, node, namespace):
        """Creates a _PropertyUncertainty from an xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        -------
        _PropertyUncertainty, optional
            The created property uncertainty, or `None` if the node neither
            defines a coverage factor nor states a 95 % level of confidence.
        """

        coverage_factor_node = node.find(
            f"ThermoML:n{cls.prefix}CoverageFactor", namespace
        )
        confidence_node = node.find(
            f"ThermoML:n{cls.prefix}UncertLevOfConfid", namespace
        )

        # An explicit coverage factor always wins. Otherwise a stated 95 %
        # level of confidence is taken to correspond to a coverage factor
        # of k = 2 (the convention described in NIST Technical Note 1297).
        if coverage_factor_node is not None:
            coverage_factor = float(coverage_factor_node.text)
        elif confidence_node is not None and confidence_node.text == "95":
            coverage_factor = 2
        else:
            return None

        index_node = node.find(f"ThermoML:n{cls.prefix}UncertAssessNum", namespace)
        index = int(index_node.text)

        return_value = cls()

        return_value.coverage_factor = coverage_factor
        return_value.index = index

        return return_value

class _CombinedUncertainty(_PropertyUncertainty):
    """A wrapper around a ThermoML CombPropUncertainty node."""

    # Overrides the (empty) prefix of the base class; no other behaviour
    # is changed by this subclass.
    prefix = "Comb"

class _Compound:
    """A wrapper around a ThermoML Compound node."""

    def __repr__(self):
        return f"Compound(smiles={self.smiles}, index={self.index})"

    def __init__(self):
        self.smiles = None
        self.index = -1

    @staticmethod
    def smiles_from_inchi_string(inchi_string):
        """Attempts to create a SMILES pattern from an inchi string.

        Parameters
        ----------
        inchi_string: str
            The InChI string to convert.

        Returns
        -------
        str, optional
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.

        Raises
        ------
        ValueError
            If the InChI string is `None` or cannot be parsed by RDKit.
        """
        from openff.toolkit.topology import Molecule

        try:
            import rdkit.Chem
        except ImportError:
            # Without RDKit the InChI cannot be interpreted - let the caller
            # fall back to a different identifier.
            return None

        if inchi_string is None:
            raise ValueError("The InChI string cannot be `None`.")

        molecule = rdkit.Chem.MolFromInchi(inchi_string, removeHs=False)

        if not molecule:
            raise ValueError(f"The InchI string ({inchi_string}) could not be parsed")

        try:
            # NOTE(review): the `to_smiles` arguments were missing from the
            # extracted source; they mirror the call that is visible in
            # `smiles_from_thermoml_smiles_string`. Confirm against upstream.
            return Molecule.from_rdkit(molecule).to_smiles(
                isomeric=True, explicit_hydrogens=False, mapped=False
            )
        except ValueError:
            return None

    @staticmethod
    def smiles_from_thermoml_smiles_string(thermoml_string):
        """Attempts to create a SMILES pattern from a thermoml smiles string.

        Parameters
        ----------
        thermoml_string: str
            The string to convert.

        Returns
        -------
        str, optional
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.

        Raises
        ------
        ValueError
            If the string is `None`.
        """
        from openff.toolkit.topology import Molecule
        from openff.toolkit.utils.rdkit_wrapper import RDKitToolkitWrapper

        if thermoml_string is None:
            raise ValueError("The string cannot be `None`.")

        molecule = Molecule.from_smiles(
            thermoml_string, toolkit_registry=RDKitToolkitWrapper()
        )

        return molecule.to_smiles(isomeric=True, explicit_hydrogens=False, mapped=False)

    @staticmethod
    def smiles_from_common_name(common_name):
        """Attempts to create a SMILES pattern from an IUPAC name.

        Parameters
        ----------
        common_name: str
            The common name to convert.

        Returns
        -------
        str, None
            None if the identifier cannot be converted, otherwise the converted SMILES pattern.
        """
        from openff.toolkit.topology import Molecule
        from openff.toolkit.utils import InvalidIUPACNameError, LicenseError

        if common_name is None:
            return None

        try:
            molecule = Molecule.from_iupac(common_name, allow_undefined_stereo=True)
            # NOTE(review): the `to_smiles` arguments were missing from the
            # extracted source; they mirror the call that is visible in
            # `smiles_from_thermoml_smiles_string`. Confirm against upstream.
            smiles = molecule.to_smiles(
                isomeric=True, explicit_hydrogens=False, mapped=False
            )

            # Treat an empty pattern the same as a failed conversion.
            if isinstance(smiles, str) and len(smiles) == 0:
                smiles = None

        except LicenseError:
            smiles = None
        except (ValueError, InvalidIUPACNameError):
            smiles = None

        return smiles

    @classmethod
    def from_xml_node(cls, node, namespace):
        """Creates a _Compound from an xml node.

        The SMILES pattern is derived from the first identifier which can
        be converted, trying in order the standard InChI, the SMILES string
        and finally the common name stored on the node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.

        Returns
        -------
        _Compound, optional
            The created compound wrapper, or `None` if none of the
            identifiers could be converted into a SMILES pattern.

        Raises
        ------
        ValueError
            If a SMILES pattern was found but the node defines no index.
        """
        # Gather up all possible identifiers
        inchi_identifier_nodes = node.findall("ThermoML:sStandardInChI", namespace)
        smiles_identifier_nodes = node.findall("ThermoML:sSmiles", namespace)
        common_identifier_nodes = node.findall("ThermoML:sCommonName", namespace)

        smiles = None

        if (
            len(inchi_identifier_nodes) > 0
            and inchi_identifier_nodes[0].text is not None
        ):
            # Convert InChI key to a smiles pattern.
            smiles = cls.smiles_from_inchi_string(inchi_identifier_nodes[0].text)

        if (
            smiles is None
            and len(smiles_identifier_nodes) > 0
            and smiles_identifier_nodes[0].text is not None
        ):
            # Standardise the smiles pattern by round-tripping it through
            # the toolkit.
            smiles = cls.smiles_from_thermoml_smiles_string(
                smiles_identifier_nodes[0].text
            )

        if (
            smiles is None
            and len(common_identifier_nodes) > 0
            and common_identifier_nodes[0].text is not None
        ):
            # Convert the common name to a smiles pattern.
            smiles = cls.smiles_from_common_name(common_identifier_nodes[0].text)

        if smiles is None:
            logging.debug(
                "A ThermoML:Compound node does not have a valid InChI identifier, "
                "a valid SMILES pattern, or an understandable common name."
            )

            return None

        index_node = node.find("./ThermoML:RegNum/*", namespace)

        if index_node is None:
            raise ValueError("A ThermoML:Compound does not have an index defined.")

        compound_index = int(index_node.text)

        return_value = cls()

        return_value.smiles = smiles
        return_value.index = compound_index

        return return_value

class _PureOrMixtureData:
    """A wrapper around a ThermoML PureOrMixtureData node."""

    def extract_compound_indices(node, namespace, compounds):
        """Extract a list of the compound indices which a given `PureOrMixtureData`
        node depends upon.

        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            The compounds which were able to be parsed from the
            parent archive file, with keys of their assigned

        component_nodes = node.findall("ThermoML:Component", namespace)
        compound_indices = []

        # Figure out which compounds are going to be associated with
        # the property entries.
        for component_node in component_nodes:
            index_node = component_node.find("./ThermoML:RegNum/*", namespace)

            compound_index = int(index_node.text)

            if compound_index not in compounds:
                    "A PureOrMixtureData entry depends on an "
                    "unsupported compound and has been ignored"

                return None

            if compound_index in compound_indices:
                raise ValueError(
                    "A ThermoML:PureOrMixtureData states its dependency on the "
                    "same compound twice."


        return compound_indices

    def extract_property_definitions(node, namespace, parent_phases):
        """Extract those property definitions defined by a PureOrMixtureData
        node. The extracted definitions are not extracted, as these a defined
        elsewhere in the archive file.

        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        parent_phases: PropertyPhase
            The phases specified by the parent PureOrMixtureData node.

        dict of int and ThermoMLProperty
            The extracted property definitions with keys of their
            assigned indices.

        property_nodes = node.findall("ThermoML:Property", namespace)
        properties = {}

        for property_node in property_nodes:
            property_definition = ThermoMLProperty.from_xml_node(
                property_node, namespace, parent_phases

            if property_definition is None:

            if property_definition.index in properties:
                raise ValueError(
                    "A ThermoML data set contains two properties with the same index"

            properties[property_definition.index] = property_definition

        return properties

    def validate_constraint(constraint, compounds):
        """Validates a constraint object - this may be either
        a full `_Constraint` or just a `_VariableDefinition`.

        constraint: _Constraint or _VariableDefinition
            The constraint to validate.
        compounds: dict of int and _Compound
            A dictionary of the compounds the parent PureOrMixtureData was
            measured for.

            True if the constraint is valid, False otherwise.

        if constraint is None or constraint.type is _ConstraintType.Undefined:
            logging.debug("An unsupported constraint has been ignored.")
            return False

        if (
            constraint.compound_index is not None
            and constraint.compound_index not in compounds
                "A constraint exists upon a non-existent compound and will be ignored."
            return False

        if (
            and constraint.compound_index is None
                "An unsupported constraint has been ignored - composition constraints"
                "need to have a corresponding compound_index."
            return False

        return True

    def extract_global_constraints(node, namespace, compounds):
        """Extract the constraints which should be applied to all of
        the properties defined in a `PureOrMixtureData` node.

        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was
            measured for.

        list of _Constraint, optional
            The extracted constraints if all could be parsed,
            otherwise `None`.

        constraint_nodes = node.findall("ThermoML:Constraint", namespace)
        constraints = []

        for constraint_node in constraint_nodes:
            constraint = _Constraint.from_node(constraint_node, namespace)

            if not _PureOrMixtureData.validate_constraint(constraint, compounds):
                return None


        return constraints

    def extract_variable_definitions(node, namespace, compounds):
        """Extract all of the 'variables' in a PureOrMixtureData node.
        These are simply constraints whose values are defined elsewhere
        in the archive.

        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was calculated for.

        dict of int and _VariableDefinition
            The extracted variable definitions which could be parsed.
        variable_nodes = node.findall("ThermoML:Variable", namespace)
        variables = {}

        for variable_node in variable_nodes:
            variable = _VariableDefinition.from_node(variable_node, namespace)

            if not _PureOrMixtureData.validate_constraint(variable, compounds):

            variables[variable.index] = variable

        return variables

    def extract_uncertainty(node, namespace, property_definition):
        """Extracts the uncertainties on the measured properties
        contained in this `PureOrMixtureData` node.

        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        property_definition: ThermoMLProperty
            The property to which this uncertainty is attached.

        float, optional
            The uncertainty in the property if it can be parsed or
            if one is present, otherwise `None`.

        # Look for a standard uncertainty..
        uncertainty_node = node.find(".//ThermoML:nCombStdUncertValue", namespace)

        if uncertainty_node is None:
            uncertainty_node = node.find(".//ThermoML:nStdUncertValue", namespace)

        # We have found a std. uncertainty
        if uncertainty_node is not None:
            return float(uncertainty_node.text)

        # Try to calculate uncertainty from a coverage factor if present
        if (
            len(property_definition.combined_uncertainty_definitions) == 0
            and len(property_definition.property_uncertainty_definitions) == 0
            return None

        combined = len(property_definition.combined_uncertainty_definitions) > 0

        prefix = (
            _CombinedUncertainty.prefix if combined else _PropertyUncertainty.prefix

        if combined:
            index_node = node.find(
            index_node = node.find(
                "./ThermoML:PropUncertainty/ThermoML:nUncertAssessNum", namespace

        expanded_uncertainty_node = node.find(
            ".//ThermoML:n" + prefix + "ExpandUncertValue", namespace

        if index_node is None or expanded_uncertainty_node is None:
            return None

        expanded_uncertainty = float(expanded_uncertainty_node.text)
        index = int(index_node.text)

        if (
            and index not in property_definition.combined_uncertainty_definitions
            return None

        if (
            not combined
            and index not in property_definition.property_uncertainty_definitions
            return None

        divisor = (
            if combined
            else property_definition.property_uncertainty_definitions[

        return expanded_uncertainty / divisor

    @staticmethod
    def _smiles_to_molecular_weight(smiles):
        """Calculates the molecular weight of a substance specified
        by a smiles string.

        Parameters
        ----------
        smiles: str
            The smiles string to calculate the molecular weight of.

        Returns
        -------
        openff.evaluator.unit.Quantity
            The molecular weight in units of grams per mole.

        Raises
        ------
        ValueError
            If the toolkit cannot create a molecule from the smiles pattern.
        """

        from openff.toolkit.topology import Molecule

        try:
            molecule = Molecule.from_smiles(smiles)

        except Exception as e:
            formatted_exception = traceback.format_exception(None, e, e.__traceback__)

            # Chain the original exception so that its traceback is kept.
            raise ValueError(
                f"The toolkit raised an exception for the "
                f"{smiles} smiles pattern: {formatted_exception}"
            ) from e

        molecular_weight = 0.0 * unit.dalton

        for atom in molecule.atoms:
            molecular_weight += atom.mass

        # Molecular weight in Daltons is not a molar quantity, so need
        # multiply it through by Avogadro's number per mol to become gram/mol
        return unit.avogadro_number * molecular_weight / unit.mol

    @staticmethod
    def _solvent_mole_fractions_to_moles(
        solvent_mass, solvent_mole_fractions, solvent_compounds
    ):
        """Converts a set of solvent mole fractions to moles for a
        given mass of solvent.

        Parameters
        ----------
        solvent_mass: openff.evaluator.unit.Quantity
            The total mass of the solvent in units compatible with kg.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system.
        solvent_compounds: dict of int and float
            A dictionary of any solvent compounds in the system.

        Returns
        -------
        dict of int and openff.evaluator.unit.Quantity
            A dictionary of the moles of each solvent compound.
        """
        # The mole fraction weighted average molecular weight of the solvent.
        weighted_molecular_weights = 0.0 * unit.gram / unit.mole
        number_of_moles = {}

        for solvent_index in solvent_compounds:
            solvent_smiles = solvent_compounds[solvent_index].smiles

            solvent_fraction = solvent_mole_fractions[solvent_index]
            solvent_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                solvent_smiles
            )

            weighted_molecular_weights += solvent_weight * solvent_fraction

        total_solvent_moles = solvent_mass / weighted_molecular_weights

        for solvent_index in solvent_compounds:
            moles = solvent_mole_fractions[solvent_index] * total_solvent_moles
            number_of_moles[solvent_index] = moles

        return number_of_moles

    @staticmethod
    def _convert_mole_fractions(constraints, compounds, solvent_mole_fractions=None):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.

        Raises
        ------
        ValueError
            If the number of constraints is inconsistent with the number
            of compounds.
        """

        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mole_fractions = {}
        total_mol_fraction = 0.0

        for constraint in constraints:
            mole_fraction = constraint.value

            if isinstance(mole_fraction, unit.Quantity):
                # NOTE(review): the right-hand side of this assignment was
                # truncated in the extracted source; stripping the
                # (dimensionless) unit is the reconstructed intent.
                mole_fraction = mole_fraction.to(unit.dimensionless).magnitude

            mole_fractions[constraint.compound_index] = mole_fraction
            total_mol_fraction += mole_fractions[constraint.compound_index]

        if (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ) or (
            number_of_constraints != len(compounds) - 1
            and number_of_constraints != len(compounds)
            and solvent_mole_fractions is None
        ):
            raise ValueError(
                f"The number of mole fraction constraints ({number_of_constraints}) must be one "
                f"less than or equal to the number of compounds being constrained ({len(compounds)}) "
                f"if a solvent list is not present, otherwise there must be an equal number."
            )

        # Handle the case were a single mole fraction constraint is missing.
        if number_of_constraints == len(compounds) - 1:
            for compound_index in compounds:
                if compound_index in mole_fractions:
                    continue

                mole_fractions[compound_index] = 1.0 - total_mol_fraction

        # Recompute the total mole fraction to be safe.
        total_mol_fraction = 0.0

        for compound_index in mole_fractions:
            total_mol_fraction += mole_fractions[compound_index]

        # Account for any solvent present.
        if solvent_mole_fractions is not None:
            # Assume the remainder of the mole fraction is the solvent.
            remaining_mole_fraction = 1.0 - total_mol_fraction

            for solvent_index in solvent_mole_fractions:
                mole_fractions[solvent_index] = (
                    solvent_mole_fractions[solvent_index] * remaining_mole_fraction
                )

        return mole_fractions

    @staticmethod
    def _convert_mass_fractions(
        constraints, compounds, solvent_mole_fractions=None, solvent_compounds=None
    ):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.
        solvent_compounds: dict of int and float
            A dictionary of any explicitly defined solvent compounds in the
            system.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.

        Raises
        ------
        ValueError
            If the number of constraints is inconsistent with the number
            of compounds.
        """

        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mass_fractions = {}
        total_mass_fraction = 0.0

        for constraint in constraints:
            mass_fraction = constraint.value

            if isinstance(mass_fraction, unit.Quantity):
                # NOTE(review): the right-hand side of this assignment was
                # truncated in the extracted source; stripping the
                # (dimensionless) unit is the reconstructed intent.
                mass_fraction = mass_fraction.to(unit.dimensionless).magnitude

            mass_fractions[constraint.compound_index] = mass_fraction
            total_mass_fraction += mass_fraction

        if (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ) or (
            number_of_constraints != len(compounds) - 1
            and number_of_constraints != len(compounds)
            and solvent_mole_fractions is None
        ):
            raise ValueError(
                f"The number of mass fraction constraints ({number_of_constraints}) must be one "
                f"less than or equal to the number of compounds being constrained ({len(compounds)}) "
                f"if a solvent list is not present, otherwise there must be an equal number."
            )

        # Handle the case were a single mass fraction constraint is missing.
        if number_of_constraints == len(compounds) - 1:
            for compound_index in compounds:
                if compound_index in mass_fractions:
                    continue

                mass_fractions[compound_index] = 1.0 - total_mass_fraction
                if isinstance(mass_fractions[compound_index], unit.Quantity):
                    # NOTE(review): reconstructed - see the note above.
                    mass_fractions[compound_index] = (
                        mass_fractions[compound_index].to(unit.dimensionless).magnitude
                    )

        # Work with an arbitrary total mass; only the ratios matter for the
        # final mole fractions.
        total_mass = 1 * unit.gram
        total_solvent_mass = total_mass

        moles = {}
        total_moles = 0.0 * unit.mole

        for compound_index in compounds:
            compound_smiles = compounds[compound_index].smiles
            compound_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                compound_smiles
            )

            moles[compound_index] = (
                total_mass * mass_fractions[compound_index] / compound_weight
            )
            total_moles += moles[compound_index]

            # Whatever mass is not taken up by the compounds is the solvent.
            total_solvent_mass -= total_mass * mass_fractions[compound_index]

        if (
            number_of_constraints == len(compounds)
            and solvent_mole_fractions is not None
        ):
            solvent_moles = _PureOrMixtureData._solvent_mole_fractions_to_moles(
                total_solvent_mass, solvent_mole_fractions, solvent_compounds
            )

            for solvent_index in solvent_moles:
                moles[solvent_index] = solvent_moles[solvent_index]
                total_moles += solvent_moles[solvent_index]

        mole_fractions = {}

        for compound_index in moles:
            mole_fraction = moles[compound_index] / total_moles
            mole_fractions[compound_index] = mole_fraction

        return mole_fractions

    @staticmethod
    def _convert_molality(
        constraints, compounds, solvent_mole_fractions=None, solvent_compounds=None
    ):
        """Converts a set of `_Constraint` to mole fractions.

        Parameters
        ----------
        constraints: list of _Constraint
            The constraints to convert.
        compounds: dict of int and _Compound
            The compounds in the system.
        solvent_mole_fractions: dict of int and float
            The mole fractions of any solvent compounds in the system,
            where the total mole fraction of all solvents must be equal
            to one.
        solvent_compounds: dict of int and float
            A dictionary of any explicitly defined solvent compounds in the
            system.

        Returns
        -------
        dict of int and float
            A dictionary of compound indices and mole fractions.

        Raises
        ------
        ValueError
            If the number of constraints is inconsistent with the number
            of compounds.
        """
        number_of_moles = {}
        # noinspection PyTypeChecker
        number_of_constraints = len(constraints)

        mole_fractions = {}

        total_number_of_moles = 0.0 * unit.moles
        # Molalities are expressed per unit mass of solvent, so work with a
        # fixed reference mass of solvent.
        total_solvent_mass = 1.0 * unit.kilograms

        for constraint in constraints:
            molality = constraint.value
            moles = molality * total_solvent_mass

            number_of_moles[constraint.compound_index] = moles
            total_number_of_moles += moles

        if (
            number_of_constraints != len(compounds) - 1
            and solvent_mole_fractions is None
        ) or (
            number_of_constraints != len(compounds)
            and solvent_mole_fractions is not None
        ):
            raise ValueError(
                f"The number of molality constraints ({number_of_constraints}) must be one "
                f"less than the number of compounds being constrained ({len(compounds)}) if a "
                f"solvent list is not present, otherwise there must be an equal number."
            )

        if (
            number_of_constraints == len(compounds) - 1
            and solvent_mole_fractions is None
        ):
            # In this case, there is no explicit solvent entry and the last component
            # whose molality has not been constrained is considered to be the 'solvent'
            for compound_index in compounds:
                if compound_index in number_of_moles:
                    continue

                compound_smiles = compounds[compound_index].smiles
                compound_weight = _PureOrMixtureData._smiles_to_molecular_weight(
                    compound_smiles
                )

                moles = total_solvent_mass / compound_weight

                number_of_moles[compound_index] = moles
                total_number_of_moles += moles

        elif (
            number_of_constraints == len(compounds)
            and solvent_mole_fractions is not None
        ):
            solvent_moles = _PureOrMixtureData._solvent_mole_fractions_to_moles(
                total_solvent_mass, solvent_mole_fractions, solvent_compounds
            )

            for solvent_index in solvent_moles:
                number_of_moles[solvent_index] = solvent_moles[solvent_index]
                total_number_of_moles += solvent_moles[solvent_index]

        for compound_index in number_of_moles:
            mole_fraction = number_of_moles[compound_index] / total_number_of_moles
            mole_fractions[compound_index] = mole_fraction

        return mole_fractions

    @staticmethod
    def build_substance(thermoml_property, constraints, compounds):
        """Build a Substance object from the extracted constraints and compounds.

        Parameters
        ----------
        thermoml_property: ThermoMLProperty
            The property to which this mixture belongs.
        constraints: list of _Constraint
            The ThermoML constraints.
        compounds: dict of int and _Compound
            A dictionary of the compounds this PureOrMixtureData was calculated for.

        Returns
        -------
        Substance
            The constructed substance, or `None` if the composition
            constraints are inconsistent or refer to an unknown solvent.

        Raises
        ------
        ValueError
            If a mole fraction could not be determined for every compound, or
            if the mole fractions do not sum to one.
        """

        # TODO: We need to take into account `thermoml_property.target_compound_index` and
        #       `thermoml_property.solute_standard_state` to properly identify infinitely
        #       diluted solutes in the system (if any). Otherwise the solute will be
        #       assigned a mole fraction of zero.

        solvent_constraint_types = (
            _ConstraintType.SolventMassFraction,
            _ConstraintType.SolventMoleFraction,
            _ConstraintType.SolventMolality,
        )

        solvent_constraint_type = _ConstraintType.Undefined
        component_constraint_type = _ConstraintType.Undefined

        solvent_indices = set(thermoml_property.solvents)

        # Determine which types of solvent and component constraints are
        # being applied.
        for constraint in constraints:
            # Make sure we hunt down solvent indices.
            solvent_indices.update(constraint.solvents)

            # Only composition type restraints apply here, skip
            # the rest.
            if not constraint.type.is_composition_constraint():
                continue

            if constraint.type in solvent_constraint_types:
                if solvent_constraint_type == _ConstraintType.Undefined:
                    solvent_constraint_type = constraint.type

                if solvent_constraint_type != constraint.type:
                    logger.debug(
                        f"A property with different types of solvent composition constraints "
                        f"was found - {solvent_constraint_type} vs {constraint.type}). This "
                        f"is likely a bug in the ThermoML file and so this property will be "
                        f"skipped."
                    )

                    return None

            else:
                if component_constraint_type == _ConstraintType.Undefined:
                    component_constraint_type = constraint.type

                if component_constraint_type != constraint.type:
                    logger.debug(
                        f"A property with different types of composition constraints "
                        f"was found - {component_constraint_type} vs {constraint.type}). This "
                        f"is likely a bug in the ThermoML file and so this property will be "
                        f"skipped."
                    )

                    return None

        # If no constraint was applied, this very likely means a pure substance
        # was found.
        if (
            component_constraint_type == _ConstraintType.Undefined
            and solvent_constraint_type == _ConstraintType.Undefined
        ):
            component_constraint_type = _ConstraintType.ComponentMoleFraction

        elif (
            component_constraint_type == _ConstraintType.Undefined
            and solvent_constraint_type != _ConstraintType.Undefined
        ):
            logger.debug(
                f"A property with only solvent composition "
                f"constraints {solvent_constraint_type} was found."
            )

            return None

        solvent_mole_fractions = {}

        solvent_constraints = [
            constraint
            for constraint in constraints
            if constraint.type == solvent_constraint_type
        ]

        solvent_compounds = {}

        for solvent_index in solvent_indices:
            if solvent_index in compounds:
                solvent_compounds[solvent_index] = compounds[solvent_index]
                continue

            logger.debug(
                "The composition of a non-existent solvent was "
                "found. This usually only occurs in cases were "
                "the solvent component could not be understood "
                "by the framework."
            )

            return None

        # Make sure all of the solvents have not been removed.
        if (
            solvent_constraint_type != _ConstraintType.Undefined
            and len(solvent_indices) == 0
        ):
            logger.debug(
                "The composition of a solvent was found, however the "
                "solvent list is empty. This usually only occurs in "
                "cases were the solvent component could not be understood "
                "by the framework."
            )

            return None

        remaining_constraints = [
            constraint
            for constraint in constraints
            if constraint.type == component_constraint_type
        ]

        remaining_compounds = {
            compound_index: compounds[compound_index]
            for compound_index in compounds
            if compound_index not in solvent_indices
        }

        # Determine the mole fractions of the solvent species, if any.
        if solvent_constraint_type == _ConstraintType.SolventMoleFraction:
            solvent_mole_fractions = _PureOrMixtureData._convert_mole_fractions(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.SolventMassFraction:
            solvent_mole_fractions = _PureOrMixtureData._convert_mass_fractions(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.SolventMolality:
            solvent_mole_fractions = _PureOrMixtureData._convert_molality(
                solvent_constraints, solvent_compounds
            )

        elif solvent_constraint_type == _ConstraintType.Undefined:
            # No solvent composition was given, so every compound is treated
            # as a regular component.
            solvent_mole_fractions = None
            solvent_compounds = None

            remaining_compounds = compounds

        # Determine the mole fractions of the remaining compounds.
        mole_fractions = {}

        if component_constraint_type == _ConstraintType.ComponentMoleFraction:
            mole_fractions = _PureOrMixtureData._convert_mole_fractions(
                remaining_constraints, remaining_compounds, solvent_mole_fractions
            )

        elif component_constraint_type == _ConstraintType.ComponentMassFraction:
            # NOTE(review): the argument list was missing from the extracted
            # source; restored to mirror `_convert_molality`'s signature.
            mole_fractions = _PureOrMixtureData._convert_mass_fractions(
                remaining_constraints,
                remaining_compounds,
                solvent_mole_fractions,
                solvent_compounds,
            )

        elif component_constraint_type == _ConstraintType.ComponentMolality:
            mole_fractions = _PureOrMixtureData._convert_molality(
                remaining_constraints,
                remaining_compounds,
                solvent_mole_fractions,
                solvent_compounds,
            )

        if len(mole_fractions) != len(compounds):
            raise ValueError(
                f"The number of mole fractions ({len(mole_fractions)}) does not "
                f"equal the total number of compounds ({len(compounds)})"
            )

        # Make sure we haven't picked up a dimensionless unit by accident.
        for compound_index in mole_fractions:
            if isinstance(mole_fractions[compound_index], unit.Quantity):
                # NOTE(review): the right-hand side was missing from the
                # extracted source; restored as a plain-number conversion.
                mole_fractions[compound_index] = (
                    mole_fractions[compound_index].to(unit.dimensionless).magnitude
                )

        total_mol_fraction = sum(mole_fractions.values())

        if not np.isclose(total_mol_fraction, 1.0):
            raise ValueError(
                f"The total mole fraction {total_mol_fraction} is not equal to 1.0"
            )

        substance = Substance()

        for compound_index in compounds:
            compound = compounds[compound_index]

            # Compounds which are (numerically) absent are left out entirely.
            if np.isclose(mole_fractions[compound_index], 0.0):
                continue

            # NOTE(review): this call was missing from the extracted source;
            # restored from the `Component` / `MoleFraction` imports.
            substance.add_component(
                component=Component(smiles=compound.smiles),
                amount=MoleFraction(mole_fractions[compound_index]),
            )

        return substance

    @staticmethod
    def extract_measured_properties(
        node,
        namespace,
        property_definitions,
        global_constraints,
        variable_definitions,
        compounds,
    ):
        """Extract the measured properties defined by a ThermoML
        PureOrMixtureData node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        property_definitions: dict of int and ThermoMLProperty
            The extracted property definitions.
        global_constraints: list of _Constraint
            The extracted constraints.
        variable_definitions: dict of int and _VariableDefinition
            The extracted variable definitions.
        compounds: dict of int and _Compound
            The extracted compounds.

        Returns
        -------
        list of ThermoMLProperty
            The extracted measured properties.
        """

        value_nodes = node.findall("ThermoML:NumValues", namespace)

        measured_properties = []

        # Each value_node corresponds to one measure property.
        for value_node in value_nodes:
            constraints = []

            temperature_constraint = None
            pressure_constraint = None

            # Copy the global constraints so that per-value processing can
            # never modify the shared definitions.
            for global_constraint in global_constraints:
                constraint = copy.deepcopy(global_constraint)
                constraints.append(constraint)

                if constraint.type == _ConstraintType.Temperature:
                    temperature_constraint = constraint
                elif constraint.type == _ConstraintType.Pressure:
                    pressure_constraint = constraint

            # First extract the values of any variable constraints
            variable_nodes = value_node.findall("ThermoML:VariableValue", namespace)

            skip_entry = False

            for variable_node in variable_nodes:
                variable_index = int(
                    variable_node.find("./ThermoML:nVarNumber", namespace).text
                )

                if variable_index not in variable_definitions:
                    # The property was constrained by an unsupported variable and
                    # so will be skipped for now.
                    skip_entry = True
                    break

                variable_definition = variable_definitions[variable_index]

                variable_value = float(
                    variable_node.find("ThermoML:nVarValue", namespace).text
                )
                value_as_quantity = unit.Quantity(
                    variable_value, variable_definition.default_unit
                )

                # Convert the 'variable' into a full constraint entry
                constraint = _Constraint.from_variable(
                    variable_definition, value_as_quantity
                )
                constraints.append(constraint)

                # A variable overrides any global constraint of the same kind.
                if constraint.type == _ConstraintType.Temperature:
                    temperature_constraint = constraint
                elif constraint.type == _ConstraintType.Pressure:
                    pressure_constraint = constraint

            if skip_entry:
                continue

            # Extract the thermodynamic state that the property was measured at.
            if temperature_constraint is None:
                logger.debug(
                    "A property did not report the temperature it "
                    "was measured at and will be ignored."
                )
                continue

            temperature = temperature_constraint.value
            pressure = (
                None if pressure_constraint is None else pressure_constraint.value
            )

            thermodynamic_state = ThermodynamicState(
                temperature=temperature, pressure=pressure
            )

            # Now extract the actual values of the measured properties, and their
            # uncertainties
            property_nodes = value_node.findall("ThermoML:PropertyValue", namespace)

            for property_node in property_nodes:
                property_index = int(
                    property_node.find("./ThermoML:nPropNumber", namespace).text
                )

                if property_index not in property_definitions:
                    # Most likely the property was dropped earlier due to an
                    # unsupported phase / type
                    continue

                property_definition = property_definitions[property_index]

                uncertainty = _PureOrMixtureData.extract_uncertainty(
                    property_node, namespace, property_definition
                )

                # Each measurement gets its own copy of the definition.
                measured_property = copy.deepcopy(property_definition)
                measured_property.thermodynamic_state = thermodynamic_state

                property_value_node = property_node.find(
                    ".//ThermoML:nPropValue", namespace
                )

                # NOTE(review): the head of this call was missing from the
                # extracted source; restored against `ThermoMLProperty.set_value`.
                measured_property.set_value(
                    float(property_value_node.text),
                    None if uncertainty is None else float(uncertainty),
                )

                mixture = _PureOrMixtureData.build_substance(
                    measured_property, constraints, compounds
                )

                if mixture is None:
                    continue

                measured_property.substance = mixture
                measured_properties.append(measured_property)

        # By this point we now have the measured properties and the thermodynamic state
        # they were measured at converted to standardised classes.
        return measured_properties

    @staticmethod
    def from_xml_node(node, namespace, compounds):
        """Extracts all of the data in a ThermoML PureOrMixtureData node.

        Parameters
        ----------
        node: xml.etree.Element
            The xml node to read.
        namespace: dict of str and str
            The xml namespace.
        compounds: dict of int and _Compound
            A list of the already extracted `_Compound`'s.

        Returns
        -------
        list of ThermoMLProperty
            A list of extracted properties, or `None` if the entry could
            not be interpreted and was skipped.
        """

        # Figure out which compounds are going to be associated with
        # the property entries.
        compound_indices = _PureOrMixtureData.extract_compound_indices(
            node, namespace, compounds
        )

        if compound_indices is None:
            # Most likely this entry depended on a non-parsable compound
            # and will be skipped entirely
            return None

        if len(compound_indices) == 0:
            logger.debug("A PureOrMixtureData entry with no compounds was ignored.")
            return None

        phase_nodes = node.findall("./ThermoML:PhaseID/ThermoML:ePhase", namespace)

        # Start from the empty phase rather than `None` so that entries without
        # any PhaseID nodes can still be combined with `|` further down.
        # NOTE(review): assumes `PropertyPhase.Undefined` is the zero-valued
        # flag, as its use with `|` elsewhere in this file suggests - confirm.
        all_phases = PropertyPhase.Undefined

        for phase_node in phase_nodes:
            phase = _phase_from_thermoml_string(phase_node.text)

            if phase == PropertyPhase.Undefined:
                logger.debug(
                    f"A property was measured in an unsupported phase "
                    f"({phase_node.text}) and will be skipped."
                )

                return None

            all_phases |= phase

        # Extract property definitions - values come later!
        property_definitions = _PureOrMixtureData.extract_property_definitions(
            node, namespace, all_phases
        )

        if len(property_definitions) == 0:
            return None

        for property_index in property_definitions:
            all_phases |= property_definitions[property_index].phase
            property_definitions[property_index].phase |= all_phases

        # Extract any constraints on the system e.g pressure, temperature
        global_constraints = _PureOrMixtureData.extract_global_constraints(
            node, namespace, compounds
        )

        if global_constraints is None:
            return None

        # Extract any variables set on the system e.g pressure, temperature
        # Only the definition entry and not the value of the variable is extracted
        variable_definitions = _PureOrMixtureData.extract_variable_definitions(
            node, namespace, compounds
        )

        if len(global_constraints) == 0 and len(variable_definitions) == 0:
            logger.debug("A PureOrMixtureData entry with no constraints was ignored.")
            return None

        # Only the compounds referenced by this entry take part in building
        # the substances.
        used_compounds = {
            compound_index: compounds[compound_index]
            for compound_index in compounds
            if compound_index in compound_indices
        }

        # NOTE(review): the argument list was missing from the extracted
        # source; restored in the order documented by
        # `extract_measured_properties`.
        measured_properties = _PureOrMixtureData.extract_measured_properties(
            node,
            namespace,
            property_definitions,
            global_constraints,
            variable_definitions,
            used_compounds,
        )

        return measured_properties

class ThermoMLProperty:
    """A wrapper around a ThermoML Property node."""

    class SoluteStandardState(Enum):
        """Describes the standard state of a solute."""

        # The values are the strings compared against the text of an
        # `eStandardState` node. They must be plain strings (not 1-tuples)
        # for the value lookup in `from_node` to be able to succeed.
        Undefined = "Undefined"
        InfiniteDilutionSolute = "Infinite dilution solute"
        PureCompound = "Pure compound"
        PureLiquidSolute = "Pure liquid solute"
        StandardMolality = "Standard molality (1 mol/kg) solute"

        @staticmethod
        def from_node(node):
            """Converts an `eStandardState` node a `ThermoMLProperty.SoluteStandardState`.

            Parameters
            ----------
            node: xml.etree.Element
                The xml node to convert.

            Returns
            -------
            ThermoMLProperty.SoluteStandardState
                The converted state type, or `Undefined` if the node text
                does not match any supported state.
            """
            state_text = (node.text or "").strip()

            try:
                standard_state = ThermoMLProperty.SoluteStandardState(state_text)
            except (KeyError, ValueError):
                standard_state = ThermoMLProperty.SoluteStandardState.Undefined

            if standard_state == ThermoMLProperty.SoluteStandardState.Undefined:
                logger.debug(
                    f"{node.tag}->{node.text} is an unsupported "
                    f"solute standard state type."
                )

            return standard_state

    def __repr__(self):
        return (
            f"<ThermoMLProperty {self.type_string}, substance={self.substance}, "
            f"thermodynamic_state={self.thermodynamic_state}, "
            f"value={self.value}, uncertainty={self.uncertainty} >"
        )

    def __init__(self, type_string):
        """Creates an empty property of the given ThermoML type.

        Parameters
        ----------
        type_string: str
            The ThermoML name of the property.
        """
        self.type_string = type_string

        self.thermodynamic_state = None
        self.phase = PropertyPhase.Undefined

        self.substance = None

        self.value = None
        self.uncertainty = None

        self.source = None

        self.index = None

        # Populated by `from_xml_node`; declared here so the attribute
        # always exists.
        self.method_name = None

        self.solute_standard_state = ThermoMLProperty.SoluteStandardState.Undefined
        self.solvents = []

        self.target_compound_index = None

        self.property_uncertainty_definitions = {}
        self.combined_uncertainty_definitions = {}

        self.default_unit = None

    @staticmethod
    def extract_uncertainty_definitions(
        node,
        namespace,
        property_uncertainty_definitions,
        combined_uncertainty_definitions,
    ):
        """Extract any property or combined uncertainties from a property xml node.

        The two dictionaries passed in are filled in place, keyed by the
        index of each extracted uncertainty definition.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.
        property_uncertainty_definitions: list(_PropertyUncertainty)
            A list of the extracted property uncertainties.
        combined_uncertainty_definitions: list(_PropertyUncertainty)
            A list of the extracted combined property uncertainties.
        """

        property_nodes = node.findall("ThermoML:CombinedUncertainty", namespace)

        for property_node in property_nodes:
            if property_node is None:
                continue

            uncertainty_definition = _CombinedUncertainty.from_xml(
                property_node, namespace
            )

            if uncertainty_definition is None:
                continue

            combined_uncertainty_definitions[uncertainty_definition.index] = (
                uncertainty_definition
            )

        property_nodes = node.findall("ThermoML:PropUncertainty", namespace)

        for property_node in property_nodes:
            if property_node is None:
                continue

            uncertainty_definition = _PropertyUncertainty.from_xml(
                property_node, namespace
            )

            if uncertainty_definition is None:
                continue

            property_uncertainty_definitions[uncertainty_definition.index] = (
                uncertainty_definition
            )

    @classmethod
    def from_xml_node(cls, node, namespace, parent_phases):
        """Creates a ThermoMLProperty from an xml node.

        Parameters
        ----------
        node: Element
            The xml node to convert.
        namespace: dict of str and str
            The xml namespace.
        parent_phases: PropertyPhase
            The phases specfied in the parent PureOrMixtureData node.

        Returns
        -------
        ThermoMLProperty
            The created property, or `None` if the property type or its
            phase is not supported.

        Raises
        ------
        RuntimeError
            If the property has no name or no method entry.
        """

        # Gather up all possible identifiers
        index_node = node.find("ThermoML:nPropNumber", namespace)

        property_index = int(index_node.text)

        phase_node = node.find("./ThermoML:PropPhaseID//ThermoML:ePropPhase", namespace)
        phase = PropertyPhase.Undefined | parent_phases

        if phase_node is not None:
            phase |= _phase_from_thermoml_string(phase_node.text)

        reference_phase_node = node.find(
            "./ThermoML:RefPhaseID//ThermoML:eRefPhase", namespace
        )

        if reference_phase_node is not None:
            phase |= _phase_from_thermoml_string(reference_phase_node.text)

        if phase == PropertyPhase.Undefined:
            # The phase node is optional, so it cannot be dereferenced blindly.
            phase_text = None if phase_node is None else phase_node.text

            logger.debug(
                f"A property was measured in an unsupported phase "
                f"({phase_text}) and will be skipped."
            )

            return None

        property_group_node = node.find(
            "./ThermoML:Property-MethodID//ThermoML:PropertyGroup//*", namespace
        )

        property_name_node = property_group_node.find("./ThermoML:ePropName", namespace)
        method_name_node = property_group_node.find("./ThermoML:eMethodName", namespace)

        if method_name_node is None:
            # Fall back to the `sMethodName` element when there is no
            # `eMethodName`.
            method_name_node = property_group_node.find(
                "./ThermoML:sMethodName", namespace
            )

        if method_name_node is None or property_name_node is None:
            raise RuntimeError("A property does not have a name / method entry.")

        if property_name_node.text not in ThermoMLDataSet.registered_properties:
            logger.debug(
                f"An unsupported property was found "
                f"({property_name_node.text}) and will be skipped."
            )

            return None

        registered_plugin = ThermoMLDataSet.registered_properties[
            property_name_node.text
        ]

        # Every phase of the measurement must be supported by the plugin.
        if (registered_plugin.supported_phases & phase) != phase:
            logger.debug(
                f"The {property_name_node.text} property is currently only supported "
                f"when measured in the {str(registered_plugin.supported_phases)} phase, "
                f"and not the {str(phase)} phase."
            )

            return None

        return_value = cls(property_name_node.text)

        return_value.index = property_index
        return_value.phase = phase

        return_value.default_unit = _unit_from_thermoml_string(property_name_node.text)

        return_value.method_name = method_name_node.text

        property_uncertainty_definitions = {}
        combined_uncertainty_definitions = {}

        ThermoMLProperty.extract_uncertainty_definitions(
            node,
            namespace,
            property_uncertainty_definitions,
            combined_uncertainty_definitions,
        )

        return_value.combined_uncertainty_definitions = combined_uncertainty_definitions
        return_value.property_uncertainty_definitions = property_uncertainty_definitions

        solvent_index_nodes = node.findall(
            "./ThermoML:Solvent//ThermoML:nOrgNum", namespace
        )

        return_value.solvents.extend(
            int(solvent_index_node.text) for solvent_index_node in solvent_index_nodes
        )

        # The solute standard state appears to describe which a solute should
        # be present in only trace amounts. It only seems to be relevant for
        # activity based properties.
        standard_state_node = node.find("./ThermoML:eStandardState", namespace)

        if standard_state_node is not None:
            return_value.solute_standard_state = (
                ThermoMLProperty.SoluteStandardState.from_node(standard_state_node)
            )

        # Property->Property-MethodID->RegNum describes which compound is referred
        # to if the property is based on one of the compounds e.g. the activity
        # coefficient of compound 2.
        target_compound_node = node.find(
            "./ThermoML:Property-MethodID/ThermoML:RegNum/ThermoML:nOrgNum", namespace
        )

        if target_compound_node is not None:
            return_value.target_compound_index = int(target_compound_node.text)

        return return_value

    def set_value(self, value, uncertainty):
        """Set the value and uncertainty of this property, adding units if necessary.

        Parameters
        ----------
        value: float or unit.Quantity
            The value of the property
        uncertainty: float or unit.Quantity, optional
            The uncertainty in the value. If `None`, the currently stored
            uncertainty is left untouched.
        """
        value_quantity = value

        # Bare numbers are taken to be expressed in this property's default unit.
        if not isinstance(value_quantity, unit.Quantity):
            value_quantity = value * self.default_unit

        self.value = value_quantity

        if uncertainty is not None:
            uncertainty_quantity = uncertainty

            if not isinstance(uncertainty_quantity, unit.Quantity):
                uncertainty_quantity = uncertainty * self.default_unit

            self.uncertainty = uncertainty_quantity

class ThermoMLDataSet(PhysicalPropertyDataSet):
    """A dataset of physical property measurements created from a ThermoML dataset.

    Examples
    --------

    For example, we can use the DOI `10.1016/j.jct.2005.03.012` as a key
    for retrieving the dataset from the ThermoML Archive:

    >>> dataset = ThermoMLDataSet.from_doi('10.1016/j.jct.2005.03.012')

    You can also specify multiple ThermoML Archive keys to create a dataset
    from multiple ThermoML files:

    >>> thermoml_keys = ['10.1021/acs.jced.5b00365', '10.1021/acs.jced.5b00474']
    >>> dataset = ThermoMLDataSet.from_doi(*thermoml_keys)
    """

    # Maps a ThermoML property name onto the plugin used to convert it.
    registered_properties = {}

    def __init__(self):
        """Constructs a new ThermoMLDataSet object."""
        super().__init__()

    @staticmethod
    def _merge_data_sets(data_sets):
        """Merge an iterable of data sets into the first non-empty one.

        Parameters
        ----------
        data_sets: iterable of ThermoMLDataSet or None
            The data sets to merge. `None` and empty data sets are skipped.

        Returns
        -------
        ThermoMLDataSet
            The merged data set, or `None` if no data set contained any data.
        """
        merged = None

        for data_set in data_sets:
            if data_set is None or len(data_set) == 0:
                continue

            if merged is None:
                merged = data_set
            else:
                merged.merge(data_set)

        return merged

    @classmethod
    def from_doi(cls, *doi_list):
        """Load a ThermoML data set from a list of DOIs

        Parameters
        ----------
        doi_list: str
            The list of DOIs to pull data from

        Returns
        -------
        ThermoMLDataSet
            The loaded data set.
        """
        # NOTE(review): the archive base URL was stripped from the extracted
        # source; restored as the NIST TRC ThermoML archive - confirm.
        # E.g https://trc.nist.gov/ThermoML/10.1016/j.jct.2005.03.012.xml
        return cls._merge_data_sets(
            cls._from_url(
                f"https://trc.nist.gov/ThermoML/{doi}.xml", MeasurementSource(doi=doi)
            )
            for doi in doi_list
        )

    @classmethod
    def from_url(cls, *url_list):
        """Load a ThermoML data set from a list of URLs

        Parameters
        ----------
        url_list: str
            The list of URLs to pull data from

        Returns
        -------
        ThermoMLDataSet
            The loaded data set.
        """
        return cls._merge_data_sets(cls._from_url(url) for url in url_list)

    @classmethod
    def _from_url(cls, url, source=None):
        """Load a ThermoML data set from a given URL

        Parameters
        ----------
        url: str
            The URL to pull data from
        source: Source, optional
            An optional source which gives more information (e.g DOIs) for the url.

        Returns
        -------
        ThermoMLDataSet
            The loaded data set, or `None` if nothing could be retrieved
            from the URL.
        """
        if source is None:
            source = MeasurementSource(reference=url)

        return_value = None

        try:
            request = requests.get(url)
            request.raise_for_status()

            # Handle the case where ThermoML returns a 404 error code, but rather
            # redirects to an error page with code 200.
            if request.text.startswith("<html>"):
                raise HTTPError(url, 404, "Not found", None, None)

            return_value = cls.from_xml(request.text, source)

        except (HTTPError, requests.exceptions.HTTPError):
            logger.warning(f"No ThermoML file could be found at {url}")

        return return_value

    @classmethod
    def from_file(cls, *file_list):
        """Load a ThermoML data set from a list of files

        Parameters
        ----------
        file_list: str
            The list of files to pull data from

        Returns
        -------
        ThermoMLDataSet
            The loaded data set.
        """
        return cls._merge_data_sets(cls._from_file(file) for file in file_list)

    @classmethod
    def _from_file(cls, path):
        """Load a ThermoML data set from a given file

        Parameters
        ----------
        path: str
            The file path to pull data from

        Returns
        -------
        ThermoMLDataSet
            The loaded data set, or `None` if the file does not exist.
        """
        source = MeasurementSource(reference=path)
        return_value = None

        try:
            with open(path) as file:
                return_value = ThermoMLDataSet.from_xml(file.read(), source)

        except FileNotFoundError:
            logger.warning(f"No ThermoML file could be found at {path}")

        return return_value

    @classmethod
    def from_xml(cls, xml, default_source):
        """Load a ThermoML data set from an xml object.

        Parameters
        ----------
        xml: str
            The xml string to parse.
        default_source: Source
            The source to use if one cannot be parsed from the archive itself.

        Returns
        -------
        ThermoMLDataSet
            The loaded ThermoML data set, or `None` if the document does not
            look like a ThermoML data report.

        Raises
        ------
        RuntimeError
            If two compounds in the document share the same index.
        """
        root_node = ElementTree.fromstring(xml)

        if root_node is None:
            logger.warning("The ThermoML XML document could not be parsed.")
            return None

        if root_node.tag.find("DataReport") < 0:
            logger.warning(
                "The ThermoML XML document does not contain the expected root node."
            )
            return None

        # Extract the namespace that will prefix all type names. ElementTree
        # stores it in the tag as `{namespace}Name`.
        namespace_match = re.search(r"{.*\}", root_node.tag)

        if namespace_match is None:
            # Every lookup below is namespace qualified, so nothing could be
            # extracted from such a document anyway.
            logger.warning("The ThermoML XML document does not declare a namespace.")
            return None

        namespace_string = namespace_match.group(0)[1:-1]
        namespace = {"ThermoML": namespace_string}

        # Attempt to find a DOI for this archive
        doi_node = root_node.find("ThermoML:Citation/ThermoML:sDOI", namespace)

        if doi_node is not None:
            source = MeasurementSource(doi=doi_node.text)
        else:
            source = default_source

        return_value = ThermoMLDataSet()
        compounds = {}

        # Extract the base compounds present in the xml file
        for node in root_node.findall("ThermoML:Compound", namespace):
            compound = _Compound.from_xml_node(node, namespace)

            if compound is None:
                continue

            if compound.index in compounds:
                raise RuntimeError(
                    "A ThermoML data set contains two compounds with the same index"
                )

            compounds[compound.index] = compound

        # Pull out any and all properties in the file.
        for node in root_node.findall("ThermoML:PureOrMixtureData", namespace):
            properties = _PureOrMixtureData.from_xml_node(node, namespace, compounds)

            if properties is None or len(properties) == 0:
                continue

            for measured_property in properties:
                registered_plugin = ThermoMLDataSet.registered_properties[
                    measured_property.type_string
                ]

                mapped_property = registered_plugin.conversion_function(
                    measured_property
                )
                mapped_property.source = source

                return_value.add_properties(mapped_property)

        return return_value