import functools
import itertools
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import numpy
import pandas
from openff.units import unit
from scipy.optimize import linear_sum_assignment
from typing_extensions import Literal
from openff.evaluator._pydantic import Field, root_validator, validator
from openff.evaluator.datasets.curation.components import (
from openff.evaluator.datasets.utilities import (
from openff.evaluator.utils.checkmol import (
conint = int
confloat = float
PositiveInt = int
PositiveFloat = float
from openff.evaluator._pydantic import (
logger = logging.getLogger(__name__)
ComponentEnvironments = List[List[ChemicalEnvironment]]
MoleFractionRange = Tuple[confloat(ge=0.0, le=1.0), confloat(ge=0.0, le=1.0)]
[docs]class FilterDuplicatesSchema(CurationComponentSchema):
type: Literal["FilterDuplicates"] = "FilterDuplicates"
temperature_precision: conint(ge=0) = Field(
description="The number of decimal places to compare temperatures (K) to "
pressure_precision: conint(ge=0) = Field(
description="The number of decimal places to compare pressures (kPa) to "
mole_fraction_precision: conint(ge=0) = Field(
description="The number of decimal places to compare mole fractions to within.",
class FilterDuplicates(CurationComponent):
    """Remove duplicate data points (within a specified precision) from a data set.

    Rows are grouped by their number of components. Within each group the
    temperature, pressure and mole fraction columns are rounded to the
    precisions given by the schema, and rows which then share the same state,
    phase and composition columns are treated as duplicates of one another for
    a given property.

    NOTE(review): duplicates are sorted by ascending uncertainty (missing
    values first) and the *last* row of each duplicate group is kept, i.e. the
    row with the largest uncertainty is retained - confirm this is intended.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterDuplicatesSchema, n_processes
    ) -> pandas.DataFrame:
        # Nothing to de-duplicate; hand the (empty) input straight back.
        if len(data_frame) == 0:
            return data_frame

        # Work on a re-ordered copy so the caller's frame is never modified.
        ordered_frame = reorder_data_frame(data_frame.copy())
        component_counts = ordered_frame["N Components"]

        frames_per_count = []

        for n_components in range(
            component_counts.min(), component_counts.max() + 1
        ):
            subset = ordered_frame[component_counts == n_components].copy()

            # Round the state columns so that comparisons are made within the
            # requested number of decimal places.
            subset["Temperature (K)"] = subset["Temperature (K)"].round(
                schema.temperature_precision
            )
            subset["Pressure (kPa)"] = subset["Pressure (kPa)"].round(
                schema.pressure_precision
            )

            key_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

            for position in range(1, n_components + 1):
                fraction_column = f"Mole Fraction {position}"
                subset[fraction_column] = subset[fraction_column].round(
                    schema.mole_fraction_precision
                )

                key_columns += [
                    f"Component {position}",
                    f"Role {position}",
                    fraction_column,
                    f"Exact Amount {position}",
                ]

            # Only compare on the columns this frame actually contains.
            key_columns = [column for column in key_columns if column in subset]

            frames_per_property = []

            for value_column in (c for c in subset if " Value " in c):
                uncertainty_column = value_column.replace("Value", "Uncertainty")

                measured = subset[subset[value_column].notna()]

                if uncertainty_column in subset:
                    measured = measured.sort_values(
                        uncertainty_column, na_position="first"
                    )

                frames_per_property.append(
                    measured.drop_duplicates(subset=key_columns, keep="last")
                )

            frames_per_count.append(
                pandas.concat(frames_per_property, ignore_index=True, sort=False)
            )

        return pandas.concat(frames_per_count, ignore_index=True, sort=False)
[docs]class FilterByTemperatureSchema(CurationComponentSchema):
type: Literal["FilterByTemperature"] = "FilterByTemperature"
minimum_temperature: Optional[PositiveFloat] = Field(
description="Retain data points measured for temperatures above this value (K)",
maximum_temperature: Optional[PositiveFloat] = Field(
description="Retain data points measured for temperatures below this value (K)",
def _min_max(cls, values):
minimum_temperature = values.get("minimum_temperature")
maximum_temperature = values.get("maximum_temperature")
if minimum_temperature is not None and maximum_temperature is not None:
assert maximum_temperature > minimum_temperature
return values
class FilterByTemperature(CurationComponent):
    """Discard data points which were measured outside of a specified
    temperature range.

    Both bounds are exclusive, and a bound which is ``None`` on the schema is
    not applied.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByTemperatureSchema,
        n_processes,
    ) -> pandas.DataFrame:
        lower_bound = schema.minimum_temperature
        upper_bound = schema.maximum_temperature

        retained = data_frame

        if lower_bound is not None:
            retained = retained[retained["Temperature (K)"] > lower_bound]

        if upper_bound is not None:
            retained = retained[retained["Temperature (K)"] < upper_bound]

        return retained
[docs]class FilterByPressureSchema(CurationComponentSchema):
type: Literal["FilterByPressure"] = "FilterByPressure"
minimum_pressure: Optional[PositiveFloat] = Field(
description="Retain data points measured for pressures above this value (kPa)",
maximum_pressure: Optional[PositiveFloat] = Field(
description="Retain data points measured for pressures below this value (kPa)",
def _min_max(cls, values):
minimum_pressure = values.get("minimum_pressure")
maximum_pressure = values.get("maximum_pressure")
if minimum_pressure is not None and maximum_pressure is not None:
assert maximum_pressure > minimum_pressure
return values
class FilterByPressure(CurationComponent):
    """Discard data points which were measured outside of a specified pressure
    range.

    Both bounds are exclusive, and a bound which is ``None`` on the schema is
    not applied.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByPressureSchema, n_processes
    ) -> pandas.DataFrame:
        lower_bound = schema.minimum_pressure
        upper_bound = schema.maximum_pressure

        retained = data_frame

        if lower_bound is not None:
            retained = retained[retained["Pressure (kPa)"] > lower_bound]

        if upper_bound is not None:
            retained = retained[retained["Pressure (kPa)"] < upper_bound]

        return retained
[docs]class FilterByMoleFractionSchema(CurationComponentSchema):
type: Literal["FilterByMoleFraction"] = "FilterByMoleFraction"
mole_fraction_ranges: Dict[conint(gt=1), List[List[MoleFractionRange]]] = Field(
description="The ranges of mole fractions to retain. Each key in the "
"dictionary corresponds to a number of components in the system. Each value "
"is a list of the allowed mole fraction ranges for all but one of the "
"components, i.e for a binary system, the allowed mole fraction for only the "
"first component must be specified.",
def _validate_ranges(cls, value: Dict[int, List[List[MoleFractionRange]]]):
for n_components, ranges in value.items():
assert len(ranges) == n_components - 1
assert all(
mole_fraction_range[0] < mole_fraction_range[1]
for component_ranges in ranges
for mole_fraction_range in component_ranges
return value
class FilterByMoleFraction(CurationComponent):
    """Discard data points which were measured outside of a specified mole
    fraction range.

    Rows whose number of components has no entry in
    ``schema.mole_fraction_ranges`` are always retained. For the remaining
    rows, each listed component must have a mole fraction which lies strictly
    inside at least one of its allowed ranges.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByMoleFractionSchema,
        n_processes,
    ) -> pandas.DataFrame:
        component_counts = data_frame["N Components"]

        # Start by keeping everything that no range was specified for.
        keep = ~component_counts.isin(schema.mole_fraction_ranges)

        for n_components, ranges in schema.mole_fraction_ranges.items():
            matches = component_counts == n_components

            for position, allowed_ranges in enumerate(ranges, start=1):
                fractions = data_frame[f"Mole Fraction {position}"]

                # A component passes if it falls within *any* allowed range.
                within_any = None

                for bounds in allowed_ranges:
                    within = (fractions > bounds[0]) & (fractions < bounds[1])
                    within_any = (
                        within if within_any is None else (within_any | within)
                    )

                # ...but *every* listed component must pass.
                matches = matches & within_any

            keep = keep | matches

        return data_frame[keep]
[docs]class FilterByRacemicSchema(CurationComponentSchema):
type: Literal["FilterByRacemic"] = "FilterByRacemic"
class FilterByRacemic(CurationComponent):
    """A component which will filter out data points which were measured for racemic
    mixtures.

    Two components are considered to form a racemic pair when their SMILES
    patterns are identical once every ``@`` character has been removed. Any
    multi-component row containing such a pair is discarded; rows with fewer
    than two components are always retained.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByRacemicSchema,
        n_processes,
    ) -> pandas.DataFrame:
        # ``max()`` of an empty column is NaN, which cannot be used to build
        # the range below, so short-circuit when there is nothing to filter.
        if len(data_frame) == 0:
            return data_frame

        # Begin building the query. All pure substances should be
        # retained by default.
        query = data_frame["N Components"] < 2

        for n_components in range(2, data_frame["N Components"].max() + 1):
            component_data = data_frame[data_frame["N Components"] == n_components]

            if len(component_data) == 0:
                continue

            # Strip the stereochemistry markers once per component column
            # rather than once per pair of components.
            achiral_smiles = [
                component_data[f"Component {index + 1}"].str.replace(
                    "@", "", regex=False
                )
                for index in range(n_components)
            ]

            is_racemic = None

            for index_0, index_1 in itertools.combinations(range(n_components), 2):
                components_racemic = achiral_smiles[index_0] == achiral_smiles[index_1]

                is_racemic = (
                    components_racemic
                    if is_racemic is None
                    else (is_racemic | components_racemic)
                )

            # ``not_racemic`` only covers the rows with this many components;
            # the in-place ``|`` aligns it against the full-length query.
            not_racemic = ~is_racemic
            query |= not_racemic

        filtered_frame = data_frame[query]
        return filtered_frame
[docs]class FilterByElementsSchema(CurationComponentSchema):
type: Literal["FilterByElements"] = "FilterByElements"
allowed_elements: Optional[List[constr(min_length=1)]] = Field(
description="The only elements which must be present in the measured system "
"for the data point to be retained. This option is mutually exclusive with "
forbidden_elements: Optional[List[constr(min_length=1)]] = Field(
description="The elements which must not be present in the measured system for "
"the data point to be retained. This option is mutually exclusive with "
def _validate_mutually_exclusive(cls, values):
allowed_elements = values.get("allowed_elements")
forbidden_elements = values.get("forbidden_elements")
assert allowed_elements is not None or forbidden_elements is not None
assert allowed_elements is None or forbidden_elements is None
return values
class FilterByElements(CurationComponent):
    """Discard data points which were measured for systems containing specific
    elements.

    When ``schema.allowed_elements`` is set, every atom of every component must
    be one of those elements. When ``schema.forbidden_elements`` is set, no atom
    of any component may be one of those elements.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByElementsSchema, n_processes
    ) -> pandas.DataFrame:
        from openff.toolkit.topology import Molecule

        def filter_function(data_row):
            for position in range(1, data_row["N Components"] + 1):
                molecule = Molecule.from_smiles(
                    data_row[f"Component {position}"], allow_undefined_stereo=True
                )
                symbols = [atom.symbol for atom in molecule.atoms]

                allowed = schema.allowed_elements
                if allowed is not None and any(
                    symbol not in allowed for symbol in symbols
                ):
                    return False

                forbidden = schema.forbidden_elements
                if forbidden is not None and any(
                    symbol in forbidden for symbol in symbols
                ):
                    return False

            return True

        # noinspection PyTypeChecker
        row_mask = data_frame.apply(filter_function, axis=1)
        return data_frame[row_mask]
[docs]class FilterByPropertyTypesSchema(CurationComponentSchema):
type: Literal["FilterByPropertyTypes"] = "FilterByPropertyTypes"
property_types: List[constr(min_length=1)] = Field(
description="The types of property to retain.",
n_components: Dict[constr(min_length=1), List[PositiveInt]] = Field(
description="Optionally specify the number of components that a property "
"should have been measured for (e.g. pure, binary) in order for that data "
"point to be retained.",
strict: bool = Field(
description="If true, only substances (defined without consideration for their "
"mole fractions or exact amount) which have data available for all of the "
"specified property types will be retained. Note that the data points aren't "
"required to have been measured at the same state.",
def _validate_n_components(cls, values):
property_types = values.get("property_types")
n_components = values.get("n_components")
assert all(x in property_types for x in n_components)
return values
[docs]class FilterByPropertyTypes(CurationComponent):
"""A component which will apply a filter which only retains properties of specified
def _apply(
data_frame: pandas.DataFrame,
schema: FilterByPropertyTypesSchema,
) -> pandas.DataFrame:
property_headers = [
header for header in data_frame if header.find(" Value ") >= 0
# Removes the columns for properties which are not of interest.
for header in property_headers:
property_type = header.split(" ")[0]
if property_type in schema.property_types:
data_frame = data_frame.drop(header, axis=1)
uncertainty_header = header.replace(" Value ", " Uncertainty ")
if uncertainty_header in data_frame:
data_frame = data_frame.drop(uncertainty_header, axis=1)
# Drop any rows which do not contain any values for the property types of
# interest.
property_headers = [
for header in property_headers
if header.split(" ")[0] in schema.property_types
data_frame = data_frame.dropna(subset=property_headers, how="all")
# Apply a more specific filter which only retain which contain values
# for the specific property types, and which were measured for the
# specified number of components.
for property_type, n_components in schema.n_components.items():
property_header = next(
iter(x for x in property_headers if x.find(f"{property_type} ") == 0),
if property_header is None:
data_frame = data_frame[
| data_frame["N Components"].isin(n_components)
# Apply the strict filter if requested
if schema.strict:
reordered_data_frame = reorder_data_frame(data_frame)
# Build a dictionary of which properties should be present partitioned
# by the number of components they should have been be measured for.
property_types = defaultdict(list)
if len(schema.n_components) > 0:
for property_type, n_components in schema.n_components.items():
for n_component in n_components:
min_n_components = min(property_types)
max_n_components = max(property_types)
min_n_components = reordered_data_frame["N Components"].min()
max_n_components = reordered_data_frame["N Components"].max()
for n_components in range(min_n_components, max_n_components + 1):
substances_with_data = set()
components_with_data = {}
# For each N component find substances which have data points for
# all of the specified property types.
for n_components in range(min_n_components, max_n_components + 1):
component_data = reordered_data_frame[
reordered_data_frame["N Components"] == n_components
if n_components not in property_types or len(component_data) == 0:
n_component_headers = [
for header in property_headers
if header.split(" ")[0] in property_types[n_components]
and header in component_data
if len(n_component_headers) != len(property_types[n_components]):
n_component_substances = set.intersection(
for header in n_component_headers
components_with_data[n_components] = {
for substance in n_component_substances
for component in substance
if len(schema.n_components) > 0:
components_with_all_data = set.intersection(
# Filter out any smiles for don't appear in all of the N component
# substances.
data_frame = FilterBySmiles.apply(
# Filter out any substances which (within each N component) don't have
# all of the specified data types.
data_frame = FilterBySubstances.apply(
data_frame = data_frame.dropna(axis=1, how="all")
return data_frame
[docs]class FilterByStereochemistrySchema(CurationComponentSchema):
type: Literal["FilterByStereochemistry"] = "FilterByStereochemistry"
class FilterByStereochemistry(CurationComponent):
    """Discard data points measured for systems in which the stereochemistry of
    one or more components is undefined.

    A component counts as undefined when building an OpenFF ``Molecule`` from
    its SMILES pattern raises ``UndefinedStereochemistryError``.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByStereochemistrySchema,
        n_processes,
    ) -> pandas.DataFrame:
        from openff.toolkit.topology import Molecule
        from openff.toolkit.utils import UndefinedStereochemistryError

        def has_defined_stereo(smiles):
            try:
                Molecule.from_smiles(smiles)
            except UndefinedStereochemistryError:
                return False

            return True

        def filter_function(data_row):
            # ``all`` stops at the first failing component, so later
            # components are never parsed.
            return all(
                has_defined_stereo(data_row[f"Component {position}"])
                for position in range(1, data_row["N Components"] + 1)
            )

        # noinspection PyTypeChecker
        row_mask = data_frame.apply(filter_function, axis=1)
        return data_frame[row_mask]
[docs]class FilterByChargedSchema(CurationComponentSchema):
type: Literal["FilterByCharged"] = "FilterByCharged"
class FilterByCharged(CurationComponent):
    """Discard data points measured for substances where any of the constituent
    components carries a net non-zero charge.

    The net charge of a component is the sum of the formal charges of its atoms.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterByChargedSchema, n_processes
    ) -> pandas.DataFrame:
        from openff.toolkit.topology import Molecule

        def formal_charge_of(atom):
            # The formal charge may be a plain int or a unit-wrapped quantity,
            # in which case it is reduced to a number of elementary charges.
            # noinspection PyUnresolvedReferences
            charge = atom.formal_charge

            if isinstance(charge, int):
                return charge

            return atom.formal_charge.m_as(unit.elementary_charge)

        def filter_function(data_row):
            for position in range(1, data_row["N Components"] + 1):
                molecule = Molecule.from_smiles(
                    data_row[f"Component {position}"], allow_undefined_stereo=True
                )
                net_charge = sum([formal_charge_of(atom) for atom in molecule.atoms])

                if not numpy.isclose(net_charge, 0.0):
                    return False

            return True

        # noinspection PyTypeChecker
        row_mask = data_frame.apply(filter_function, axis=1)
        return data_frame[row_mask]
[docs]class FilterByIonicLiquidSchema(CurationComponentSchema):
type: Literal["FilterByIonicLiquid"] = "FilterByIonicLiquid"
class FilterByIonicLiquid(CurationComponent):
    """Discard data points measured for substances which contain or are classed
    as an ionic liquid.

    A component is treated as an ionic liquid when its SMILES pattern contains
    a ``.`` character.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByIonicLiquidSchema,
        n_processes,
    ) -> pandas.DataFrame:
        def filter_function(data_row):
            has_ionic_component = any(
                "." in data_row[f"Component {position}"]
                for position in range(1, data_row["N Components"] + 1)
            )
            return not has_ionic_component

        # noinspection PyTypeChecker
        row_mask = data_frame.apply(filter_function, axis=1)
        return data_frame[row_mask]
[docs]class FilterBySmilesSchema(CurationComponentSchema):
type: Literal["FilterBySmiles"] = "FilterBySmiles"
smiles_to_include: Optional[List[str]] = Field(
description="The smiles patterns to retain. This option is mutually "
"exclusive with `smiles_to_exclude`",
smiles_to_exclude: Optional[List[str]] = Field(
description="The smiles patterns to exclude. This option is mutually "
"exclusive with `smiles_to_include`",
allow_partial_inclusion: bool = Field(
description="If False, all the components in a substance must appear in "
"the `smiles_to_include` list, otherwise, only some must appear. "
"This option only applies when `smiles_to_include` is set.",
def _validate_mutually_exclusive(cls, values):
smiles_to_include = values.get("smiles_to_include")
smiles_to_exclude = values.get("smiles_to_exclude")
assert smiles_to_include is not None or smiles_to_exclude is not None
assert smiles_to_include is None or smiles_to_exclude is None
return values
class FilterBySmiles(CurationComponent):
    """Restrict a data set so that it either only contains a specific set of
    smiles, or does not contain any of a set of specifically excluded smiles.

    In exclusion mode a row is dropped as soon as any of its components is in
    ``smiles_to_exclude``. In inclusion mode either all components
    (``allow_partial_inclusion=False``) or at least one component
    (``allow_partial_inclusion=True``) must appear in ``smiles_to_include``.
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterBySmilesSchema, n_processes
    ) -> pandas.DataFrame:
        included = schema.smiles_to_include
        excluded = schema.smiles_to_exclude

        # Replace whichever option was left unset with an empty list so that
        # the membership checks below can be applied unconditionally.
        if included is not None:
            excluded = []
        elif excluded is not None:
            included = []

        def filter_function(data_row):
            row_smiles = [
                data_row[f"Component {position}"]
                for position in range(1, data_row["N Components"] + 1)
            ]

            if any(smiles in excluded for smiles in row_smiles):
                return False

            if len(excluded) > 0:
                # Exclusion mode, and nothing in this row was excluded.
                return True

            quantifier = any if schema.allow_partial_inclusion else all
            return quantifier(smiles in included for smiles in row_smiles)

        # noinspection PyTypeChecker
        row_mask = data_frame.apply(filter_function, axis=1)
        return data_frame[row_mask]
[docs]class FilterBySmirksSchema(CurationComponentSchema):
type: Literal["FilterBySmirks"] = "FilterBySmirks"
smirks_to_include: Optional[List[str]] = Field(
description="The smirks patterns which must be matched by a substance in "
"order to retain a measurement. This option is mutually exclusive with "
smirks_to_exclude: Optional[List[str]] = Field(
description="The smirks patterns which must not be matched by a substance in "
"order to retain a measurement. This option is mutually exclusive with "
allow_partial_inclusion: bool = Field(
description="If False, all the components in a substance must match at least "
"one pattern in `smirks_to_include` in order to retain a measurement, "
"otherwise, only a least one component must match. This option only applies "
"when `smirks_to_include` is set.",
def _validate_mutually_exclusive(cls, values):
smirks_to_include = values.get("smirks_to_include")
smirks_to_exclude = values.get("smirks_to_exclude")
assert smirks_to_include is not None or smirks_to_exclude is not None
assert smirks_to_include is None or smirks_to_exclude is None
return values
[docs]class FilterBySmirks(CurationComponent):
"""A component which filters a data set so that it only contains measurements made
for molecules which contain (or don't) a set of chemical environments
represented by SMIRKS patterns.
def _find_smirks_matches(smiles_pattern, *smirks_patterns):
"""Determines which (if any) of the specified smirks match the specified
smiles_pattern: str
The SMILES representation to try and match against.
smirks_patterns: str
The smirks patterns to try and match.
list of str
The matched smirks patterns.
from rdkit import Chem
def _rdmol_from_smiles(smiles: str) -> Chem.Mol:
Create an RDKit molecule from a SMILES string.
smiles: str
The SMILES string to convert.
rdmol: rdkit.Chem.rdchem.Mol
The RDKit molecule.
Taken from
# TODO: This function does not handle SMILES parsing failures gracefully.
rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
Chem.SetAromaticity(rdmol, Chem.AromaticityModel.AROMATICITY_MDL)
return rdmol
def _has_match(rdmol: Chem.Mol, smarts: str) -> bool:
Run substructure matching, returning only whether or not matches were found.
rdmol: Chem.Mol
The RDKit molecule to match against.
smarts: str
The SMARTS pattern to match.
Whether or not a match(es) was found.
Taken from
# TODO: This function does not handle SMILES parsing failures gracefully.
qmol = Chem.MolFromSmarts(smarts)
return len(rdmol.GetSubstructMatch(qmol, useChirality=True)) > 0
if len(smirks_patterns) == 0:
return []
rdmol = _rdmol_from_smiles(smiles_pattern)
return [smirks for smirks in smirks_patterns if _has_match(rdmol, smirks)]
def _apply(
cls, data_frame: pandas.DataFrame, schema: FilterBySmirksSchema, n_processes
) -> pandas.DataFrame:
smirks_to_match = (
if schema.smirks_to_include
else schema.smirks_to_exclude
def filter_function(data_row):
n_components = data_row["N Components"]
component_smiles = [
data_row[f"Component {index + 1}"] for index in range(n_components)
smirks_matches = {
smiles: cls._find_smirks_matches(smiles, *smirks_to_match)
for smiles in component_smiles
if schema.smirks_to_exclude is not None:
return not any(len(x) > 0 for x in smirks_matches.values())
if schema.allow_partial_inclusion:
return any(len(x) > 0 for x in smirks_matches.values())
return all(len(x) > 0 for x in smirks_matches.values())
# noinspection PyTypeChecker
return data_frame[data_frame.apply(filter_function, axis=1)]
[docs]class FilterByNComponentsSchema(CurationComponentSchema):
type: Literal["FilterByNComponents"] = "FilterByNComponents"
n_components: List[PositiveInt] = Field(
description="The number of components that measurements should have been "
"measured for in order to be retained.",
class FilterByNComponents(CurationComponent):
    """Retain only the data points measured for systems whose number of
    components appears in ``schema.n_components``.
    """

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByNComponentsSchema,
        n_processes,
    ) -> pandas.DataFrame:
        has_allowed_count = data_frame["N Components"].isin(schema.n_components)
        return data_frame[has_allowed_count]
[docs]class FilterBySubstancesSchema(CurationComponentSchema):
type: Literal["FilterBySubstances"] = "FilterBySubstances"
substances_to_include: Optional[List[Tuple[str, ...]]] = Field(
description="The substances compositions to retain, where each tuple in the "
"list contains the smiles patterns which make up the substance to include. "
"This option is mutually exclusive with `substances_to_exclude`.",
substances_to_exclude: Optional[List[Tuple[str, ...]]] = Field(
description="The substances compositions to retain, where each tuple in the "
"list contains the smiles patterns which make up the substance to exclude. "
"This option is mutually exclusive with `substances_to_include`.",
def _validate_mutually_exclusive(cls, values):
substances_to_include = values.get("substances_to_include")
substances_to_exclude = values.get("substances_to_exclude")
assert substances_to_include is not None or substances_to_exclude is not None
assert substances_to_include is None or substances_to_exclude is None
return values
class FilterBySubstances(CurationComponent):
    """A component which filters the data set so that it only contains properties
    measured for particular substances.

    This method is similar to `filter_by_smiles`, however here we explicitly define
    the full substances compositions, rather than individual smiles which should
    either be included or excluded.

    Substances are compared irrespective of the order in which their components
    are listed.

    Examples
    --------
    To filter the data set to only include measurements for pure methanol, pure
    benzene or an aqueous ethanol mix:

    >>> schema = FilterBySubstancesSchema(
    >>>     substances_to_include=[
    >>>         ('CO',),
    >>>         ('C1=CC=CC=C1',),
    >>>         ('CCO', 'O')
    >>>     ]
    >>> )

    To filter out measurements made for an aqueous mix of benzene:

    >>> schema = FilterBySubstancesSchema(
    >>>     substances_to_exclude=[('O', 'C1=CC=CC=C1')]
    >>> )
    """

    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterBySubstancesSchema, n_processes
    ) -> pandas.DataFrame:
        def normalise(substances):
            # Sort each substance so comparisons ignore component order, and
            # store the results in a set for constant-time membership tests.
            if substances is None:
                return None

            return {tuple(sorted(substance)) for substance in substances}

        # Normalise once up front rather than once per row of the frame.
        substances_to_include = normalise(schema.substances_to_include)
        substances_to_exclude = normalise(schema.substances_to_exclude)

        def filter_function(data_row):
            n_components = data_row["N Components"]

            substance = tuple(
                sorted(
                    data_row[f"Component {index + 1}"]
                    for index in range(n_components)
                )
            )

            return (
                substances_to_exclude is not None
                and substance not in substances_to_exclude
            ) or (
                substances_to_include is not None and substance in substances_to_include
            )

        # noinspection PyTypeChecker
        return data_frame[data_frame.apply(filter_function, axis=1)]
[docs]class FilterByEnvironmentsSchema(CurationComponentSchema):
type: Literal["FilterByEnvironments"] = "FilterByEnvironments"
per_component_environments: Optional[Dict[int, ComponentEnvironments]] = Field(
description="The environments which should be present in the components of "
"the substance for which the measurements were made. Each dictionary "
"key corresponds to a number of components in the system, and each "
"value the environments which should be matched by those n components. "
"This option is mutually exclusive with `environments`.",
environments: Optional[List[ChemicalEnvironment]] = Field(
description="The environments which should be present in the substances for "
"which measurements were made. This option is mutually exclusive with "
at_least_one_environment: bool = Field(
description="If true, data points will only be retained if all of the "
"components in the measured system contain at least one of the specified "
"environments. This option is mutually exclusive with "
strictly_specified_environments: bool = Field(
description="If true, data points will only be retained if all of the "
"components in the measured system strictly contain only the specified "
"environments and no others. This option is mutually exclusive with "
def _validate_per_component_environments(cls, value):
if value is None:
return value
assert all(len(y) == x for x, y in value.items())
return value
def _validate_mutually_exclusive(cls, values):
at_least_one_environment = values.get("at_least_one_environment")
strictly_specified_environments = values.get("strictly_specified_environments")
assert (
at_least_one_environment is True or strictly_specified_environments is True
assert (
at_least_one_environment is False
or strictly_specified_environments is False
per_component_environments = values.get("per_component_environments")
environments = values.get("environments")
assert per_component_environments is not None or environments is not None
assert per_component_environments is None or environments is None
return values
[docs]class FilterByEnvironments(CurationComponent):
"""A component which filters a data set so that it only contains measurements made
for substances which contain specific chemical environments.
def _find_environments_per_component(cls, data_row: pandas.Series):
n_components = data_row["N Components"]
component_smiles = [
data_row[f"Component {index + 1}"] for index in range(n_components)
component_moieties = [analyse_functional_groups(x) for x in component_smiles]
if any(x is None for x in component_moieties):
f"Checkmol was unable to parse the system with components="
f"{component_smiles} and so this data point was discarded."
return None
return component_moieties
def _is_match(cls, component_environments, environments_to_match, schema):
operator = all if schema.strictly_specified_environments else any
return operator(
environment in environments_to_match
for environment in component_environments
def _filter_by_environments(cls, data_row, schema: FilterByEnvironmentsSchema):
environments_per_component = cls._find_environments_per_component(data_row)
if environments_per_component is None:
return False
return all(
cls._is_match(component_environments, schema.environments, schema)
for component_environments in environments_per_component
def _filter_by_per_component(cls, data_row, schema: FilterByEnvironmentsSchema):
n_components = data_row["N Components"]
if (
schema.per_component_environments is not None
and n_components not in schema.per_component_environments
# No filter was specified for this number of components.
return True
environments_per_component = cls._find_environments_per_component(data_row)
if environments_per_component is None:
return False
match_matrix = numpy.zeros((n_components, n_components))
for component_index, component_environments in enumerate(
# noinspection PyUnresolvedReferences
for environments_index, environments_to_match in enumerate(
match_matrix[component_index, environments_index] = cls._is_match(
component_environments, environments_to_match, schema
x_indices, y_indices = linear_sum_assignment(match_matrix, maximize=True)
return numpy.all(match_matrix[x_indices, y_indices] > 0)
def _apply(
data_frame: pandas.DataFrame,
schema: FilterByEnvironmentsSchema,
) -> pandas.DataFrame:
if schema.environments is not None:
filter_function = functools.partial(
cls._filter_by_environments, schema=schema
filter_function = functools.partial(
cls._filter_by_per_component, schema=schema
# noinspection PyTypeChecker
return data_frame[data_frame.apply(filter_function, axis=1)]
FilterComponentSchema = Union[