Source code for openff.evaluator.protocols.analysis
"""
A collection of protocols for running analysing the results of molecular simulations.
"""
import abc
import typing
from os import path
import numpy as np
import pint
from openff.evaluator.attributes import UNDEFINED
from openff.evaluator.utils import statistics, timeseries
from openff.evaluator.utils.statistics import StatisticsArray, bootstrap
from openff.evaluator.workflow import Protocol, workflow_protocol
from openff.evaluator.workflow.attributes import (
InequalityMergeBehaviour,
InputAttribute,
OutputAttribute,
)
[docs]class AveragePropertyProtocol(Protocol, abc.ABC):
"""An abstract base class for protocols which will calculate the
average of a property and its uncertainty via bootstrapping.
"""
bootstrap_iterations = InputAttribute(
docstring="The number of bootstrap iterations to perform.",
type_hint=int,
default_value=250,
merge_behavior=InequalityMergeBehaviour.LargestValue,
)
bootstrap_sample_size = InputAttribute(
docstring="The relative sample size to use for bootstrapping.",
type_hint=float,
default_value=1.0,
merge_behavior=InequalityMergeBehaviour.LargestValue,
)
equilibration_index = OutputAttribute(
docstring="The index in the data set after which the data is stationary.",
type_hint=int,
)
statistical_inefficiency = OutputAttribute(
docstring="The statistical inefficiency in the data set.", type_hint=float
)
value = OutputAttribute(
docstring="The average value and its uncertainty.", type_hint=pint.Measurement
)
uncorrelated_values = OutputAttribute(
docstring="The uncorrelated values which the average was calculated from.",
type_hint=pint.Quantity,
)
def _bootstrap_function(self, **sample_kwargs):
"""The function to perform on the data set being sampled by
bootstrapping.
Parameters
----------
sample_kwargs: dict of str and np.ndarray
A key words dictionary of the bootstrap sample data, where the
sample data is a numpy array of shape=(num_frames, num_dimensions)
with dtype=float.
Returns
-------
float
The result of evaluating the data.
"""
assert len(sample_kwargs) == 1
sample_data = next(iter(sample_kwargs.values()))
return sample_data.mean()
[docs]class AverageTrajectoryProperty(AveragePropertyProtocol, abc.ABC):
"""An abstract base class for protocols which will calculate the
average of a property from a simulation trajectory.
"""
input_coordinate_file = InputAttribute(
docstring="The file path to the starting coordinates of a trajectory.",
type_hint=str,
default_value=UNDEFINED,
)
trajectory_path = InputAttribute(
docstring="The file path to the trajectory to average over.",
type_hint=str,
default_value=UNDEFINED,
)
[docs]@workflow_protocol()
class ExtractAverageStatistic(AveragePropertyProtocol):
"""Extracts the average value from a statistics file which was generated
during a simulation.
"""
statistics_path = InputAttribute(
docstring="The file path to the statistics to average over.",
type_hint=str,
default_value=UNDEFINED,
)
statistics_type = InputAttribute(
docstring="The type of statistic to average over.",
type_hint=statistics.ObservableType,
default_value=UNDEFINED,
)
divisor = InputAttribute(
docstring="A value to divide the statistic by. This is useful if a statistic (such "
"as enthalpy) needs to be normalised by the number of molecules.",
type_hint=typing.Union[int, float, pint.Quantity],
default_value=1.0,
)
def _execute(self, directory, available_resources):
self._statistics = statistics.StatisticsArray.from_pandas_csv(
self.statistics_path
)
if self.statistics_type not in self._statistics:
raise ValueError(
f"The {self.statistics_path} statistics file contains no "
f"data of type {self.statistics_type}."
)
values = self._statistics[self.statistics_type]
statistics_unit = values[0].units
unitless_values = values.to(statistics_unit).magnitude
divisor = self.divisor
if isinstance(self.divisor, pint.Quantity):
statistics_unit /= self.divisor.units
divisor = self.divisor.magnitude
unitless_values = np.array(unitless_values) / divisor
(
unitless_values,
self.equilibration_index,
self.statistical_inefficiency,
) = timeseries.decorrelate_time_series(unitless_values)
final_value, final_uncertainty = bootstrap(
self._bootstrap_function,
self.bootstrap_iterations,
self.bootstrap_sample_size,
values=unitless_values,
)
self.uncorrelated_values = unitless_values * statistics_unit
self.value = (final_value * statistics_unit).plus_minus(
final_uncertainty * statistics_unit
)