Source code for a2rl._io

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import annotations

from dataclasses import asdict, dataclass, field
from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import yaml
from typeguard import check_type

import a2rl as wi

from ._metadatadict import MetadataDict


def sample_dataset_path(dataset_name: str) -> Path:
    """Resolve the path to the sample dataset.

    Args:
        dataset_name: Name of sample dataset.

    Returns:
        Path to the directory of ``dataset_name``.

    See Also
    --------
    list_sample_datasets
    read_csv_dataset

    Examples:
        .. code-block:: python

            >>> import pandas as pd
            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path('chiller')
            >>> p
            PosixPath('.../a2rl/dataset/chiller')

            >>> df = wi.read_csv_dataset(p).trim()
            >>> with pd.option_context('display.max_columns', 2):
            ...     print(df.head())
               condenser_inlet_temp  ...  system_power_consumption
            0                  29.5  ...                     756.4
            1                  30.2  ...                     959.3
            2                  29.3  ...                     586.1
            3                  28.5  ...                    1178.5
            4                  30.3  ...                     880.9
            <BLANKLINE>
            [5 rows x 4 columns]

            >>> df.shape
            (9153, 4)
    """
    return Path(__file__).parent / "dataset" / dataset_name

def list_sample_datasets() -> list[str]:
    """List the names of the sample datasets included in ``whatif``.

    Returns:
        Dataset names.

    See Also
    --------
    sample_dataset_path

    Examples:
        .. code-block:: python

            >>> import a2rl as wi
            >>> wi.list_sample_datasets()
            ['chiller', 'rtu']
    """
    prefix = Path(__file__).parent / "dataset"
    file_list = [metadata.parent.name for metadata in prefix.glob("*/metadata.yaml")]
    file_list.sort()
    return file_list

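# Illustrative usage sketch (not part of the original module): combining
# list_sample_datasets() with sample_dataset_path() and read_csv_dataset()
# loads every bundled sample dataset in one pass. The helper name
# _load_all_samples is hypothetical.
def _load_all_samples() -> dict[str, wi.WiDataFrame]:
    """Load each bundled sample dataset into a WiDataFrame, keyed by name."""
    return {
        name: read_csv_dataset(sample_dataset_path(name))
        for name in list_sample_datasets()
    }
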
@dataclass
class Metadata:
    """Metadata of a ``Whatif`` dataframe or dataset.

    Arguments:
        states: Column names for states.
        actions: Column names for actions.
        rewards: Column names for rewards.
        forced_categories: Numeric columns that must be interpreted as categorical or
            ordinal. Otherwise, column dtypes are automatically determined.
        frequency: Sampling frequency of the dataset, in the Pandas frequency string
            format. See the ``freq`` argument in
            :func:`pandas.tseries.frequencies.to_offset`, and the
            :pdug:`pandas DateOffset tutorial <timeseries.html#dateoffset-objects>`.
            Examples: ``H``, ``2H``, ``D``.
        tags: Additional custom metadata. Defaults to an empty dictionary ``{}``.

    See Also
    --------
    read_metadata
    save_metadata

    Examples:
        Create an in-memory metadata object.

        .. code-block:: python

            >>> import a2rl as wi
            >>> m = wi.Metadata(
            ...     states=["s", "t"],
            ...     actions=["a"],
            ...     rewards=["r"],
            ...     frequency="H",
            ... )
            >>> m  # doctest: +NORMALIZE_WHITESPACE
            Metadata(states=['s', 't'], actions=['a'], rewards=['r'],
            forced_categories=None, frequency='H', tags={})

        Create from a dictionary with default sampling frequency and tags.

        .. code-block:: python

            >>> d = {
            ...     "states": ["s", "t"],
            ...     "actions": ["a"],
            ...     "rewards": ["r"],
            ... }
            >>> wi.Metadata(**d)  # doctest: +NORMALIZE_WHITESPACE
            Metadata(states=['s', 't'], actions=['a'], rewards=['r'],
            forced_categories=None, frequency=None, tags={})

        Convert the metadata object to a YAML string. Please note this is shown for
        pedagogical purposes only. In practice, we recommend :func:`read_metadata` and
        :func:`save_metadata` to convert between :class:`Metadata` and YAML files.

        .. code-block:: python

            >>> from dataclasses import asdict
            >>> import yaml
            >>> s = yaml.safe_dump(asdict(m), sort_keys=False)
            >>> print(s)
            states:
            - s
            - t
            actions:
            - a
            rewards:
            - r
            forced_categories: null
            frequency: H
            tags: {}
            <BLANKLINE>
    """

    states: list[str]  #: ``list[str]`` - Column names for states.
    actions: list[str]  #: ``list[str]`` - Column names for actions.
    rewards: list[str]  #: ``list[str]`` - Column names for rewards.

    #: ``None | list[str]`` - Numerical column names that must be interpreted as
    #: categorical or ordinal.
    forced_categories: None | list[str] = None

    #: ``None | str`` - Sampling frequency of the dataset, in the Pandas frequency
    #: string format. See the ``freq`` argument in
    #: :func:`pandas.tseries.frequencies.to_offset`, and the
    #: :pdug:`pandas DateOffset tutorial <timeseries.html#dateoffset-objects>`.
    #: Examples: ``H``, ``2H``, ``D``.
    frequency: None | str = None

    #: ``dict[str, Any]`` - Additional custom metadata.
    tags: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        check_type(self.states, List[str])
        check_type(self.actions, List[str])
        check_type(self.rewards, List[str])
        check_type(self.forced_categories, Optional[List[str]])
        check_type(self.frequency, Optional[str])
        check_type(self.tags, Dict[str, Any])

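# Illustrative sketch (not part of the original module): Metadata validates its
# fields eagerly in __post_init__ via typeguard.check_type, so a wrongly typed
# field fails at construction time rather than at save or load time. The exact
# exception class depends on the installed typeguard version, hence the broad
# except below. The helper name _demo_metadata_validation is hypothetical.
def _demo_metadata_validation() -> None:
    Metadata(states=["s"], actions=["a"], rewards=["r"])  # well-typed: OK
    try:
        Metadata(states="s", actions=["a"], rewards=["r"])  # states must be list[str]
    except Exception as e:  # typeguard raises a version-dependent type-check error
        print(f"rejected: {e}")
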
def read_metadata(yaml_file: str | Path) -> Metadata:
    """Load a YAML file into an in-memory metadata object.

    Arguments:
        yaml_file: Path to the input YAML file.

    Returns:
        In-memory metadata object.

    See Also
    --------
    Metadata
    save_metadata

    Examples:
        Read the metadata of the ``chiller`` sample dataset.

        .. code-block:: python

            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path("chiller") / "metadata.yaml"
            >>> m = wi.read_metadata(p)
            >>> m.states
            ['condenser_inlet_temp', 'evaporator_heat_load_rt']
    """
    p = yaml_file if isinstance(yaml_file, Path) else Path(yaml_file)
    with p.open("r") as f:
        d = yaml.safe_load(f)
    return Metadata(**d)

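# Illustrative sketch (not part of the original module): read_metadata() passes
# the parsed YAML mapping straight to Metadata(**d), so an unknown key in
# metadata.yaml raises TypeError at load time. The helper name and the file
# path below are hypothetical.
def _demo_read_metadata_strict(path: str = "/tmp/bad_metadata.yaml") -> None:
    Path(path).write_text("states: [s]\nactions: [a]\nrewards: [r]\nbogus_key: 1\n")
    try:
        read_metadata(path)
    except TypeError as e:  # Metadata() rejects the unexpected keyword argument
        print(f"rejected: {e}")
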
def save_metadata(
    metadata: Metadata | MetadataDict,
    yaml_file: str | Path,
    compact: bool = False,
) -> None:
    """Save an in-memory metadata object into a YAML file.

    Arguments:
        metadata: Metadata object.
        yaml_file: Path to the output YAML file.
        compact: When set to ``True``, do not output ``None`` entries.

    See Also
    --------
    Metadata
    read_metadata

    Examples:
        Save an in-memory metadata object.

        .. code-block:: python

            >>> import a2rl as wi
            >>> m = wi.Metadata(
            ...     states=["s", "t"],
            ...     actions=["a"],
            ...     rewards=["r"],
            ...     tags={"k": "v"},
            ... )
            >>> wi.save_metadata(m, "/tmp/metadata.yaml")
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            forced_categories: null
            <BLANKLINE>
            frequency: null
            <BLANKLINE>
            tags:
              k: v
            <BLANKLINE>

        Save metadata in compact mode to exclude null items from the YAML output.

        .. code-block:: python

            >>> wi.save_metadata(m, "/tmp/metadata.yaml", compact=True)
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            tags:
              k: v
            <BLANKLINE>

        Save a dictionary. Be aware that the dictionary **must** specify **all** the
        :class:`Metadata` fields, including the default ones. If you prefer the
        flexibility of not re-declaring the default fields, please use
        :class:`Metadata` instead.

        .. code-block:: python

            >>> d: wi.MetadataDict = {
            ...     "states": ["s", "t"],
            ...     "actions": ["a"],
            ...     "rewards": ["r"],
            ...     "forced_categories": ["a"],
            ...     "frequency": None,
            ...     "tags": {},
            ... }
            >>> wi.save_metadata(d, "/tmp/metadata.yaml", compact=True)
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            forced_categories:
            - a
            <BLANKLINE>
            tags: {}
            <BLANKLINE>
    """

    # Based on https://github.com/yaml/pyyaml/issues/127#issuecomment-525800484
    class BlankLiner(yaml.SafeDumper):
        def write_line_break(self, data=None):
            super().write_line_break(data)
            if len(self.indents) == 1:
                super().write_line_break()

    if isinstance(metadata, Metadata):
        m = metadata
    elif isinstance(metadata, dict):
        check_type(metadata, MetadataDict)
        m = Metadata(**metadata)
    else:
        # Fail fast instead of leaving m unbound (which would surface as NameError).
        raise TypeError(f"Expected Metadata or MetadataDict, but got {type(metadata)}")

    p = yaml_file if isinstance(yaml_file, Path) else Path(yaml_file)
    d = asdict(m)
    if compact:
        d = {k: v for k, v in d.items() if v is not None}
    with p.open("w") as f:
        yaml.dump(d, f, sort_keys=False, Dumper=BlankLiner)

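# Illustrative round-trip sketch (not part of the original module):
# save_metadata() followed by read_metadata() reconstructs an equal Metadata
# object, since both sides map the dataclass fields to a plain YAML mapping and
# dataclass equality compares field by field. The helper name
# _demo_metadata_roundtrip is hypothetical.
def _demo_metadata_roundtrip(tmpdir: str = "/tmp") -> None:
    src = Metadata(states=["s"], actions=["a"], rewards=["r"], frequency="H")
    path = Path(tmpdir) / "metadata.yaml"
    save_metadata(src, path)
    assert read_metadata(path) == src
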
def read_csv_dataset(
    dirpath: str | PathLike[str],
    *args,
    test_mdp: bool = False,
    low_memory: bool = False,
    **kwargs,
) -> wi.WiDataFrame:
    """Read a dataset directory into a :class:`a2rl.WiDataFrame`.

    Args:
        dirpath: Path to the dataset directory.
        *args: Positional arguments passed as-is to :func:`pandas.read_csv`.
        test_mdp: When ``True``, perform a Markovian self-check on the loaded
            dataframe, and raise :exc:`a2rl.utils.NotMDPDataError` if the check fails.
        low_memory: If ``False``, read the entire .csv payload. If ``True``,
            internally process the .csv payload in chunks. This argument is passed
            as-is to :func:`pandas.read_csv`, however note that we override the
            default to ``False``, which is the opposite of the default in
            :func:`pandas.read_csv`.
        **kwargs: Keyword arguments passed as-is to :func:`pandas.read_csv`.

    Returns:
        The loaded dataset.

    See Also
    --------
    sample_dataset_path
    WiDataFrame.to_csv_dataset

    Examples:
        .. code-block:: python

            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path('chiller')
            >>> df = wi.read_csv_dataset(p)
            >>> df.info()  # doctest: +NORMALIZE_WHITESPACE
            <class 'a2rl._dataframe.WiDataFrame'>
            RangeIndex: 9153 entries, 0 to 9152
            Data columns (total 5 columns):
             #   Column                    Non-Null Count  Dtype
            ---  ------                    --------------  -----
             0   timestamp                 9153 non-null   object
             1   staging                   9153 non-null   object
             2   condenser_inlet_temp      9153 non-null   float64
             3   evaporator_heat_load_rt   9153 non-null   float64
             4   system_power_consumption  9153 non-null   float64
            dtypes: float64(3), object(2)
            memory usage: ... KB
    """
    p = dirpath if isinstance(dirpath, Path) else Path(dirpath)
    metadata = read_metadata(p / "metadata.yaml")
    files = p.glob("**/*.csv")

    kwargs["low_memory"] = low_memory
    if metadata.forced_categories:
        kwargs["dtype"] = {col: str for col in metadata.forced_categories}
    dfs = {str(fpath): pd.read_csv(fpath, *args, **kwargs) for fpath in files}

    df = wi.WiDataFrame(
        pd.concat(dfs.values()),
        states=metadata.states,
        actions=metadata.actions,
        rewards=metadata.rewards,
    )

    if test_mdp:
        tokeniser = wi.DiscreteTokenizer(n_bins=50)
        df_tok = tokeniser.fit_transform(df.trim())
        wi.utils.assert_mdp(df_tok)

    return df

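# Illustrative sketch (not part of the original module): a dataset directory is
# just a metadata.yaml plus one or more .csv files, so one can be assembled by
# hand with save_metadata() and pandas.DataFrame.to_csv(), then loaded back with
# read_csv_dataset(). The helper name _demo_build_dataset and the directory path
# below are hypothetical.
def _demo_build_dataset(dirpath: str = "/tmp/my_dataset") -> wi.WiDataFrame:
    p = Path(dirpath)
    p.mkdir(parents=True, exist_ok=True)
    save_metadata(Metadata(states=["s"], actions=["a"], rewards=["r"]), p / "metadata.yaml")
    pd.DataFrame({"s": [1.0, 2.0], "a": ["x", "y"], "r": [0.5, 0.7]}).to_csv(
        p / "data.csv", index=False
    )
    return read_csv_dataset(p)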