Source code for a2rl._io
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import annotations

from dataclasses import asdict, dataclass, field
from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import yaml
from typeguard import check_type

import a2rl as wi

from ._metadatadict import MetadataDict

def sample_dataset_path(dataset_name: str) -> Path:
    """Resolve the path to the sample dataset.

    Args:
        dataset_name: Name of sample dataset.

    Returns:
        Path to the directory of ``dataset_name``.

    See Also
    --------
    list_sample_datasets
    read_csv_dataset

    Examples:
        .. code-block:: python

            >>> import pandas as pd
            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path('chiller')
            >>> p
            PosixPath('.../a2rl/dataset/chiller')

            >>> df = wi.read_csv_dataset(p).trim()
            >>> with pd.option_context('display.max_columns', 2):
            ...     print(df.head())
               condenser_inlet_temp  ...  system_power_consumption
            0                  29.5  ...                      756.4
            1                  30.2  ...                      959.3
            2                  29.3  ...                      586.1
            3                  28.5  ...                     1178.5
            4                  30.3  ...                      880.9
            <BLANKLINE>
            [5 rows x 4 columns]

            >>> df.shape
            (9153, 4)
    """
    return Path(__file__).parent / "dataset" / dataset_name

def list_sample_datasets() -> list[str]:
    """List the names of the sample datasets included in ``whatif``.

    Returns:
        Dataset names.

    See Also
    --------
    sample_dataset_path

    Examples:
        .. code-block:: python

            >>> import a2rl as wi
            >>> wi.list_sample_datasets()
            ['chiller', 'rtu']
    """
    prefix = Path(__file__).parent / "dataset"
    file_list = [metadata.parent.name for metadata in prefix.glob("*/metadata.yaml")]
    file_list.sort()
    return file_list
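
# Illustrative sketch (hypothetical helper, not part of the a2rl API): combine
# list_sample_datasets() with sample_dataset_path() and read_csv_dataset() to
# eagerly load every bundled dataset, keyed by name.
def _load_all_sample_datasets() -> dict[str, wi.WiDataFrame]:
    """Load each bundled sample dataset into a WiDataFrame."""
    return {
        name: read_csv_dataset(sample_dataset_path(name))
        for name in list_sample_datasets()
    }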

@dataclass
class Metadata:
    """Metadata of a ``Whatif`` dataframe or dataset.

    Arguments:
        states: Column names for states.
        actions: Column names for actions.
        rewards: Column names for rewards.
        forced_categories: Numeric columns that must be interpreted as categorical or ordinal.
            Otherwise, column dtypes are determined automatically.
        frequency: Sampling frequency of the dataset, in the Pandas frequency string format. See
            the ``freq`` argument in :func:`pandas.tseries.frequencies.to_offset`, and the
            :pdug:`pandas DateOffset tutorial <timeseries.html#dateoffset-objects>`. Examples:
            ``H``, ``2H``, ``D``.
        tags: Additional custom metadata. Defaults to an empty dictionary ``{}``.

    See Also
    --------
    read_metadata
    save_metadata

    Examples:
        Create an in-memory metadata object.

        .. code-block:: python

            >>> import a2rl as wi
            >>> m = wi.Metadata(
            ...     states=["s", "t"],
            ...     actions=["a"],
            ...     rewards=["r"],
            ...     frequency="H",
            ... )
            >>> m  # doctest: +NORMALIZE_WHITESPACE
            Metadata(states=['s', 't'], actions=['a'], rewards=['r'], forced_categories=None,
            frequency='H', tags={})

        Create from a dictionary with default sampling frequency and tags.

        .. code-block:: python

            >>> d = {
            ...     "states": ["s", "t"],
            ...     "actions": ["a"],
            ...     "rewards": ["r"],
            ... }
            >>> wi.Metadata(**d)  # doctest: +NORMALIZE_WHITESPACE
            Metadata(states=['s', 't'], actions=['a'], rewards=['r'], forced_categories=None,
            frequency=None, tags={})

        Convert the metadata object to a YAML string. Please note this is shown for pedagogical
        purposes only. In practice, we recommend :func:`read_metadata` and :func:`save_metadata`
        for converting between :class:`Metadata` and YAML files.

        .. code-block:: python

            >>> from dataclasses import asdict
            >>> import yaml
            >>> s = yaml.safe_dump(asdict(m), sort_keys=False)
            >>> print(s)
            states:
            - s
            - t
            actions:
            - a
            rewards:
            - r
            forced_categories: null
            frequency: H
            tags: {}
            <BLANKLINE>
    """

    states: list[str]  #: ``list[str]`` - Column names for states.
    actions: list[str]  #: ``list[str]`` - Column names for actions.
    rewards: list[str]  #: ``list[str]`` - Column names for rewards.

    #: ``None | list[str]`` - Numeric column names that must be interpreted as categorical or
    #: ordinal.
    forced_categories: None | list[str] = None

    #: ``None | str`` - Sampling frequency of the dataset, in the Pandas frequency string format.
    #: See the ``freq`` argument in :func:`pandas.tseries.frequencies.to_offset`, and the
    #: :pdug:`pandas DateOffset tutorial <timeseries.html#dateoffset-objects>`. Examples: ``H``,
    #: ``2H``, ``D``.
    frequency: None | str = None

    #: ``dict[str, Any]`` - Additional custom metadata.
    tags: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        check_type(self.states, List[str])
        check_type(self.actions, List[str])
        check_type(self.rewards, List[str])
        check_type(self.forced_categories, Optional[List[str]])
        check_type(self.frequency, Optional[str])
        check_type(self.tags, Dict[str, Any])
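
# Illustrative sketch (hypothetical helper, not part of the a2rl API):
# __post_init__ validates every field with typeguard, so a wrongly-typed
# Metadata fails fast at construction time. The exact exception class depends
# on the installed typeguard version (TypeError in older releases,
# typeguard.TypeCheckError in v4+), hence the broad except below.
def _example_metadata_validation() -> None:
    """Show that Metadata rejects a wrongly-typed ``states`` field."""
    try:
        Metadata(states="s", actions=["a"], rewards=["r"])  # type: ignore[arg-type]
    except Exception as e:
        print(f"Rejected as expected: {e!r}")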

def read_metadata(yaml_file: str | Path) -> Metadata:
    """Load a YAML file into an in-memory metadata object.

    Arguments:
        yaml_file: Path to the input YAML file.

    Returns:
        The in-memory metadata object.

    See Also
    --------
    Metadata
    save_metadata

    Examples:
        Read the metadata of the ``chiller`` sample dataset.

        .. code-block:: python

            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path("chiller") / "metadata.yaml"
            >>> m = wi.read_metadata(p)
            >>> m.states
            ['condenser_inlet_temp', 'evaporator_heat_load_rt']
    """
    p = yaml_file if isinstance(yaml_file, Path) else Path(yaml_file)
    with p.open("r") as f:
        d = yaml.safe_load(f)
    return Metadata(**d)
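
# Illustrative sketch (hypothetical helper, not part of the a2rl API):
# save_metadata() and read_metadata() round-trip a fully-specified Metadata,
# and dataclass equality makes that easy to verify.
def _example_metadata_roundtrip() -> None:
    """Round-trip a Metadata object through a YAML file and check equality."""
    import tempfile

    m = Metadata(states=["s", "t"], actions=["a"], rewards=["r"], frequency="H")
    with tempfile.TemporaryDirectory() as tmpdir:
        yaml_file = Path(tmpdir) / "metadata.yaml"
        save_metadata(m, yaml_file)
        assert read_metadata(yaml_file) == m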

def save_metadata(
    metadata: Metadata | MetadataDict,
    yaml_file: str | Path,
    compact: bool = False,
) -> None:
    """Save an in-memory metadata object into a YAML file.

    Arguments:
        metadata: Metadata object.
        yaml_file: Path to the output YAML file.
        compact: When ``True``, omit entries whose value is ``None``.

    See Also
    --------
    Metadata
    read_metadata

    Examples:
        Save an in-memory metadata object.

        .. code-block:: python

            >>> import a2rl as wi
            >>> m = wi.Metadata(
            ...     states=["s", "t"],
            ...     actions=["a"],
            ...     rewards=["r"],
            ...     tags={"k": "v"}
            ... )
            >>> wi.save_metadata(m, "/tmp/metadata.yaml")
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            forced_categories: null
            <BLANKLINE>
            frequency: null
            <BLANKLINE>
            tags:
              k: v
            <BLANKLINE>

        Save metadata in compact mode to exclude null items from the YAML output.

        .. code-block:: python

            >>> wi.save_metadata(m, "/tmp/metadata.yaml", compact=True)
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            tags:
              k: v
            <BLANKLINE>

        Save a dictionary. Be aware that the dictionary **must** specify **all** the
        :class:`Metadata` fields, including the ones with defaults. If you prefer not to
        re-declare the default fields, please use :class:`Metadata` instead.

        .. code-block:: python

            >>> d: wi.MetadataDict = {
            ...     "states": ["s", "t"],
            ...     "actions": ["a"],
            ...     "rewards": ["r"],
            ...     "forced_categories": ["a"],
            ...     "frequency": None,
            ...     "tags": {},
            ... }
            >>> wi.save_metadata(d, "/tmp/metadata.yaml", compact=True)
            >>> with open("/tmp/metadata.yaml") as f:
            ...     print(''.join(f.readlines()))
            states:
            - s
            - t
            <BLANKLINE>
            actions:
            - a
            <BLANKLINE>
            rewards:
            - r
            <BLANKLINE>
            forced_categories:
            - a
            <BLANKLINE>
            tags: {}
            <BLANKLINE>
    """
    # Based on https://github.com/yaml/pyyaml/issues/127#issuecomment-525800484
    class BlankLiner(yaml.SafeDumper):
        def write_line_break(self, data=None):
            super().write_line_break(data)
            if len(self.indents) == 1:
                super().write_line_break()

    if isinstance(metadata, Metadata):
        m = metadata
    elif isinstance(metadata, dict):
        check_type(metadata, MetadataDict)
        m = Metadata(**metadata)
    else:
        raise TypeError(f"metadata must be a Metadata or a MetadataDict, but got {type(metadata)}")

    p = yaml_file if isinstance(yaml_file, Path) else Path(yaml_file)
    d = asdict(m)
    if compact:
        d = {k: v for k, v in d.items() if v is not None}
    with p.open("w") as f:
        yaml.dump(d, f, sort_keys=False, Dumper=BlankLiner)
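
# Illustrative sketch (hypothetical helper, not part of the a2rl API): a
# dataset directory is just one or more .csv files next to a metadata.yaml,
# so pandas.DataFrame.to_csv() plus save_metadata() is enough to assemble a
# directory that read_csv_dataset() can load back.
def _example_build_dataset_dir(
    df: pd.DataFrame, metadata: Metadata, dirpath: Path
) -> wi.WiDataFrame:
    """Write ``df`` and ``metadata`` into ``dirpath``, then load them back."""
    dirpath.mkdir(parents=True, exist_ok=True)
    df.to_csv(dirpath / "data.csv", index=False)
    save_metadata(metadata, dirpath / "metadata.yaml")
    return read_csv_dataset(dirpath)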

def read_csv_dataset(
    dirpath: str | PathLike[str],
    *args,
    test_mdp: bool = False,
    low_memory: bool = False,
    **kwargs,
) -> wi.WiDataFrame:
    """Read a dataset directory into a :class:`a2rl.WiDataFrame`.

    Args:
        dirpath: Path to the dataset directory.
        *args: Positional arguments passed as-is to :func:`pandas.read_csv`.
        test_mdp: When ``True``, perform a Markovian self-check on the loaded dataframe.
            Raises :exc:`a2rl.utils.NotMDPDataError` if the check fails.
        low_memory: If ``False``, read the entire .csv payload at once. If ``True``, internally
            process the .csv payload in chunks. This argument is passed as-is to
            :func:`pandas.read_csv`; however, note that we default to ``False``, which is the
            opposite of the default in :func:`pandas.read_csv`.
        **kwargs: Keyword arguments passed as-is to :func:`pandas.read_csv`.

    Returns:
        The loaded dataset.

    See Also
    --------
    sample_dataset_path
    WiDataFrame.to_csv_dataset

    Examples:
        .. code-block:: python

            >>> import a2rl as wi
            >>> p = wi.sample_dataset_path('chiller')
            >>> df = wi.read_csv_dataset(p)
            >>> df.info()  # doctest: +NORMALIZE_WHITESPACE
            <class 'a2rl._dataframe.WiDataFrame'>
            RangeIndex: 9153 entries, 0 to 9152
            Data columns (total 5 columns):
             #   Column                    Non-Null Count  Dtype
            ---  ------                    --------------  -----
             0   timestamp                 9153 non-null   object
             1   staging                   9153 non-null   object
             2   condenser_inlet_temp      9153 non-null   float64
             3   evaporator_heat_load_rt   9153 non-null   float64
             4   system_power_consumption  9153 non-null   float64
            dtypes: float64(3), object(2)
            memory usage: ... KB
    """
    p = dirpath if isinstance(dirpath, Path) else Path(dirpath)
    metadata = read_metadata(p / "metadata.yaml")
    files = p.glob("**/*.csv")
    kwargs["low_memory"] = low_memory
    if metadata.forced_categories:
        # Read the forced-categorical columns as strings, so that numeric-looking
        # codes are not parsed into numbers.
        kwargs["dtype"] = {col: str for col in metadata.forced_categories}
    dfs = {str(fpath): pd.read_csv(fpath, *args, **kwargs) for fpath in files}
    df = wi.WiDataFrame(
        pd.concat(dfs.values()),
        states=metadata.states,
        actions=metadata.actions,
        rewards=metadata.rewards,
    )
    if test_mdp:
        tokeniser = wi.DiscreteTokenizer(n_bins=50)
        df_tok = tokeniser.fit_transform(df.trim())
        wi.utils.assert_mdp(df_tok)
    return df
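
# Illustrative sketch (hypothetical helper, not part of the a2rl API): because
# forced_categories is mapped to ``dtype=str`` in pandas.read_csv above,
# numeric-looking codes in those columns survive as strings (dtype ``object``)
# instead of being parsed as numbers.
def _example_forced_categories(dirpath: str | PathLike[str]) -> None:
    """Print the dtypes of a dataset's forced-categorical columns."""
    metadata = read_metadata(Path(dirpath) / "metadata.yaml")
    df = read_csv_dataset(dirpath)
    if metadata.forced_categories:
        print(df[metadata.forced_categories].dtypes)  # expect: object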