Module sagemaker_defect_detection.classifier

View Source
# mypy: ignore-errors

from typing import Dict

import os

from collections import OrderedDict

from argparse import ArgumentParser, Namespace

from multiprocessing import cpu_count

import torch

import torch.optim as optim

import torch.nn.functional as F

from torch.utils.data import DataLoader

import pytorch_lightning as pl

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import pytorch_lightning.metrics.functional as plm

from sagemaker_defect_detection import Classification, NEUCLS, get_transform

from sagemaker_defect_detection.utils import load_checkpoint, freeze

def metrics(name: str, out: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:

    pred = torch.argmax(out, 1).detach()

    target = target.detach()

    metrics = {}

    metrics[name + "_acc"] = plm.accuracy(pred, target)

    metrics[name + "_prec"] = plm.precision(pred, target)

    metrics[name + "_recall"] = plm.recall(pred, target)

    metrics[name + "_f1_score"] = plm.recall(pred, target)

    return metrics

class DDNClassification(pl.LightningModule):

    def __init__(

        self,

        data_path: str,

        backbone: str,

        freeze_backbone: bool,

        num_classes: int,

        learning_rate: float,

        batch_size: int,

        momentum: float,

        weight_decay: float,

        seed: int,

        **kwargs

    ) -> None:

        super().__init__()

        self.data_path = data_path

        self.backbone = backbone

        self.freeze_backbone = freeze_backbone

        self.num_classes = num_classes

        self.learning_rate = learning_rate

        self.batch_size = batch_size

        self.momentum = momentum

        self.weight_decay = weight_decay

        self.seed = seed

        self.train_dataset = NEUCLS(self.data_path, split="train", transform=get_transform("train"), seed=self.seed)

        self.val_dataset = NEUCLS(self.data_path, split="val", transform=get_transform("val"), seed=self.seed)

        self.test_dataset = NEUCLS(self.data_path, split="test", transform=get_transform("test"), seed=self.seed)

        self.model = Classification(self.backbone, self.num_classes)

        if self.freeze_backbone:

            for param in self.model.mfn.backbone.parameters():

                param.requires_grad = False

    def forward(self, x):  # ignore

        return self.model(x)

    def training_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("train", output, target)

        tqdm_dict = {"train_loss": loss_val, **metrics_dict}

        output = OrderedDict({"loss": loss_val, "progress_bar": tqdm_dict, "log": tqdm_dict})

        return output

    def validation_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("val", output, target)

        output = OrderedDict({"val_loss": loss_val, **metrics_dict})

        return output

    def validation_epoch_end(self, outputs):

        log_dict = {}

        for metric_name in outputs[0]:

            log_dict[metric_name] = torch.stack([x[metric_name] for x in outputs]).mean()

        return {"log": log_dict, "progress_bar": log_dict, **log_dict}

    def test_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("test", output, target)

        output = OrderedDict({"test_loss": loss_val, **metrics_dict})

        return output

    def test_epoch_end(self, outputs):

        log_dict = {}

        for metric_name in outputs[0]:

            log_dict[metric_name] = torch.stack([x[metric_name] for x in outputs]).mean()

        return {"log": log_dict, "progress_bar": log_dict, **log_dict}

    def configure_optimizers(self):

        optimizer = optim.SGD(

            self.parameters(), lr=self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay

        )

        return optimizer

    def train_dataloader(self):

        train_loader = DataLoader(

            dataset=self.train_dataset,

            batch_size=self.batch_size,

            shuffle=True,

            num_workers=cpu_count(),

        )

        return train_loader

    def val_dataloader(self):

        val_loader = DataLoader(

            self.val_dataset,

            batch_size=self.batch_size,

            shuffle=False,

            num_workers=cpu_count() // 2,

        )

        return val_loader

    def test_dataloader(self):

        test_loader = DataLoader(

            self.test_dataset,

            batch_size=self.batch_size,

            shuffle=False,

            num_workers=cpu_count(),

        )

        return test_loader

    @staticmethod

    def add_model_specific_args(parent_parser):  # pragma: no-cover

        parser = ArgumentParser(parents=[parent_parser], add_help=False)

        aa = parser.add_argument

        aa(

            "--data-path",

            metavar="DIR",

            type=str,

            default=os.getenv("SM_CHANNEL_TRAINING", ""),

        )

        aa(

            "--backbone",

            default="resnet34",

        )

        aa(

            "--freeze-backbone",

            action="store_true",

        )

        aa(

            "--num-classes",

            default=6,

            type=int,

            metavar="N",

        )

        aa(

            "-b",

            "--batch-size",

            default=64,

            type=int,

            metavar="N",

        )

        aa(

            "--lr",

            "--learning-rate",

            default=1e-3,

            type=float,

            metavar="LR",

            dest="learning_rate",

        )

        aa("--momentum", default=0.9, type=float, metavar="M", help="momentum")

        aa(

            "--wd",

            "--weight-decay",

            default=1e-4,

            type=float,

            metavar="W",

            dest="weight_decay",

        )

        aa(

            "--seed",

            type=int,

            default=42,

        )

        return parser

def get_args() -> Namespace:

    parent_parser = ArgumentParser(add_help=False)

    aa = parent_parser.add_argument

    aa("--epochs", type=int, default=100, help="number of training epochs")

    aa("--save-path", metavar="DIR", default=os.getenv("SM_MODEL_DIR", ""), type=str, help="path to save output")

    aa("--gpus", type=int, default=os.getenv("SM_NUM_GPUS", 1), help="how many gpus")

    aa(

        "--distributed-backend",

        type=str,

        default="",

        choices=("dp", "ddp", "ddp2"),

        help="supports three options dp, ddp, ddp2",

    )

    aa("--use-16bit", dest="use_16bit", action="store_true", help="if true uses 16 bit precision")

    parser = DDNClassification.add_model_specific_args(parent_parser)

    return parser.parse_args()

def model_fn(model_dir):

    # TODO: `model_fn` doesn't get more args

    # see: https://github.com/aws/sagemaker-inference-toolkit/issues/65

    backbone = "resnet34"

    num_classes = 6

    model = load_checkpoint(Classification(backbone, num_classes), model_dir, prefix="model")

    model = model.eval()

    freeze(model)

    return model

def main(args: Namespace) -> None:

    model = DDNClassification(**vars(args))

    if args.seed is not None:

        pl.seed_everything(args.seed)

        if torch.cuda.device_count() > 1:

            torch.cuda.manual_seed_all(args.seed)

    # TODO: add deterministic training

    # torch.backends.cudnn.deterministic = True

    checkpoint_callback = ModelCheckpoint(

        filepath=os.path.join(args.save_path, "{epoch}-{val_loss:.3f}-{val_acc:.3f}"),

        save_top_k=1,

        verbose=True,

        monitor="val_acc",

        mode="max",

    )

    early_stop_callback = EarlyStopping("val_loss", patience=10)

    trainer = pl.Trainer(

        default_root_dir=args.save_path,

        gpus=args.gpus,

        max_epochs=args.epochs,

        early_stop_callback=early_stop_callback,

        checkpoint_callback=checkpoint_callback,

        gradient_clip_val=10,

        num_sanity_val_steps=0,

        distributed_backend=args.distributed_backend or None,

        # precision=16 if args.use_16bit else 32, # TODO: amp apex support

    )

    trainer.fit(model)

    trainer.test()

    return

if __name__ == "__main__":

    main(get_args())

Functions

get_args

def get_args(

) -> argparse.Namespace
View Source
def get_args() -> Namespace:

    parent_parser = ArgumentParser(add_help=False)

    aa = parent_parser.add_argument

    aa("--epochs", type=int, default=100, help="number of training epochs")

    aa("--save-path", metavar="DIR", default=os.getenv("SM_MODEL_DIR", ""), type=str, help="path to save output")

    aa("--gpus", type=int, default=os.getenv("SM_NUM_GPUS", 1), help="how many gpus")

    aa(

        "--distributed-backend",

        type=str,

        default="",

        choices=("dp", "ddp", "ddp2"),

        help="supports three options dp, ddp, ddp2",

    )

    aa("--use-16bit", dest="use_16bit", action="store_true", help="if true uses 16 bit precision")

    parser = DDNClassification.add_model_specific_args(parent_parser)

    return parser.parse_args()
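
A minimal local-run sketch of the resulting parser (flag values and the dataset path are illustrative; by default --data-path and --save-path fall back to the SageMaker environment variables SM_CHANNEL_TRAINING and SM_MODEL_DIR)::

    import sys

    # Hypothetical local invocation, equivalent to:
    #   python classifier.py --data-path ./NEU-CLS --save-path ./output --epochs 5 --gpus 0 -b 32
    sys.argv = [
        "classifier.py",
        "--data-path", "./NEU-CLS",   # placeholder path to a local copy of the dataset
        "--save-path", "./output",
        "--epochs", "5",
        "--gpus", "0",
        "-b", "32",
    ]
    args = get_args()
    print(args.backbone, args.learning_rate)  # defaults: resnet34 0.001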

main

def main(
    args: argparse.Namespace
) -> None
View Source
def main(args: Namespace) -> None:

    model = DDNClassification(**vars(args))

    if args.seed is not None:

        pl.seed_everything(args.seed)

        if torch.cuda.device_count() > 1:

            torch.cuda.manual_seed_all(args.seed)

    # TODO: add deterministic training

    # torch.backends.cudnn.deterministic = True

    checkpoint_callback = ModelCheckpoint(

        filepath=os.path.join(args.save_path, "{epoch}-{val_loss:.3f}-{val_acc:.3f}"),

        save_top_k=1,

        verbose=True,

        monitor="val_acc",

        mode="max",

    )

    early_stop_callback = EarlyStopping("val_loss", patience=10)

    trainer = pl.Trainer(

        default_root_dir=args.save_path,

        gpus=args.gpus,

        max_epochs=args.epochs,

        early_stop_callback=early_stop_callback,

        checkpoint_callback=checkpoint_callback,

        gradient_clip_val=10,

        num_sanity_val_steps=0,

        distributed_backend=args.distributed_backend or None,

        # precision=16 if args.use_16bit else 32, # TODO: amp apex support

    )

    trainer.fit(model)

    trainer.test()

    return

metrics

def metrics(
    name: str,
    out: torch.Tensor,
    target: torch.Tensor
) -> Dict[str, torch.Tensor]
View Source
def metrics(name: str, out: torch.Tensor, target: torch.Tensor) -> Dict[str, torch.Tensor]:

    pred = torch.argmax(out, 1).detach()

    target = target.detach()

    metrics = {}

    metrics[name + "_acc"] = plm.accuracy(pred, target)

    metrics[name + "_prec"] = plm.precision(pred, target)

    metrics[name + "_recall"] = plm.recall(pred, target)

    metrics[name + "_f1_score"] = plm.recall(pred, target)

    return metrics
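
An illustrative call with random tensors (shapes are assumptions: `out` is a batch of raw logits from the classifier and `target` holds integer class labels)::

    import torch

    out = torch.randn(8, 6)              # 8 samples, 6 defect classes
    target = torch.randint(0, 6, (8,))   # ground-truth class indices
    m = metrics("val", out, target)
    # m has keys val_acc, val_prec, val_recall and val_f1_score, each a scalar tensor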

model_fn

def model_fn(
    model_dir
)
View Source
def model_fn(model_dir):

    # TODO: `model_fn` doesn't get more args

    # see: https://github.com/aws/sagemaker-inference-toolkit/issues/65

    backbone = "resnet34"

    num_classes = 6

    model = load_checkpoint(Classification(backbone, num_classes), model_dir, prefix="model")

    model = model.eval()

    freeze(model)

    return model
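
model_fn is the SageMaker inference-toolkit entry point and receives the directory holding the extracted model artifact. A rough local sketch (the directory and the dummy input are placeholders; real requests are preprocessed with get_transform before reaching the model)::

    import torch

    model = model_fn("/opt/ml/model")      # directory containing the trained checkpoint
    with torch.no_grad():
        x = torch.randn(1, 3, 224, 224)    # dummy image tensor; the size is an assumption
        pred = model(x).argmax(dim=1)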

Classes

DDNClassification

class DDNClassification(
    data_path: str,
    backbone: str,
    freeze_backbone: bool,
    num_classes: int,
    learning_rate: float,
    batch_size: int,
    momentum: float,
    weight_decay: float,
    seed: int,
    **kwargs
)
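
A minimal construction sketch (hyperparameter values are illustrative; during training they come from the parsed command-line arguments, as in main above)::

    module = DDNClassification(
        data_path="./NEU-CLS",   # placeholder dataset location
        backbone="resnet34",
        freeze_backbone=False,
        num_classes=6,
        learning_rate=1e-3,
        batch_size=64,
        momentum=0.9,
        weight_decay=1e-4,
        seed=42,
    )
    # trainer = pl.Trainer(max_epochs=1)
    # trainer.fit(module)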

Ancestors (in MRO)

  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • torch.nn.modules.module.Module

Class variables

CHECKPOINT_HYPER_PARAMS_KEY
CHECKPOINT_HYPER_PARAMS_NAME
CHECKPOINT_HYPER_PARAMS_TYPE
T_destination
dump_patches

Static methods

add_model_specific_args

def add_model_specific_args(
    parent_parser
)
View Source
    @staticmethod

    def add_model_specific_args(parent_parser):  # pragma: no-cover

        parser = ArgumentParser(parents=[parent_parser], add_help=False)

        aa = parser.add_argument

        aa(

            "--data-path",

            metavar="DIR",

            type=str,

            default=os.getenv("SM_CHANNEL_TRAINING", ""),

        )

        aa(

            "--backbone",

            default="resnet34",

        )

        aa(

            "--freeze-backbone",

            action="store_true",

        )

        aa(

            "--num-classes",

            default=6,

            type=int,

            metavar="N",

        )

        aa(

            "-b",

            "--batch-size",

            default=64,

            type=int,

            metavar="N",

        )

        aa(

            "--lr",

            "--learning-rate",

            default=1e-3,

            type=float,

            metavar="LR",

            dest="learning_rate",

        )

        aa("--momentum", default=0.9, type=float, metavar="M", help="momentum")

        aa(

            "--wd",

            "--weight-decay",

            default=1e-4,

            type=float,

            metavar="W",

            dest="weight_decay",

        )

        aa(

            "--seed",

            type=int,

            default=42,

        )

        return parser

load_from_checkpoint

def load_from_checkpoint(
    checkpoint_path: str,
    *args,
    map_location: Union[Dict[str, str], str, torch.device, int, Callable, NoneType] = None,
    hparams_file: Union[str, NoneType] = None,
    tags_csv: Union[str, NoneType] = None,
    **kwargs
)

Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint

it stores the arguments passed to __init__ in the checkpoint under module_arguments

Any arguments specified through *args and **kwargs will override args stored in hparams.

Parameters:

Name Type Description Default
checkpoint_path None Path to checkpoint. This can also be a URL. None
args None Any positional args needed to init the model. None
map_location None If your checkpoint saved a GPU model and you now load on CPUs
or a different number of GPUs, use this to map to the new setup.
The behaviour is the same as in :func:`torch.load`. None
hparams_file None Optional path to a .yaml file with hierarchical structure
as in this example::

    drop_prob: 0.2
    dataloader:
        batch_size: 32

You most likely won't need this since Lightning will always save the hyperparameters
to the checkpoint. However, if your checkpoint weights don't have the hyperparameters
saved, use this method to pass in a .yaml file with the hparams you'd like to use.
These will be converted into a :class:`~dict` and passed into your
:class:`LightningModule` for use. If your model's `hparams` argument is
:class:`~argparse.Namespace` and the .yaml file has hierarchical structure, you need
to refactor your model to treat `hparams` as :class:`~dict`.
.csv files are acceptable here till v0.9.0, see the `tags_csv` argument for detailed usage. None
tags_csv None Deprecated in v0.7.6; will be removed in v0.9.0. Optional path to a .csv file
with two columns (key, value) as in this example::

    key,value
    drop_prob,0.2
    batch_size,32

Use this method to pass in a .csv file with the hparams you'd like to use. None
hparam_overrides None A dictionary with keys to override in the hparams. None
kwargs None Any keyword args needed to init the model. None

Return:

:class:`LightningModule` with loaded weights and hyperparameters (if available).

View Source
    @classmethod

    def load_from_checkpoint(

            cls,

            checkpoint_path: str,

            *args,

            map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None,

            hparams_file: Optional[str] = None,

            tags_csv: Optional[str] = None,  # backward compatible, todo: remove in v0.9.0

            **kwargs

    ):

        r"""

        Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint

        it stores the arguments passed to `__init__`  in the checkpoint under `module_arguments`

        Any arguments specified through \*args and \*\*kwargs will override args stored in `hparams`.

        Args:

            checkpoint_path: Path to checkpoint. This can also be a URL.

            args: Any positional args needed to init the model.

            map_location:

                If your checkpoint saved a GPU model and you now load on CPUs

                or a different number of GPUs, use this to map to the new setup.

                The behaviour is the same as in :func:`torch.load`.

            hparams_file: Optional path to a .yaml file with hierarchical structure

                as in this example::

                    drop_prob: 0.2

                    dataloader:

                        batch_size: 32

                You most likely won't need this since Lightning will always save the hyperparameters

                to the checkpoint.

                However, if your checkpoint weights don't have the hyperparameters saved,

                use this method to pass in a .yaml file with the hparams you'd like to use.

                These will be converted into a :class:`~dict` and passed into your

                :class:`LightningModule` for use.

                If your model's `hparams` argument is :class:`~argparse.Namespace`

                and .yaml file has hierarchical structure, you need to refactor your model to treat

                `hparams` as :class:`~dict`.

                .csv files are acceptable here till v0.9.0, see tags_csv argument for detailed usage.

            tags_csv:

                .. warning:: .. deprecated:: 0.7.6

                    `tags_csv` argument is deprecated in v0.7.6. Will be removed v0.9.0.

                Optional path to a .csv file with two columns (key, value)

                as in this example::

                    key,value

                    drop_prob,0.2

                    batch_size,32

                Use this method to pass in a .csv file with the hparams you'd like to use.

            hparam_overrides: A dictionary with keys to override in the hparams

            kwargs: Any keyword args needed to init the model.

        Return:

            :class:`LightningModule` with loaded weights and hyperparameters (if available).

        Example:

            .. code-block:: python

                # load weights without mapping ...

                MyLightningModule.load_from_checkpoint('path/to/checkpoint.ckpt')

                # or load weights mapping all weights from GPU 1 to GPU 0 ...

                map_location = {'cuda:1':'cuda:0'}

                MyLightningModule.load_from_checkpoint(

                    'path/to/checkpoint.ckpt',

                    map_location=map_location

                )

                # or load weights and hyperparameters from separate files.

                MyLightningModule.load_from_checkpoint(

                    'path/to/checkpoint.ckpt',

                    hparams_file='/path/to/hparams_file.yaml'

                )

                # override some of the params with new values

                MyLightningModule.load_from_checkpoint(

                    PATH,

                    num_layers=128,

                    pretrained_ckpt_path: NEW_PATH,

                )

                # predict

                pretrained_model.eval()

                pretrained_model.freeze()

                y_hat = pretrained_model(x)

        """

        if map_location is not None:

            checkpoint = pl_load(checkpoint_path, map_location=map_location)

        else:

            checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)

        # add the hparams from csv file to checkpoint

        if tags_csv is not None:

            hparams_file = tags_csv

            rank_zero_warn('`tags_csv` argument is deprecated in v0.7.6. Will be removed v0.9.0', DeprecationWarning)

        if hparams_file is not None:

            extension = hparams_file.split('.')[-1]

            if extension.lower() in ('csv'):

                hparams = load_hparams_from_tags_csv(hparams_file)

            elif extension.lower() in ('yml', 'yaml'):

                hparams = load_hparams_from_yaml(hparams_file)

            else:

                raise ValueError('.csv, .yml or .yaml is required for `hparams_file`')

            hparams['on_gpu'] = False

            # overwrite hparams by the given file

            checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = hparams

        # for past checkpoint need to add the new key

        if cls.CHECKPOINT_HYPER_PARAMS_KEY not in checkpoint:

            checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = {}

        # override the hparams with values that were passed in

        checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].update(kwargs)

        model = cls._load_model_state(checkpoint, *args, **kwargs)

        return model

load_from_metrics

def load_from_metrics(
    weights_path,
    tags_csv,
    map_location=None
)

Warning:

Deprecated in version 0.7.0. You should use :meth:load_from_checkpoint instead. Will be removed in v0.9.0.

View Source
    @classmethod

    def load_from_metrics(cls, weights_path, tags_csv, map_location=None):

        r"""

        Warning:

            Deprecated in version 0.7.0. You should use :meth:`load_from_checkpoint` instead.

            Will be removed in v0.9.0.

        """

        rank_zero_warn(

            "`load_from_metrics` method has been unified with `load_from_checkpoint` in v0.7.0."

            " The deprecated method will be removed in v0.9.0.", DeprecationWarning

        )

        return cls.load_from_checkpoint(weights_path, tags_csv=tags_csv, map_location=map_location)

Instance variables

device
dtype
example_input_array
hparams
on_gpu

True if your model is currently running on GPUs.

Useful to set flags around the LightningModule for different CPU vs GPU behavior.

Methods

add_module

def add_module(
    self,
    name: str,
    module: 'Module'
) -> None

Adds a child module to the current module.

The module can be accessed as an attribute using the given name.

Parameters:

Name Type Description Default
name string name of the child module. The child module can be
accessed from this module using the given name None
module Module child module to be added to the module. None
View Source
    def add_module(self, name: str, module: 'Module') -> None:

        r"""Adds a child module to the current module.

        The module can be accessed as an attribute using the given name.

        Args:

            name (string): name of the child module. The child module can be

                accessed from this module using the given name

            module (Module): child module to be added to the module.

        """

        if not isinstance(module, Module) and module is not None:

            raise TypeError("{} is not a Module subclass".format(

                torch.typename(module)))

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("module name should be a string. Got {}".format(

                torch.typename(name)))

        elif hasattr(self, name) and name not in self._modules:

            raise KeyError("attribute '{}' already exists".format(name))

        elif '.' in name:

            raise KeyError("module name can't contain \".\"")

        elif name == '':

            raise KeyError("module name can't be empty string \"\"")

        self._modules[name] = module
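
A small illustrative use, registering a linear head on a bare module so that it appears in named_parameters() and state_dict()::

    import torch.nn as nn

    net = nn.Module()
    net.add_module("head", nn.Linear(512, 6))
    assert "head.weight" in dict(net.named_parameters())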

amp_scale_loss

def amp_scale_loss(
    self,
    unscaled_loss,
    optimizer,
    optimizer_idx
)
View Source
    def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):

        if NATIVE_AMP_AVALAIBLE:

            scaled_loss = self.trainer.scaler.scale(unscaled_loss)

        else:

            scaled_loss = amp.scale_loss(unscaled_loss, optimizer)

        return scaled_loss

apply

def apply(
    self: ~T,
    fn: Callable[[ForwardRef('Module')], NoneType]
) -> ~T

Applies fn recursively to every submodule (as returned by .children())

as well as self. Typical use includes initializing the parameters of a model (see also :ref:nn-init-doc).

Parameters:

Name Type Description Default
fn None (:class:`Module` -> None): function to be applied to each submodule None

Returns:

Type Description
Module self

Example::

>>> @torch.no_grad()
>>> def init_weights(m):
>>>     print(m)
>>>     if type(m) == nn.Linear:
>>>         m.weight.fill_(1.0)
>>>         print(m.weight)
>>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
>>> net.apply(init_weights)
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[ 1.,  1.],
        [ 1.,  1.]])
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[ 1.,  1.],
        [ 1.,  1.]])
Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
View Source
    def apply(self: T, fn: Callable[['Module'], None]) -> T:

        r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``)

        as well as self. Typical use includes initializing the parameters of a model

        (see also :ref:`nn-init-doc`).

        Args:

            fn (:class:`Module` -> None): function to be applied to each submodule

        Returns:

            Module: self

        Example::

            >>> @torch.no_grad()

            >>> def init_weights(m):

            >>>     print(m)

            >>>     if type(m) == nn.Linear:

            >>>         m.weight.fill_(1.0)

            >>>         print(m.weight)

            >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))

            >>> net.apply(init_weights)

            Linear(in_features=2, out_features=2, bias=True)

            Parameter containing:

            tensor([[ 1.,  1.],

                    [ 1.,  1.]])

            Linear(in_features=2, out_features=2, bias=True)

            Parameter containing:

            tensor([[ 1.,  1.],

                    [ 1.,  1.]])

            Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

            Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

        """

        for module in self.children():

            module.apply(fn)

        fn(self)

        return self

backward

def backward(
    self,
    trainer,
    loss: torch.Tensor,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int
) -> None

Override backward with your own implementation if you need to.

Parameters:

Name Type Description Default
trainer None Pointer to the trainer None
loss None Loss is already scaled by accumulated grads None
optimizer None Current optimizer being used None
optimizer_idx None Index of the current optimizer being used None

Called to perform backward step. Feel free to override as needed.
The loss passed in has already been scaled for accumulated gradients if requested.

Example::

    def backward(self, trainer, loss, optimizer, optimizer_idx):
        loss.backward()

View Source
    def backward(self, trainer, loss: Tensor, optimizer: Optimizer, optimizer_idx: int) -> None:

        """

        Override backward with your own implementation if you need to.

        Args:

            trainer: Pointer to the trainer

            loss: Loss is already scaled by accumulated grads

            optimizer: Current optimizer being used

            optimizer_idx: Index of the current optimizer being used

        Called to perform backward step.

        Feel free to override as needed.

        The loss passed in has already been scaled for accumulated gradients if requested.

        Example::

            def backward(self, trainer, loss, optimizer, optimizer_idx):

                loss.backward()

        """

        loss.backward()

bfloat16

def bfloat16(
    self: ~T
) -> ~T

Casts all floating point parameters and buffers to bfloat16 datatype.

Returns:

Type Description
Module self
View Source
    def bfloat16(self: T) -> T:

        r"""Casts all floating point parameters and buffers to ``bfloat16`` datatype.

        Returns:

            Module: self

        """

        return self._apply(lambda t: t.bfloat16() if t.is_floating_point() else t)

buffers

def buffers(
    self,
    recurse: bool = True
) -> Iterator[torch.Tensor]

Returns an iterator over module buffers.

Parameters:

Name Type Description Default
recurse bool if True, then yields buffers of this module
and all submodules. Otherwise, yields only buffers that
are direct members of this module. None

Yields:

Type Description
torch.Tensor module buffer

Example::

>>> for buf in model.buffers():
>>>     print(type(buf), buf.size())
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L)
View Source
    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:

        r"""Returns an iterator over module buffers.

        Args:

            recurse (bool): if True, then yields buffers of this module

                and all submodules. Otherwise, yields only buffers that

                are direct members of this module.

        Yields:

            torch.Tensor: module buffer

        Example::

            >>> for buf in model.buffers():

            >>>     print(type(buf), buf.size())

            <class 'torch.Tensor'> (20L,)

            <class 'torch.Tensor'> (20L, 1L, 5L, 5L)

        """

        for name, buf in self.named_buffers(recurse=recurse):

            yield buf

children

def children(
    self
) -> Iterator[ForwardRef('Module')]

Returns an iterator over immediate children modules.

Yields:

Type Description
Module a child module
View Source
    def children(self) -> Iterator['Module']:

        r"""Returns an iterator over immediate children modules.

        Yields:

            Module: a child module

        """

        for name, module in self.named_children():

            yield module

configure_apex

def configure_apex(
    self,
    amp: object,
    model: 'LightningModule',
    optimizers: List[torch.optim.optimizer.Optimizer],
    amp_level: str
) -> Tuple[ForwardRef('LightningModule'), List[torch.optim.optimizer.Optimizer]]

Override to init AMP your own way. Must return a model and list of optimizers.

Args:

    amp: pointer to amp library object.
    model: pointer to current :class:`LightningModule`.
    optimizers: list of optimizers passed in :meth:`configure_optimizers`.
    amp_level: AMP mode chosen ('O1', 'O2', etc...)

Return: Apex wrapped model and optimizers

Examples:

.. code-block:: python

    # Default implementation used by Trainer.
    def configure_apex(self, amp, model, optimizers, amp_level):
        model, optimizers = amp.initialize(
            model, optimizers, opt_level=amp_level,
        )

        return model, optimizers
View Source
    def configure_apex(

            self,

            amp: object,

            model: 'LightningModule',

            optimizers: List[Optimizer],

            amp_level: str

    ) -> Tuple['LightningModule', List[Optimizer]]:

        r"""

        Override to init AMP your own way.

        Must return a model and list of optimizers.

        Args:

            amp: pointer to amp library object.

            model: pointer to current :class:`LightningModule`.

            optimizers: list of optimizers passed in :meth:`configure_optimizers`.

            amp_level: AMP mode chosen ('O1', 'O2', etc...)

        Return:

            Apex wrapped model and optimizers

        Examples:

            .. code-block:: python

                # Default implementation used by Trainer.

                def configure_apex(self, amp, model, optimizers, amp_level):

                    model, optimizers = amp.initialize(

                        model, optimizers, opt_level=amp_level,

                    )

                    return model, optimizers

        """

        model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)

        return model, optimizers

configure_ddp

def configure_ddp(
    self,
    model: 'LightningModule',
    device_ids: List[int]
) -> torch.nn.parallel.distributed.DistributedDataParallel

Override to init DDP in your own way or with your own wrapper. The only requirements are that:

  1. On a validation batch the call goes to model.validation_step.
  2. On a training batch the call goes to model.training_step.
  3. On a testing batch, the call goes to model.test_step.

Args:

    model: the :class:`LightningModule` currently being optimized.
    device_ids: the list of GPU ids.

Return: DDP wrapped model

Examples:

.. code-block:: python

    # default implementation used in Trainer
    def configure_ddp(self, model, device_ids):
        # Lightning DDP simply routes to test_step, val_step, etc...
        model = LightningDistributedDataParallel(
            model,
            device_ids=device_ids,
            find_unused_parameters=True
        )
        return model
View Source
    def configure_ddp(

            self,

            model: 'LightningModule',

            device_ids: List[int]

    ) -> DistributedDataParallel:

        r"""

        Override to init DDP in your own way or with your own wrapper.

        The only requirements are that:

        1. On a validation batch the call goes to ``model.validation_step``.

        2. On a training batch the call goes to ``model.training_step``.

        3. On a testing batch, the call goes to ``model.test_step``.+

        Args:

            model: the :class:`LightningModule` currently being optimized.

            device_ids: the list of GPU ids.

        Return:

            DDP wrapped model

        Examples:

            .. code-block:: python

                # default implementation used in Trainer

                def configure_ddp(self, model, device_ids):

                    # Lightning DDP simply routes to test_step, val_step, etc...

                    model = LightningDistributedDataParallel(

                        model,

                        device_ids=device_ids,

                        find_unused_parameters=True

                    )

                    return model

        """

        model = LightningDistributedDataParallel(

            model,

            device_ids=device_ids,

            find_unused_parameters=True

        )

        return model

configure_optimizers

def configure_optimizers(
    self
)
View Source
    def configure_optimizers(self):

        optimizer = optim.SGD(

            self.parameters(), lr=self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay

        )

        return optimizer

cpu

def cpu(
    self
) -> torch.nn.modules.module.Module

Moves all model parameters and buffers to the CPU.

Returns:

Type Description
Module self
View Source
    def cpu(self) -> Module:

        """Moves all model parameters and buffers to the CPU.

        Returns:

            Module: self

        """

        self._device = torch.device('cpu')

        return super().cpu()

cuda

def cuda(
    self,
    device: Union[int, NoneType] = None
) -> torch.nn.modules.module.Module

Moves all model parameters and buffers to the GPU.

This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized.

Parameters:

Name Type Description Default
device None if specified, all parameters will be
copied to that device None

Returns:

Type Description
Module self
View Source
    def cuda(self, device: Optional[int] = None) -> Module:

        """Moves all model parameters and buffers to the GPU.

        This also makes associated parameters and buffers different objects. So

        it should be called before constructing optimizer if the module will

        live on GPU while being optimized.

        Arguments:

            device: if specified, all parameters will be

                copied to that device

        Returns:

            Module: self

        """

        self._device = torch.device('cuda', index=device)

        return super().cuda(device=device)

double

def double(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to double datatype.

Returns:

Type Description
Module self
View Source
    def double(self) -> Module:

        """Casts all floating point parameters and buffers to ``double`` datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.double

        return super().double()

eval

def eval(
    self: ~T
) -> ~T

Sets the module in evaluation mode.

This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:Dropout, :class:BatchNorm, etc.

This is equivalent with :meth:self.train(False) <torch.nn.Module.train>.

Returns:

Type Description
Module self
View Source
    def eval(self: T) -> T:

        r"""Sets the module in evaluation mode.

        This has any effect only on certain modules. See documentations of

        particular modules for details of their behaviors in training/evaluation

        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,

        etc.

        This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.

        Returns:

            Module: self

        """

        return self.train(False)

extra_repr

def extra_repr(
    self
) -> str

Set the extra representation of the module

To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

View Source
    def extra_repr(self) -> str:

        r"""Set the extra representation of the module

        To print customized extra information, you should reimplement

        this method in your own modules. Both single-line and multi-line

        strings are acceptable.

        """

        return ''

float

def float(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to float datatype.

Returns:

Type Description
Module self
View Source
    def float(self) -> Module:

        """Casts all floating point parameters and buffers to float datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.float

        return super().float()

forward

def forward(
    self,
    x
)
View Source
    def forward(self, x):  # ignore

        return self.model(x)

freeze

def freeze(
    self
) -> None

Freeze all params for inference.

View Source
    def freeze(self) -> None:

        r"""

        Freeze all params for inference.

        Example:

            .. code-block:: python

                model = MyLightningModule(...)

                model.freeze()

        """

        for param in self.parameters():

            param.requires_grad = False

        self.eval()

get_progress_bar_dict

def get_progress_bar_dict(
    self
) -> Dict[str, Union[int, str]]

Additional items to be displayed in the progress bar.

Return: Dictionary with the items to be displayed in the progress bar.

View Source
    def get_progress_bar_dict(self) -> Dict[str, Union[int, str]]:

        r"""

        Additional items to be displayed in the progress bar.

        Return:

            Dictionary with the items to be displayed in the progress bar.

        """

        # call .item() only once but store elements without graphs

        running_train_loss = self.trainer.running_loss.mean()

        avg_training_loss = running_train_loss.cpu().item() if running_train_loss is not None else float('NaN')

        tqdm_dict = {

            'loss': '{:.3f}'.format(avg_training_loss)

        }

        if self.trainer.truncated_bptt_steps is not None:

            tqdm_dict['split_idx'] = self.trainer.split_idx

        if self.trainer.logger is not None and self.trainer.logger.version is not None:

            tqdm_dict['v_num'] = self.trainer.logger.version

        return tqdm_dict

get_tqdm_dict

def get_tqdm_dict(
    self
) -> Dict[str, Union[int, str]]

Additional items to be displayed in the progress bar.

Return: Dictionary with the items to be displayed in the progress bar.

Warning: Deprecated since v0.7.3. Use :meth:get_progress_bar_dict instead.

View Source
    def get_tqdm_dict(self) -> Dict[str, Union[int, str]]:

        """

        Additional items to be displayed in the progress bar.

        Return:

            Dictionary with the items to be displayed in the progress bar.

        Warning:

            Deprecated since v0.7.3.

            Use :meth:`get_progress_bar_dict` instead.

        """

        rank_zero_warn("`get_tqdm_dict` was renamed to `get_progress_bar_dict` in v0.7.3"

                       " and this method will be removed in v1.0.0", DeprecationWarning)

        return self.get_progress_bar_dict()

grad_norm

def grad_norm(
    self,
    norm_type: Union[float, int, str]
) -> Dict[str, float]

Compute each parameter's gradient's norm and their overall norm.

The overall norm is computed over all gradients together, as if they were concatenated into a single vector.

Parameters:

Name Type Description Default
norm_type None The type of the used p-norm, cast to float if necessary.
Can be 'inf' for infinity norm. None

Return:

norms: The dictionary of p-norms of each parameter's gradient and a special entry for the total p-norm of the gradients viewed as a single vector.

View Source
    def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]:

        """Compute each parameter's gradient's norm and their overall norm.

        The overall norm is computed over all gradients together, as if they

        were concatenated into a single vector.

        Args:

            norm_type: The type of the used p-norm, cast to float if necessary.

                Can be ``'inf'`` for infinity norm.

        Return:

            norms: The dictionary of p-norms of each parameter's gradient and

                a special entry for the total p-norm of the gradients viewed

                as a single vector.

        """

        norm_type = float(norm_type)

        norms, all_norms = {}, []

        for name, p in self.named_parameters():

            if p.grad is None:

                continue

            param_norm = float(p.grad.data.norm(norm_type))

            norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3)

            all_norms.append(param_norm)

        total_norm = float(torch.tensor(all_norms).norm(norm_type))

        norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3)

        return norms
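
An illustrative use from inside a LightningModule hook, collecting 2-norms after the backward pass (a sketch, not part of this module)::

    def on_after_backward(self):
        norms = self.grad_norm(2)
        # e.g. {'grad_2.0_norm_model.fc.weight': 0.123, ..., 'grad_2.0_norm_total': 1.234}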

half

def half(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to half datatype.

Returns:

Type Description
Module self
View Source
    def half(self) -> Module:

        """Casts all floating point parameters and buffers to ``half`` datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.half

        return super().half()

init_ddp_connection

def init_ddp_connection(
    self,
    global_rank: int,
    world_size: int,
    is_slurm_managing_tasks: bool = True
) -> None

Override to define your custom way of setting up a distributed environment.

Lightning's implementation uses env:// init by default and sets the first node as root for SLURM managed cluster.

Parameters:

Name Type Description Default
global_rank None The global process idx. None
world_size None Number of GPUs being use across all nodes. (num_nodes * num_gpus). None
is_slurm_managing_tasks None is cluster managed by SLURM. None
View Source
    def init_ddp_connection(

            self,

            global_rank: int,

            world_size: int,

            is_slurm_managing_tasks: bool = True

    ) -> None:

        """

        Override to define your custom way of setting up a distributed environment.

        Lightning's implementation uses env:// init by default and sets the first node as root

        for SLURM managed cluster.

        Args:

            global_rank: The global process idx.

            world_size: Number of GPUs being use across all nodes. (num_nodes * num_gpus).

            is_slurm_managing_tasks: is cluster managed by SLURM.

        """

        if is_slurm_managing_tasks:

            self._init_slurm_connection()

        if 'MASTER_ADDR' not in os.environ:

            rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")

            os.environ['MASTER_ADDR'] = '127.0.0.1'

        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")

        if 'MASTER_PORT' not in os.environ:

            rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910")

            os.environ['MASTER_PORT'] = '12910'

        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

        if 'WORLD_SIZE' in os.environ and int(os.environ['WORLD_SIZE']) != world_size:

            rank_zero_warn(f"WORLD_SIZE environment variable ({os.environ['WORLD_SIZE']}) "

                           f"is not equal to the computed world size ({world_size}). Ignored.")

        torch_backend = "nccl" if self.trainer.on_gpu else "gloo"

        log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}")

        torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)

load_state_dict

def load_state_dict(
    self,
    state_dict: Dict[str, torch.Tensor],
    strict: bool = True
)

Copies parameters and buffers from :attr:state_dict into

this module and its descendants. If :attr:strict is True, then the keys of :attr:state_dict must exactly match the keys returned by this module's :meth:~torch.nn.Module.state_dict function.

Parameters:

Name Type Description Default
state_dict dict a dict containing parameters and
persistent buffers. None
strict bool whether to strictly enforce that the keys
in :attr:state_dict match the keys returned by this module's
:meth:~torch.nn.Module.state_dict function. Default: True None

Returns:

Type Description
None NamedTuple with missing_keys and unexpected_keys fields:
* missing_keys is a list of str containing the missing keys
* unexpected_keys is a list of str containing the unexpected keys
View Source
    def load_state_dict(self, state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]],

                        strict: bool = True):

        r"""Copies parameters and buffers from :attr:`state_dict` into

        this module and its descendants. If :attr:`strict` is ``True``, then

        the keys of :attr:`state_dict` must exactly match the keys returned

        by this module's :meth:`~torch.nn.Module.state_dict` function.

        Arguments:

            state_dict (dict): a dict containing parameters and

                persistent buffers.

            strict (bool, optional): whether to strictly enforce that the keys

                in :attr:`state_dict` match the keys returned by this module's

                :meth:`~torch.nn.Module.state_dict` function. Default: ``True``

        Returns:

            ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:

                * **missing_keys** is a list of str containing the missing keys

                * **unexpected_keys** is a list of str containing the unexpected keys

        """

        missing_keys = []

        unexpected_keys = []

        error_msgs = []

        # copy state_dict so _load_from_state_dict can modify it

        metadata = getattr(state_dict, '_metadata', None)

        state_dict = state_dict.copy()

        if metadata is not None:

            state_dict._metadata = metadata

        def load(module, prefix=''):

            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})

            module._load_from_state_dict(

                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)

            for name, child in module._modules.items():

                if child is not None:

                    load(child, prefix + name + '.')

        load(self)

        load = None  # break load->load reference cycle

        if strict:

            if len(unexpected_keys) > 0:

                error_msgs.insert(

                    0, 'Unexpected key(s) in state_dict: {}. '.format(

                        ', '.join('"{}"'.format(k) for k in unexpected_keys)))

            if len(missing_keys) > 0:

                error_msgs.insert(

                    0, 'Missing key(s) in state_dict: {}. '.format(

                        ', '.join('"{}"'.format(k) for k in missing_keys)))

        if len(error_msgs) > 0:

            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(

                               self.__class__.__name__, "\n\t".join(error_msgs)))

        return _IncompatibleKeys(missing_keys, unexpected_keys)
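
A short sketch of restoring weights previously saved with state_dict(); with strict=False the returned named tuple reports missing and unexpected keys instead of raising (the file name is a placeholder)::

    import torch

    state = torch.load("weights.pt", map_location="cpu")
    missing, unexpected = model.load_state_dict(state, strict=False)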

modules

def modules(
    self
) -> Iterator[ForwardRef('Module')]

Returns an iterator over all modules in the network.

Yields:

Type Description
Module a module in the network
Note:
Duplicate modules are returned only once. In the following
example, l will be returned only once.

Example::

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.modules()):
        print(idx, '->', m)

0 -> Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
1 -> Linear(in_features=2, out_features=2, bias=True)
View Source
    def modules(self) -> Iterator['Module']:

        r"""Returns an iterator over all modules in the network.

        Yields:

            Module: a module in the network

        Note:

            Duplicate modules are returned only once. In the following

            example, ``l`` will be returned only once.

        Example::

            >>> l = nn.Linear(2, 2)

            >>> net = nn.Sequential(l, l)

            >>> for idx, m in enumerate(net.modules()):

                    print(idx, '->', m)

            0 -> Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

            1 -> Linear(in_features=2, out_features=2, bias=True)

        """

        for name, module in self.named_modules():

            yield module

named_buffers

def named_buffers(
    self,
    prefix: str = '',
    recurse: bool = True
) -> Iterator[Tuple[str, torch.Tensor]]

Returns an iterator over module buffers, yielding both the

name of the buffer as well as the buffer itself.

Parameters:

Name Type Description Default
prefix str prefix to prepend to all buffer names. None
recurse bool if True, then yields buffers of this module
and all submodules. Otherwise, yields only buffers that
are direct members of this module. None

Yields:

Type Description
None (string, torch.Tensor): Tuple containing the name and buffer

Example::

>>> for name, buf in self.named_buffers():
>>>    if name in ['running_var']:
>>>        print(buf.size())
View Source
    def named_buffers(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Tensor]]:

        r"""Returns an iterator over module buffers, yielding both the

        name of the buffer as well as the buffer itself.

        Args:

            prefix (str): prefix to prepend to all buffer names.

            recurse (bool): if True, then yields buffers of this module

                and all submodules. Otherwise, yields only buffers that

                are direct members of this module.

        Yields:

            (string, torch.Tensor): Tuple containing the name and buffer

        Example::

            >>> for name, buf in self.named_buffers():

            >>>    if name in ['running_var']:

            >>>        print(buf.size())

        """

        gen = self._named_members(

            lambda module: module._buffers.items(),

            prefix=prefix, recurse=recurse)

        for elem in gen:

            yield elem

named_children

def named_children(
    self
) -> Iterator[Tuple[str, ForwardRef('Module')]]

Returns an iterator over immediate children modules, yielding both

the name of the module as well as the module itself.

Yields:

Type Description
None (string, Module): Tuple containing a name and child module

Example::

>>> for name, module in model.named_children():
>>>     if name in ['conv4', 'conv5']:
>>>         print(module)
View Source
    def named_children(self) -> Iterator[Tuple[str, 'Module']]:

        r"""Returns an iterator over immediate children modules, yielding both

        the name of the module as well as the module itself.

        Yields:

            (string, Module): Tuple containing a name and child module

        Example::

            >>> for name, module in model.named_children():

            >>>     if name in ['conv4', 'conv5']:

            >>>         print(module)

        """

        memo = set()

        for name, module in self._modules.items():

            if module is not None and module not in memo:

                memo.add(module)

                yield name, module

named_modules

def named_modules(
    self,
    memo: Union[Set[ForwardRef('Module')], NoneType] = None,
    prefix: str = ''
)

Returns an iterator over all modules in the network, yielding

both the name of the module as well as the module itself.

Yields:

Type Description
None (string, Module): Tuple of name and module

Note: Duplicate modules are returned only once. In the following example, l will be returned only once.

Example::

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.named_modules()):
        print(idx, '->', m)

0 -> ('', Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
))
1 -> ('0', Linear(in_features=2, out_features=2, bias=True))
View Source
    def named_modules(self, memo: Optional[Set['Module']] = None, prefix: str = ''):

        r"""Returns an iterator over all modules in the network, yielding

        both the name of the module as well as the module itself.

        Yields:

            (string, Module): Tuple of name and module

        Note:

            Duplicate modules are returned only once. In the following

            example, ``l`` will be returned only once.

        Example::

            >>> l = nn.Linear(2, 2)

            >>> net = nn.Sequential(l, l)

            >>> for idx, m in enumerate(net.named_modules()):

                    print(idx, '->', m)

            0 -> ('', Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            ))

            1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

        """

        if memo is None:

            memo = set()

        if self not in memo:

            memo.add(self)

            yield prefix, self

            for name, module in self._modules.items():

                if module is None:

                    continue

                submodule_prefix = prefix + ('.' if prefix else '') + name

                for m in module.named_modules(memo, submodule_prefix):

                    yield m

named_parameters

def named_parameters(
    self,
    prefix: str = '',
    recurse: bool = True
) -> Iterator[Tuple[str, torch.Tensor]]

Returns an iterator over module parameters, yielding both the

name of the parameter as well as the parameter itself.

Parameters:

prefix (str): prefix to prepend to all parameter names. Default: ''.

recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. Default: True.

Yields:

(string, Parameter): Tuple containing the name and parameter

Example::

>>> for name, param in self.named_parameters():
>>>    if name in ['bias']:
>>>        print(param.size())
View Source
    def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Tensor]]:

        r"""Returns an iterator over module parameters, yielding both the

        name of the parameter as well as the parameter itself.

        Args:

            prefix (str): prefix to prepend to all parameter names.

            recurse (bool): if True, then yields parameters of this module

                and all submodules. Otherwise, yields only parameters that

                are direct members of this module.

        Yields:

            (string, Parameter): Tuple containing the name and parameter

        Example::

            >>> for name, param in self.named_parameters():

            >>>    if name in ['bias']:

            >>>        print(param.size())

        """

        gen = self._named_members(

            lambda module: module._parameters.items(),

            prefix=prefix, recurse=recurse)

        for elem in gen:

            yield elem

on_after_backward

def on_after_backward(
    self
) -> None

Called in the training loop after loss.backward() and before optimizers do anything.

This is the ideal place to inspect or log gradient information.

Example::

def on_after_backward(self):
    # example to inspect gradient information in tensorboard
    if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
        params = self.state_dict()
        for k, v in params.items():
            grads = v
            name = k
            self.logger.experiment.add_histogram(tag=name, values=grads,
                                                 global_step=self.trainer.global_step)
View Source
    def on_after_backward(self) -> None:

        """

        Called in the training loop after loss.backward() and before optimizers do anything.

        This is the ideal place to inspect or log gradient information.

        Example::

            def on_after_backward(self):

                # example to inspect gradient information in tensorboard

                if self.trainer.global_step % 25 == 0:  # don't make the tf file huge

                    params = self.state_dict()

                    for k, v in params.items():

                        grads = v

                        name = k

                        self.logger.experiment.add_histogram(tag=name, values=grads,

                                                             global_step=self.trainer.global_step)

        """

on_batch_end

def on_batch_end(
    self
) -> None

Called in the training loop after the batch.

View Source
    def on_batch_end(self) -> None:

        """

        Called in the training loop after the batch.

        """

        # do something when the batch ends

on_batch_start

def on_batch_start(
    self,
    batch: Any
) -> None

Called in the training loop before anything happens for that batch.

If you return -1 here, you will skip training for the rest of the current epoch.

Parameters:

batch: The batched data as it is returned by the training DataLoader.
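
For illustration only (not part of the original docstring), a minimal sketch that skips the rest of an epoch based on a hypothetical flag:

def on_batch_start(self, batch) -> None:
    # `stop_this_epoch` is a hypothetical flag set elsewhere (e.g. by a callback)
    if getattr(self, "stop_this_epoch", False):
        return -1  # returning -1 skips training for the rest of the current epoch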
View Source
    def on_batch_start(self, batch: Any) -> None:

        """

        Called in the training loop before anything happens for that batch.

        If you return -1 here, you will skip training for the rest of the current epoch.

        Args:

            batch: The batched data as it is returned by the training DataLoader.

        """

        # do something when the batch starts

on_before_zero_grad

def on_before_zero_grad(
    self,
    optimizer: torch.optim.optimizer.Optimizer
) -> None

Called after optimizer.step() and before optimizer.zero_grad().

Called in the training loop after taking an optimizer step and before zeroing grads. Good place to inspect weight information with weights updated.

This is where it is called::

for optimizer in optimizers:
    optimizer.step()
    model.on_before_zero_grad(optimizer) # < ---- called here
    optimizer.zero_grad()

Parameters:

optimizer: The optimizer for which grads should be zeroed.
View Source
    def on_before_zero_grad(self, optimizer: Optimizer) -> None:

        """

        Called after optimizer.step() and before optimizer.zero_grad().

        Called in the training loop after taking an optimizer step and before zeroing grads.

        Good place to inspect weight information with weights updated.

        This is where it is called::

            for optimizer in optimizers:

                optimizer.step()

                model.on_before_zero_grad(optimizer) # < ---- called here

                optimizer.zero_grad

        Args:

            optimizer: The optimizer for which grads should be zeroed.

        """

        # do something with the optimizer or inspect it.

on_epoch_end

def on_epoch_end(
    self
) -> None

Called in the training loop at the very end of the epoch.

View Source
    def on_epoch_end(self) -> None:

        """

        Called in the training loop at the very end of the epoch.

        """

        # do something when the epoch ends

on_epoch_start

def on_epoch_start(
    self
) -> None

Called in the training loop at the very beginning of the epoch.

View Source
    def on_epoch_start(self) -> None:

        """

        Called in the training loop at the very beginning of the epoch.

        """

        # do something when the epoch starts

on_fit_end

def on_fit_end(
    self
)

Called at the very end of fit.

If on DDP it is called on every process

View Source
    def on_fit_end(self):

        """

        Called at the very end of fit.

        If on DDP it is called on every process

        """

on_fit_start

def on_fit_start(
    self
)

Called at the very beginning of fit.

If on DDP it is called on every process

View Source
    def on_fit_start(self):

        """

        Called at the very beginning of fit.

        If on DDP it is called on every process

        """

on_hpc_load

def on_hpc_load(
    self,
    checkpoint: Dict[str, Any]
) -> None

Hook to do whatever you need right before Slurm manager loads the model.

Parameters:

checkpoint: A dictionary with variables from the checkpoint.
View Source
    def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None:

        """

        Hook to do whatever you need right before Slurm manager loads the model.

        Args:

            checkpoint: A dictionary with variables from the checkpoint.

        """

on_hpc_save

def on_hpc_save(
    self,
    checkpoint: Dict[str, Any]
) -> None

Hook to do whatever you need right before Slurm manager saves the model.

Parameters:

checkpoint: A dictionary in which you can save variables to save in a checkpoint. Contents need to be pickleable.
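
As an illustrative sketch (not from the original docstring), extra picklable state can be stashed before Slurm saves and restored on reload; `my_extra_state` is a hypothetical attribute:

def on_hpc_save(self, checkpoint):
    # stash an attribute so it survives the Slurm requeue
    checkpoint["my_extra_state"] = self.my_extra_state

def on_hpc_load(self, checkpoint):
    # restore the attribute when the model is reloaded
    self.my_extra_state = checkpoint["my_extra_state"]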
View Source
    def on_hpc_save(self, checkpoint: Dict[str, Any]) -> None:

        """

        Hook to do whatever you need right before Slurm manager saves the model.

        Args:

            checkpoint: A dictionary in which you can save variables to save in a checkpoint.

                Contents need to be pickleable.

        """

on_load_checkpoint

def on_load_checkpoint(
    self,
    checkpoint: Dict[str, Any]
) -> None

Called by Lightning to restore your model.

If you saved something with :meth:on_save_checkpoint this is your chance to restore this.

Parameters:

checkpoint: Loaded checkpoint
View Source
    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:

        r"""

        Called by Lightning to restore your model.

        If you saved something with :meth:`on_save_checkpoint` this is your chance to restore this.

        Args:

            checkpoint: Loaded checkpoint

        Example:

            .. code-block:: python

                def on_load_checkpoint(self, checkpoint):

                    # 99% of the time you don't need to implement this method

                    self.something_cool_i_want_to_save = checkpoint['something_cool_i_want_to_save']

        Note:

            Lightning auto-restores global step, epoch, and train state including amp scaling.

            There is no need for you to restore anything regarding training.

        """

on_post_performance_check

def on_post_performance_check(
    self
) -> None

Called at the very end of the validation loop.

View Source
    def on_post_performance_check(self) -> None:

        """

        Called at the very end of the validation loop.

        """

        # do something before validation end

on_pre_performance_check

def on_pre_performance_check(
    self
) -> None

Called at the very beginning of the validation loop.

View Source
    def on_pre_performance_check(self) -> None:

        """

        Called at the very beginning of the validation loop.

        """

        # do something before validation starts

on_sanity_check_start

def on_sanity_check_start(
    self
)

Called before starting evaluation.

Warning: Deprecated. Will be removed in v0.9.0.

View Source
    def on_sanity_check_start(self):

        """

        Called before starting evaluation.

        Warning:

            Deprecated. Will be removed in v0.9.0.

        """

on_save_checkpoint

def on_save_checkpoint(
    self,
    checkpoint: Dict[str, Any]
) -> None

Called by Lightning when saving a checkpoint to give you a chance to store anything

else you might want to save.

Parameters:

checkpoint: Checkpoint to be saved
View Source
    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:

        r"""

        Called by Lightning when saving a checkpoint to give you a chance to store anything

        else you might want to save.

        Args:

            checkpoint: Checkpoint to be saved

        Example:

            .. code-block:: python

                def on_save_checkpoint(self, checkpoint):

                    # 99% of use cases you don't need to implement this method

                    checkpoint['something_cool_i_want_to_save'] = my_cool_pickable_object

        Note:

            Lightning saves all aspects of training (epoch, global step, etc...)

            including amp scaling.

            There is no need for you to store anything about training.

        """

on_train_end

def on_train_end(
    self
) -> None

Called at the end of training before logger experiment is closed.

View Source
    def on_train_end(self) -> None:

        """

        Called at the end of training before logger experiment is closed.

        """

        # do something at the end of training

on_train_start

def on_train_start(
    self
) -> None

Called at the beginning of training before sanity check.

View Source
    def on_train_start(self) -> None:

        """

        Called at the beginning of training before sanity check.

        """

        # do something at the start of training

optimizer_step

def optimizer_step(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int,
    second_order_closure: Union[Callable, NoneType] = None,
    on_tpu: bool = False,
    using_native_amp: bool = False,
    using_lbfgs: bool = False
) -> None

Override this method to adjust the default way the

:class:~pytorch_lightning.trainer.trainer.Trainer calls each optimizer. By default, Lightning calls step() and zero_grad() as shown in the example once per optimizer.

Parameters:

epoch: Current epoch
batch_idx: Index of current batch
optimizer: A PyTorch optimizer
optimizer_idx: If you used multiple optimizers this indexes into that list.
second_order_closure: closure for second order methods
on_tpu: true if TPU backward is required
using_native_amp: True if using native amp
using_lbfgs: True if the matching optimizer is lbfgs
View Source
    def optimizer_step(

            self,

            epoch: int,

            batch_idx: int,

            optimizer: Optimizer,

            optimizer_idx: int,

            second_order_closure: Optional[Callable] = None,

            on_tpu: bool = False,

            using_native_amp: bool = False,

            using_lbfgs: bool = False,

    ) -> None:

        r"""

        Override this method to adjust the default way the

        :class:`~pytorch_lightning.trainer.trainer.Trainer` calls each optimizer.

        By default, Lightning calls ``step()`` and ``zero_grad()`` as shown in the example

        once per optimizer.

        Args:

            epoch: Current epoch

            batch_idx: Index of current batch

            optimizer: A PyTorch optimizer

            optimizer_idx: If you used multiple optimizers this indexes into that list.

            second_order_closure: closure for second order methods

            on_tpu: true if TPU backward is required

            using_native_amp: True if using native amp

            using_lbfgs: True if the matching optimizer is lbfgs

        Examples:

            .. code-block:: python

                # DEFAULT

                def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx,

                                   second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    optimizer.step()

                # Alternating schedule for optimizer steps (i.e.: GANs)

                def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx,

                                   second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    # update generator opt every 2 steps

                    if optimizer_idx == 0:

                        if batch_idx % 2 == 0 :

                            optimizer.step()

                            optimizer.zero_grad()

                    # update discriminator opt every 4 steps

                    if optimizer_idx == 1:

                        if batch_idx % 4 == 0 :

                            optimizer.step()

                            optimizer.zero_grad()

                    # ...

                    # add as many optimizers as you want

            Here's another example showing how to use this for more advanced things such as

            learning rate warm-up:

            .. code-block:: python

                # learning rate warm-up

                def optimizer_step(self, current_epoch, batch_idx, optimizer,

                                    optimizer_idx, second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    # warm up lr

                    if self.trainer.global_step < 500:

                        lr_scale = min(1., float(self.trainer.global_step + 1) / 500.)

                        for pg in optimizer.param_groups:

                            pg['lr'] = lr_scale * self.learning_rate

                    # update params

                    optimizer.step()

                    optimizer.zero_grad()

        Note:

            If you also override the :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_before_zero_grad`

            model hook don't forget to add the call to it before ``optimizer.zero_grad()`` yourself.

        """

        if on_tpu:

            xm.optimizer_step(optimizer)

        elif using_native_amp:

            self.trainer.scaler.step(optimizer)

        elif using_lbfgs:

            optimizer.step(second_order_closure)

        else:

            optimizer.step()

optimizer_zero_grad

def optimizer_zero_grad(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int
)
View Source
    def optimizer_zero_grad(self,

                            epoch: int,

                            batch_idx: int,

                            optimizer: Optimizer,

                            optimizer_idx: int):

        optimizer.zero_grad()

parameters

def parameters(
    self,
    recurse: bool = True
) -> Iterator[torch.nn.parameter.Parameter]

Returns an iterator over module parameters.

This is typically passed to an optimizer.

Parameters:

recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. Default: True.

Yields:

Parameter: module parameter

Example::

>>> for param in model.parameters():
>>>     print(type(param), param.size())
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L)
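
For example, the iterator is typically consumed when building an optimizer (illustrative hyperparameters):

import torch.optim as optim

# hand every trainable parameter of the model to the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)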
View Source
    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:

        r"""Returns an iterator over module parameters.

        This is typically passed to an optimizer.

        Args:

            recurse (bool): if True, then yields parameters of this module

                and all submodules. Otherwise, yields only parameters that

                are direct members of this module.

        Yields:

            Parameter: module parameter

        Example::

            >>> for param in model.parameters():

            >>>     print(type(param), param.size())

            <class 'torch.Tensor'> (20L,)

            <class 'torch.Tensor'> (20L, 1L, 5L, 5L)

        """

        for name, param in self.named_parameters(recurse=recurse):

            yield param

prepare_data

def prepare_data(
    self
) -> None

Use this to download and prepare data.

.. warning:: DO NOT set state to the model (use setup instead) since this is NOT called on every GPU in DDP/TPU

Example::

def prepare_data(self):
    # good
    download_data()
    tokenize()
    etc()

    # bad
    self.split = data_split
    self.some_state = some_other_state()

In DDP prepare_data can be called in two ways (using Trainer(prepare_data_per_node)):

  1. Once per node. This is the default and is only called on LOCAL_RANK=0.
  2. Once in total. Only called on GLOBAL_RANK=0.

Example::

# DEFAULT
# called once per node on LOCAL_RANK=0 of that node
Trainer(prepare_data_per_node=True)

# call on GLOBAL_RANK=0 (great for shared file systems)
Trainer(prepare_data_per_node=False)

This is called before requesting the dataloaders:

.. code-block:: python

model.prepare_data()
    if ddp/tpu: init()
model.setup(stage)
model.train_dataloader()
model.val_dataloader()
model.test_dataloader()
View Source
    def prepare_data(self) -> None:

        """

        Use this to download and prepare data.

        .. warning:: DO NOT set state to the model (use `setup` instead)

            since this is NOT called on every GPU in DDP/TPU

        Example::

            def prepare_data(self):

                # good

                download_data()

                tokenize()

                etc()

                # bad

                self.split = data_split

                self.some_state = some_other_state()

        In DDP prepare_data can be called in two ways (using Trainer(prepare_data_per_node)):

        1. Once per node. This is the default and is only called on LOCAL_RANK=0.

        2. Once in total. Only called on GLOBAL_RANK=0.

        Example::

            # DEFAULT

            # called once per node on LOCAL_RANK=0 of that node

            Trainer(prepare_data_per_node=True)

            # call on GLOBAL_RANK=0 (great for shared file systems)

            Trainer(prepare_data_per_node=False)

        This is called before requesting the dataloaders:

        .. code-block:: python

            model.prepare_data()

                if ddp/tpu: init()

            model.setup(stage)

            model.train_dataloader()

            model.val_dataloader()

            model.test_dataloader()

        """

print

def print(
    self,
    *args,
    **kwargs
) -> None

Prints only from process 0. Use this in any distributed mode to log only once.

Parameters:

*args: The thing to print. Will be passed to Python's built-in print function.
**kwargs: Will be passed to Python's built-in print function.
View Source
    def print(self, *args, **kwargs) -> None:

        r"""

        Prints only from process 0. Use this in any distributed mode to log only once.

        Args:

            *args: The thing to print. Will be passed to Python's built-in print function.

            **kwargs: Will be passed to Python's built-in print function.

        Example:

            .. code-block:: python

                def forward(self, x):

                    self.print(x, 'in forward')

        """

        if self.trainer.is_global_zero:

            print(*args, **kwargs)

register_backward_hook

def register_backward_hook(
    self,
    hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, torch.Tensor]]
) -> torch.utils.hooks.RemovableHandle

Registers a backward hook on the module.

.. warning ::

The current implementation will not have the presented behavior
for complex :class:`Module` that perform many operations.
In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only
contain the gradients for a subset of the inputs and outputs.
For such :class:`Module`, you should use :func:`torch.Tensor.register_hook`
directly on a specific input or output to get the required gradients.

The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature::

hook(module, grad_input, grad_output) -> Tensor or None

The :attr:grad_input and :attr:grad_output may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:grad_input in subsequent computations. :attr:grad_input will only correspond to the inputs given as positional arguments.

Returns:

:class:torch.utils.hooks.RemovableHandle: a handle that can be used to remove the added hook by calling handle.remove()
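
A minimal illustrative sketch (not from the original docs): log gradient norms flowing out of a simple leaf module, then detach the hook via the returned handle; `model.fc` is a hypothetical submodule:

def grad_norm_hook(module, grad_input, grad_output):
    # grad_output is a tuple of gradients w.r.t. the module's outputs
    for g in grad_output:
        if g is not None:
            print(module.__class__.__name__, "grad norm:", g.norm().item())

# register on a simple leaf module (see the warning above)
handle = model.fc.register_backward_hook(grad_norm_hook)
# ... forward + backward passes ...
handle.remove()  # remove the hook once it is no longer needed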
View Source
    def register_backward_hook(

        self, hook: Callable[['Module', _grad_t, _grad_t], Union[None, Tensor]]

    ) -> RemovableHandle:

        r"""Registers a backward hook on the module.

        .. warning ::

            The current implementation will not have the presented behavior

            for complex :class:`Module` that perform many operations.

            In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only

            contain the gradients for a subset of the inputs and outputs.

            For such :class:`Module`, you should use :func:`torch.Tensor.register_hook`

            directly on a specific input or output to get the required gradients.

        The hook will be called every time the gradients with respect to module

        inputs are computed. The hook should have the following signature::

            hook(module, grad_input, grad_output) -> Tensor or None

        The :attr:`grad_input` and :attr:`grad_output` may be tuples if the

        module has multiple inputs or outputs. The hook should not modify its

        arguments, but it can optionally return a new gradient with respect to

        input that will be used in place of :attr:`grad_input` in subsequent

        computations. :attr:`grad_input` will only correspond to the inputs given

        as positional arguments.

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._backward_hooks)

        self._backward_hooks[handle.id] = hook

        return handle

register_buffer

def register_buffer(
    self,
    name: str,
    tensor: torch.Tensor,
    persistent: bool = True
) -> None

Adds a buffer to the module.

This is typically used to register a buffer that should not be considered a model parameter. For example, BatchNorm's running_mean is not a parameter, but is part of the module's state. Buffers, by default, are persistent and will be saved alongside parameters. This behavior can be changed by setting :attr:persistent to False. The only difference between a persistent buffer and a non-persistent buffer is that the latter will not be a part of this module's :attr:state_dict.

Buffers can be accessed as attributes using given names.

Parameters:

name (string): name of the buffer. The buffer can be accessed from this module using the given name
tensor (Tensor): buffer to be registered.
persistent (bool): whether the buffer is part of this module's :attr:state_dict.

Example::

>>> self.register_buffer('running_mean', torch.zeros(num_features))
View Source
    def register_buffer(self, name: str, tensor: Tensor, persistent: bool = True) -> None:

        r"""Adds a buffer to the module.

        This is typically used to register a buffer that should not to be

        considered a model parameter. For example, BatchNorm's ``running_mean``

        is not a parameter, but is part of the module's state. Buffers, by

        default, are persistent and will be saved alongside parameters. This

        behavior can be changed by setting :attr:`persistent` to ``False``. The

        only difference between a persistent buffer and a non-persistent buffer

        is that the latter will not be a part of this module's

        :attr:`state_dict`.

        Buffers can be accessed as attributes using given names.

        Args:

            name (string): name of the buffer. The buffer can be accessed

                from this module using the given name

            tensor (Tensor): buffer to be registered.

            persistent (bool): whether the buffer is part of this module's

                :attr:`state_dict`.

        Example::

            >>> self.register_buffer('running_mean', torch.zeros(num_features))

        """

        if persistent is False and isinstance(self, torch.jit.ScriptModule):

            raise RuntimeError("ScriptModule does not support non-persistent buffers")

        if '_buffers' not in self.__dict__:

            raise AttributeError(

                "cannot assign buffer before Module.__init__() call")

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("buffer name should be a string. "

                            "Got {}".format(torch.typename(name)))

        elif '.' in name:

            raise KeyError("buffer name can't contain \".\"")

        elif name == '':

            raise KeyError("buffer name can't be empty string \"\"")

        elif hasattr(self, name) and name not in self._buffers:

            raise KeyError("attribute '{}' already exists".format(name))

        elif tensor is not None and not isinstance(tensor, torch.Tensor):

            raise TypeError("cannot assign '{}' object to buffer '{}' "

                            "(torch Tensor or None required)"

                            .format(torch.typename(tensor), name))

        else:

            self._buffers[name] = tensor

            if persistent:

                self._non_persistent_buffers_set.discard(name)

            else:

                self._non_persistent_buffers_set.add(name)

register_forward_hook

def register_forward_hook(
    self,
    hook: Callable[..., NoneType]
) -> torch.utils.hooks.RemovableHandle

Registers a forward hook on the module.

The hook will be called every time after :func:forward has computed an output. It should have the following signature::

hook(module, input, output) -> None or modified output

The input contains only the positional arguments given to the module. Keyword arguments won't be passed to the hooks and only to the forward. The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:forward is called.

Returns:

:class:torch.utils.hooks.RemovableHandle: a handle that can be used to remove the added hook by calling handle.remove()
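
A small illustrative sketch (not part of the original docstring) that captures an intermediate activation:

activations = {}

def save_activation(module, input, output):
    # keep a detached copy of the module's output for later inspection
    activations["feature"] = output.detach()

handle = model.register_forward_hook(save_activation)
# ... run a forward pass; activations["feature"] now holds the output ...
handle.remove()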

View Source
    def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:

        r"""Registers a forward hook on the module.

        The hook will be called every time after :func:`forward` has computed an output.

        It should have the following signature::

            hook(module, input, output) -> None or modified output

        The input contains only the positional arguments given to the module.

        Keyword arguments won't be passed to the hooks and only to the ``forward``.

        The hook can modify the output. It can modify the input inplace but

        it will not have effect on forward since this is called after

        :func:`forward` is called.

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._forward_hooks)

        self._forward_hooks[handle.id] = hook

        return handle

register_forward_pre_hook

def register_forward_pre_hook(
    self,
    hook: Callable[..., NoneType]
) -> torch.utils.hooks.RemovableHandle

Registers a forward pre-hook on the module.

The hook will be called every time before :func:forward is invoked. It should have the following signature::

hook(module, input) -> None or modified input

The input contains only the positional arguments given to the module. Keyword arguments won't be passed to the hooks and only to the forward. The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned (unless that value is already a tuple).

Returns:

:class:torch.utils.hooks.RemovableHandle: a handle that can be used to remove the added hook by calling handle.remove()
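
An illustrative sketch (not from the original docs) that rewrites the positional inputs before forward runs:

import torch

def cast_to_float(module, inputs):
    # `inputs` is the tuple of positional arguments passed to forward()
    return tuple(x.float() if torch.is_tensor(x) else x for x in inputs)

handle = model.register_forward_pre_hook(cast_to_float)
# ... forward passes now receive float32 tensors ...
handle.remove()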
View Source
    def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle:

        r"""Registers a forward pre-hook on the module.

        The hook will be called every time before :func:`forward` is invoked.

        It should have the following signature::

            hook(module, input) -> None or modified input

        The input contains only the positional arguments given to the module.

        Keyword arguments won't be passed to the hooks and only to the ``forward``.

        The hook can modify the input. User can either return a tuple or a

        single modified value in the hook. We will wrap the value into a tuple

        if a single value is returned(unless that value is already a tuple).

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._forward_pre_hooks)

        self._forward_pre_hooks[handle.id] = hook

        return handle

register_parameter

def register_parameter(
    self,
    name: str,
    param: torch.nn.parameter.Parameter
) -> None

Adds a parameter to the module.

The parameter can be accessed as an attribute using given name.

Parameters:

name (string): name of the parameter. The parameter can be accessed from this module using the given name
param (Parameter): parameter to be added to the module.
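
For illustration, a small hypothetical module that registers an extra learnable scalar so it appears in parameters() and state_dict():

import torch
import torch.nn as nn

class ScaledLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        # register a learnable scale; equivalent to `self.scale = nn.Parameter(torch.ones(1))`
        self.register_parameter("scale", nn.Parameter(torch.ones(1)))

    def forward(self, x):
        return self.scale * self.linear(x)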
View Source
    def register_parameter(self, name: str, param: Parameter) -> None:

        r"""Adds a parameter to the module.

        The parameter can be accessed as an attribute using given name.

        Args:

            name (string): name of the parameter. The parameter can be accessed

                from this module using the given name

            param (Parameter): parameter to be added to the module.

        """

        if '_parameters' not in self.__dict__:

            raise AttributeError(

                "cannot assign parameter before Module.__init__() call")

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("parameter name should be a string. "

                            "Got {}".format(torch.typename(name)))

        elif '.' in name:

            raise KeyError("parameter name can't contain \".\"")

        elif name == '':

            raise KeyError("parameter name can't be empty string \"\"")

        elif hasattr(self, name) and name not in self._parameters:

            raise KeyError("attribute '{}' already exists".format(name))

        if param is None:

            self._parameters[name] = None

        elif not isinstance(param, Parameter):

            raise TypeError("cannot assign '{}' object to parameter '{}' "

                            "(torch.nn.Parameter or None required)"

                            .format(torch.typename(param), name))

        elif param.grad_fn:

            raise ValueError(

                "Cannot assign non-leaf Tensor to parameter '{0}'. Model "

                "parameters must be created explicitly. To express '{0}' "

                "as a function of another Tensor, compute the value in "

                "the forward() method.".format(name))

        else:

            self._parameters[name] = param

requires_grad_

def requires_grad_(
    self: ~T,
    requires_grad: bool = True
) -> ~T

Change if autograd should record operations on parameters in this

module.

This method sets the parameters' :attr:requires_grad attributes in-place.

This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training).

Parameters:

requires_grad (bool): whether autograd should record operations on parameters in this module. Default: True.

Returns:

Module: self
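
For instance, freezing a submodule for fine-tuning can be written with this method instead of looping over its parameters (a sketch; `model.backbone` is an assumed attribute):

# freeze the backbone, leaving the rest of the model trainable
model.backbone.requires_grad_(False)

# later, unfreeze it again
model.backbone.requires_grad_(True)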
View Source
    def requires_grad_(self: T, requires_grad: bool = True) -> T:

        r"""Change if autograd should record operations on parameters in this

        module.

        This method sets the parameters' :attr:`requires_grad` attributes

        in-place.

        This method is helpful for freezing part of the module for finetuning

        or training parts of a model individually (e.g., GAN training).

        Args:

            requires_grad (bool): whether autograd should record operations on

                                  parameters in this module. Default: ``True``.

        Returns:

            Module: self

        """

        for p in self.parameters():

            p.requires_grad_(requires_grad)

        return self

save_hyperparameters

def save_hyperparameters(
    self,
    *args,
    frame=None
) -> None

Save all model arguments.

Parameters:

args: single object of dict, NameSpace or OmegaConf, or string names or arguments from class __init__

The doctest examples in the source below show the three usage patterns: saving selected arguments by name, saving all __init__ arguments automatically, and saving a single Namespace/dict argument.

View Source
    def save_hyperparameters(self, *args, frame=None) -> None:

        """Save all model arguments.

        Args:

            args: single object of `dict`, `NameSpace` or `OmegaConf`

             or string names or argumenst from class `__init__`

        >>> from collections import OrderedDict

        >>> class ManuallyArgsModel(LightningModule):

        ...     def __init__(self, arg1, arg2, arg3):

        ...         super().__init__()

        ...         # manually assine arguments

        ...         self.save_hyperparameters('arg1', 'arg3')

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = ManuallyArgsModel(1, 'abc', 3.14)

        >>> model.hparams

        "arg1": 1

        "arg3": 3.14

        >>> class AutomaticArgsModel(LightningModule):

        ...     def __init__(self, arg1, arg2, arg3):

        ...         super().__init__()

        ...         # equivalent automatic

        ...         self.save_hyperparameters()

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = AutomaticArgsModel(1, 'abc', 3.14)

        >>> model.hparams

        "arg1": 1

        "arg2": abc

        "arg3": 3.14

        >>> class SingleArgModel(LightningModule):

        ...     def __init__(self, params):

        ...         super().__init__()

        ...         # manually assign single argument

        ...         self.save_hyperparameters(params)

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = SingleArgModel(Namespace(p1=1, p2='abc', p3=3.14))

        >>> model.hparams

        "p1": 1

        "p2": abc

        "p3": 3.14

        """

        if not frame:

            frame = inspect.currentframe().f_back

        init_args = get_init_args(frame)

        assert init_args, 'failed to inspect the self init'

        if not args:

            hp = init_args

            self._hparams_name = 'kwargs' if hp else None

        else:

            isx_non_str = [i for i, arg in enumerate(args) if not isinstance(arg, str)]

            if len(isx_non_str) == 1:

                hp = args[isx_non_str[0]]

                cand_names = [k for k, v in init_args.items() if v == hp]

                self._hparams_name = cand_names[0] if cand_names else None

            else:

                hp = {arg: init_args[arg] for arg in args if isinstance(arg, str)}

                self._hparams_name = 'kwargs'

        # `hparams` are expected here

        if hp:

            self._set_hparams(hp)

setup

def setup(
    self,
    stage: str
)

Called at the beginning of fit and test.

This is a good hook when you need to build models dynamically or adjust something about them. This hook is called on every process when using DDP.

Parameters:

stage: either 'fit' or 'test'

Example::

    class LitModel(...):
        def __init__(self):
            self.l1 = None

        def prepare_data(self):
            download_data()
            tokenize()

            # don't do this
            self.something = else

        def setup(stage):
            data = Load_data(...)
            self.l1 = nn.Linear(28, data.num_classes)

View Source
    def setup(self, stage: str):

        """

        Called at the beginning of fit and test.

        This is a good hook when you need to build models dynamically or adjust something about them.

        This hook is called on every process when using DDP.

        Args:

            stage: either 'fit' or 'test'

        Example::

            class LitModel(...):

                def __init__(self):

                    self.l1 = None

                def prepare_data(self):

                    download_data()

                    tokenize()

                    # don't do this

                    self.something = else

                def setup(stage):

                    data = Load_data(...)

                    self.l1 = nn.Linear(28, data.num_classes)

        """

share_memory

def share_memory(
    self: ~T
) -> ~T
View Source
    def share_memory(self: T) -> T:

        return self._apply(lambda t: t.share_memory_())

state_dict

def state_dict(
    self,
    destination=None,
    prefix='',
    keep_vars=False
)

Returns a dictionary containing a whole state of the module.

Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names.

Returns:

dict: a dictionary containing a whole state of the module

Example::

>>> module.state_dict().keys()
['bias', 'weight']
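
The returned dict is also what is typically serialized and restored (a minimal sketch; the file name is illustrative):

import torch

# save the module's parameters and persistent buffers
torch.save(module.state_dict(), "checkpoint.pt")

# later: load them back into a module with the same architecture
module.load_state_dict(torch.load("checkpoint.pt"))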
View Source
    def state_dict(self, destination=None, prefix='', keep_vars=False):

        r"""Returns a dictionary containing a whole state of the module.

        Both parameters and persistent buffers (e.g. running averages) are

        included. Keys are corresponding parameter and buffer names.

        Returns:

            dict:

                a dictionary containing a whole state of the module

        Example::

            >>> module.state_dict().keys()

            ['bias', 'weight']

        """

        if destination is None:

            destination = OrderedDict()

            destination._metadata = OrderedDict()

        destination._metadata[prefix[:-1]] = local_metadata = dict(version=self._version)

        self._save_to_state_dict(destination, prefix, keep_vars)

        for name, module in self._modules.items():

            if module is not None:

                module.state_dict(destination, prefix + name + '.', keep_vars=keep_vars)

        for hook in self._state_dict_hooks.values():

            hook_result = hook(self, destination, prefix, local_metadata)

            if hook_result is not None:

                destination = hook_result

        return destination

summarize

def summarize(
    self,
    mode: str = 'top'
) -> pytorch_lightning.core.memory.ModelSummary
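
No docstring is provided; as a usage sketch, the method builds a ModelSummary, logs it, and returns it. 'top' summarizes only the top-level children, and 'full' is assumed to recurse into submodules in this Lightning version:

# one-line-per-module summary of the top-level children
summary = model.summarize(mode="top")

# assumed alternative: recurse into all submodules
summary = model.summarize(mode="full")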
View Source
    def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary:

        model_summary = ModelSummary(self, mode=mode)

        log.info('\n' + str(model_summary))

        return model_summary

tbptt_split_batch

def tbptt_split_batch(
    self,
    batch: torch.Tensor,
    split_size: int
) -> list

When using truncated backpropagation through time, each batch must be split along the time dimension. Lightning handles this by default, but for custom behavior override this function.

Parameters:

batch: Current batch
split_size: The size of the split

Return:

List of batch splits. Each split will be passed to :meth:training_step to enable truncated back propagation through time. The default implementation splits root level Tensors and Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length.

Examples:

.. code-block:: python

    def tbptt_split_batch(self, batch, split_size):
      splits = []
      for t in range(0, time_dims[0], split_size):
          batch_split = []
          for i, x in enumerate(batch):
              if isinstance(x, torch.Tensor):
                  split_x = x[:, t:t + split_size]
              elif isinstance(x, collections.Sequence):
                  split_x = [None] * len(x)
                  for batch_idx in range(len(x)):
                      split_x[batch_idx] = x[batch_idx][t:t + split_size]

              batch_split.append(split_x)

          splits.append(batch_split)

      return splits

Note: Called in the training loop after :meth:~pytorch_lightning.callbacks.base.Callback.on_batch_start if :paramref:~pytorch_lightning.trainer.Trainer.truncated_bptt_steps > 0. Each returned batch split is passed separately to :meth:training_step.

View Source
    def tbptt_split_batch(self, batch: Tensor, split_size: int) -> list:

        r"""

        When using truncated backpropagation through time, each batch must be split along the

        time dimension. Lightning handles this by default, but for custom behavior override

        this function.

        Args:

            batch: Current batch

            split_size: The size of the split

        Return:

            List of batch splits. Each split will be passed to :meth:`training_step` to enable truncated

            back propagation through time. The default implementation splits root level Tensors and

            Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length.

        Examples:

            .. code-block:: python

                def tbptt_split_batch(self, batch, split_size):

                  splits = []

                  for t in range(0, time_dims[0], split_size):

                      batch_split = []

                      for i, x in enumerate(batch):

                          if isinstance(x, torch.Tensor):

                              split_x = x[:, t:t + split_size]

                          elif isinstance(x, collections.Sequence):

                              split_x = [None] * len(x)

                              for batch_idx in range(len(x)):

                                  split_x[batch_idx] = x[batch_idx][t:t + split_size]

                          batch_split.append(split_x)

                      splits.append(batch_split)

                  return splits

        Note:

            Called in the training loop after

            :meth:`~pytorch_lightning.callbacks.base.Callback.on_batch_start`

            if :paramref:`~pytorch_lightning.trainer.Trainer.truncated_bptt_steps` > 0.

            Each returned batch split is passed separately to :meth:`training_step`.

        """

        time_dims = [len(x[0]) for x in batch if isinstance(x, (torch.Tensor, collections.Sequence))]

        assert len(time_dims) >= 1, "Unable to determine batch time dimension"

        assert all(x == time_dims[0] for x in time_dims), "Batch time dimension length is ambiguous"

        splits = []

        for t in range(0, time_dims[0], split_size):

            batch_split = []

            for i, x in enumerate(batch):

                if isinstance(x, torch.Tensor):

                    split_x = x[:, t:t + split_size]

                elif isinstance(x, collections.Sequence):

                    split_x = [None] * len(x)

                    for batch_idx in range(len(x)):

                        split_x[batch_idx] = x[batch_idx][t:t + split_size]

                batch_split.append(split_x)

            splits.append(batch_split)

        return splits

teardown

def teardown(
    self,
    stage: str
)

Called at the end of fit and test.

Parameters:

stage: either 'fit' or 'test'
View Source
    def teardown(self, stage: str):

        """

        Called at the end of fit and test.

        Args:

            stage: either 'fit' or 'test'

        """

test_dataloader

def test_dataloader(
    self
)
View Source
    def test_dataloader(self):

        test_loader = DataLoader(

            self.test_dataset,

            batch_size=self.batch_size,

            shuffle=False,

            num_workers=cpu_count(),

        )

        return test_loader

test_end

def test_end(
    self,
    outputs
)

Warnings:

Deprecated in v0.7.0. Use :meth:test_epoch_end instead. Will be removed in 1.0.0.

View Source
    def test_end(self, outputs):

        """

        Warnings:

             Deprecated in v0.7.0. Use :meth:`test_epoch_end` instead.

             Will be removed in 1.0.0.

        """

test_epoch_end

def test_epoch_end(
    self,
    outputs
)
View Source
    def test_epoch_end(self, outputs):

        log_dict = {}

        for metric_name in outputs[0]:

            log_dict[metric_name] = torch.stack([x[metric_name] for x in outputs]).mean()

        return {"log": log_dict, "progress_bar": log_dict, **log_dict}

test_step

def test_step(
    self,
    batch,
    batch_idx
)
View Source
    def test_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("test", output, target)

        output = OrderedDict({"test_loss": loss_val, **metrics_dict})

        return output

test_step_end

def test_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, torch.Tensor]

Use this when testing with dp or ddp2 because :meth:test_step will operate

on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code.

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [test_step(sub_batch) for sub_batch in sub_batches]
test_step_end(batch_parts_outputs)

Parameters:

batch_parts_outputs: What you return in :meth:test_step for each batch part.

Return:

Dict or OrderedDict - passed to the :meth:test_epoch_end.
View Source
    def test_step_end(self, *args, **kwargs) -> Dict[str, Tensor]:

        """

        Use this when testing with dp or ddp2 because :meth:`test_step` will operate

        on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code.

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [test_step(sub_batch) for sub_batch in sub_batches]

            test_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in :meth:`test_step` for each batch part.

        Return:

             Dict or OrderedDict - passed to the :meth:`test_epoch_end`.

        Examples:

            .. code-block:: python

                # WITHOUT test_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def test_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with test_step_end to do softmax over the full batch

                def test_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def test_step_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

tng_dataloader

def tng_dataloader(
    self
)

Warnings:

Deprecated in v0.5.0. Use :meth:train_dataloader instead. Will be removed in 1.0.0.

View Source
    def tng_dataloader(self):  # todo: remove in v1.0.0

        """

        Warnings:

            Deprecated in v0.5.0. Use :meth:`train_dataloader` instead. Will be removed in 1.0.0.

        """

        output = self.train_dataloader()

        rank_zero_warn("`tng_dataloader` has been renamed to `train_dataloader` since v0.5.0."

                       " and this method will be removed in v1.0.0", DeprecationWarning)

        return output

to

def to(
    self,
    *args,
    **kwargs
) -> torch.nn.modules.module.Module

Moves and/or casts the parameters and buffers.

This can be called as

.. function:: to(device=None, dtype=None, non_blocking=False)

.. function:: to(dtype, non_blocking=False)

.. function:: to(tensor, non_blocking=False)

Its signature is similar to :meth:torch.Tensor.to, but only accepts floating point desired :attr:dtype s. In addition, this method will only cast the floating point parameters and buffers to :attr:dtype (if given). The integral parameters and buffers will be moved to :attr:device, if that is given, but with dtypes unchanged. When :attr:non_blocking is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples.

Note: This method modifies the module in-place.

Parameters:

device: the desired device of the parameters and buffers in this module
dtype: the desired floating point type of the floating point parameters and buffers in this module
tensor: Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module

Returns:

Module: self

Example::

>>> class ExampleModule(DeviceDtypeModuleMixin):
...     def __init__(self, weight: torch.Tensor):
...         super().__init__()
...         self.register_buffer('weight', weight)
>>> _ = torch.manual_seed(0)
>>> module = ExampleModule(torch.rand(3, 4))
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]])
>>> module.to(torch.double)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float64)
>>> cpu = torch.device('cpu')
>>> module.to(cpu, dtype=torch.half, non_blocking=True)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float16)
>>> module.to(cpu)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float16)
View Source
    def to(self, *args, **kwargs) -> Module:

        """Moves and/or casts the parameters and buffers.

        This can be called as

        .. function:: to(device=None, dtype=None, non_blocking=False)

        .. function:: to(dtype, non_blocking=False)

        .. function:: to(tensor, non_blocking=False)

        Its signature is similar to :meth:`torch.Tensor.to`, but only accepts

        floating point desired :attr:`dtype` s. In addition, this method will

        only cast the floating point parameters and buffers to :attr:`dtype`

        (if given). The integral parameters and buffers will be moved

        :attr:`device`, if that is given, but with dtypes unchanged. When

        :attr:`non_blocking` is set, it tries to convert/move asynchronously

        with respect to the host if possible, e.g., moving CPU Tensors with

        pinned memory to CUDA devices.

        See below for examples.

        Note:

            This method modifies the module in-place.

        Args:

            device: the desired device of the parameters

                and buffers in this module

            dtype: the desired floating point type of

                the floating point parameters and buffers in this module

            tensor: Tensor whose dtype and device are the desired

                dtype and device for all parameters and buffers in this module

        Returns:

            Module: self

        Example::

            >>> class ExampleModule(DeviceDtypeModuleMixin):

            ...     def __init__(self, weight: torch.Tensor):

            ...         super().__init__()

            ...         self.register_buffer('weight', weight)

            >>> _ = torch.manual_seed(0)

            >>> module = ExampleModule(torch.rand(3, 4))

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]])

            >>> module.to(torch.double)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float64)

            >>> cpu = torch.device('cpu')

            >>> module.to(cpu, dtype=torch.half, non_blocking=True)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float16)

            >>> module.to(cpu)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float16)

        """

        # there is diff nb vars in PT 1.5

        out = torch._C._nn._parse_to(*args, **kwargs)

        device = out[0]

        dtype = out[1]

        if device is not None:

            self._device = device

        if dtype is not None:

            self._dtype = dtype

        return super().to(*args, **kwargs)

train

def train(
    self: ~T,
    mode: bool = True
) -> ~T

Sets the module in training mode.

This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:Dropout, :class:BatchNorm, etc.

Parameters:

Name Type Description Default
mode bool whether to set training mode (True) or evaluation mode (False). Default: True. None

Returns:

Type Description
Module self
View Source
    def train(self: T, mode: bool = True) -> T:

        r"""Sets the module in training mode.

        This has any effect only on certain modules. See documentations of

        particular modules for details of their behaviors in training/evaluation

        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,

        etc.

        Args:

            mode (bool): whether to set training mode (``True``) or evaluation

                         mode (``False``). Default: ``True``.

        Returns:

            Module: self

        """

        self.training = mode

        for module in self.children():

            module.train(mode)

        return self

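As a quick illustration of the mode switch described above, using a plain Dropout layer unrelated to this module:

    import torch
    import torch.nn as nn

    drop = nn.Dropout(p=0.5)
    x = torch.ones(4)

    drop.train()     # training mode: inputs are randomly zeroed and rescaled by 1/(1-p)
    print(drop(x))   # e.g. tensor([2., 0., 2., 0.])

    drop.eval()      # evaluation mode: dropout is a no-op
    print(drop(x))   # tensor([1., 1., 1., 1.])
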
train_dataloader

def train_dataloader(
    self
)
View Source
    def train_dataloader(self):

        train_loader = DataLoader(

            dataset=self.train_dataset,

            batch_size=self.batch_size,

            shuffle=True,

            num_workers=cpu_count(),

        )

        return train_loader

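A minimal smoke test of this loader outside the Trainer could look like the sketch below; the constructor arguments (backbone name, hyperparameters) and the dataset path are placeholders, not recommended settings.

    # illustrative only -- "path/to/NEU-CLS" must be replaced with a real dataset location
    from sagemaker_defect_detection.classifier import DDNClassification

    model = DDNClassification(
        data_path="path/to/NEU-CLS",
        backbone="resnet34",        # placeholder backbone name
        freeze_backbone=True,
        num_classes=6,
        learning_rate=1e-3,
        batch_size=64,
        momentum=0.9,
        weight_decay=1e-4,
        seed=42,
    )
    images, targets = next(iter(model.train_dataloader()))
    print(images.shape, targets.shape)
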
training_end

def training_end(
    self,
    *args,
    **kwargs
)

Warnings:

Deprecated in v0.7.0. Use :meth:training_step_end instead.

View Source
    def training_end(self, *args, **kwargs):

        """

        Warnings:

            Deprecated in v0.7.0. Use  :meth:`training_step_end` instead.

        """

training_epoch_end

def training_epoch_end(
    self,
    outputs: Union[List[Dict[str, torch.Tensor]], List[List[Dict[str, torch.Tensor]]]]
) -> Dict[str, Dict[str, torch.Tensor]]

Called at the end of the training epoch with the outputs of all training steps.

.. code-block:: python

# the pseudocode for these calls
train_outs = []
for train_batch in train_data:
    out = training_step(train_batch)
    train_outs.append(out)
training_epoch_end(train_outs)

Args: outputs: List of outputs you defined in :meth:training_step, or if there are multiple dataloaders, a list containing a list of outputs for each dataloader.

Return: Dict or OrderedDict. May contain the following optional keys:

- log (metrics to be added to the logger; only tensors)
- progress_bar (dict for progress bar display)
- any metric used in a callback (e.g. early stopping).

Note: If this method is not overridden, this won't be called.

  • The outputs here are strictly for logging or progress bar.
  • If you don't need to display anything, don't return anything.
  • If you want to manually set current step, you can specify the 'step' key in the 'log' dict.

Examples: With a single dataloader:

.. code-block:: python

    def training_epoch_end(self, outputs):
        train_acc_mean = 0
        for output in outputs:
            train_acc_mean += output['train_acc']

        train_acc_mean /= len(outputs)

        # log training accuracy at the end of an epoch
        results = {
            'log': {'train_acc': train_acc_mean.item()},
            'progress_bar': {'train_acc': train_acc_mean},
        }
        return results

With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains
one entry per dataloader, while the inner list contains the individual outputs of
each training step for that dataloader.

.. code-block:: python

    def training_epoch_end(self, outputs):
        train_acc_mean = 0
        i = 0
        for dataloader_outputs in outputs:
            for output in dataloader_outputs:
                train_acc_mean += output['train_acc']
                i += 1

        train_acc_mean /= i

        # log training accuracy at the end of an epoch
        results = {
            'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch},
            'progress_bar': {'train_acc': train_acc_mean},
        }
        return results
View Source
    def training_epoch_end(

            self,

            outputs: Union[List[Dict[str, Tensor]], List[List[Dict[str, Tensor]]]]

    ) -> Dict[str, Dict[str, Tensor]]:

        """Called at the end of the training epoch with the outputs of all training steps.

        .. code-block:: python

            # the pseudocode for these calls

            train_outs = []

            for train_batch in train_data:

                out = training_step(train_batch)

                train_outs.append(out)

            training_epoch_end(train_outs)

        Args:

            outputs: List of outputs you defined in :meth:`training_step`, or if there are

                multiple dataloaders, a list containing a list of outputs for each dataloader.

        Return:

            Dict or OrderedDict.

            May contain the following optional keys:

            - log (metrics to be added to the logger; only tensors)

            - progress_bar (dict for progress bar display)

            - any metric used in a callback (e.g. early stopping).

        Note:

            If this method is not overridden, this won't be called.

        - The outputs here are strictly for logging or progress bar.

        - If you don't need to display anything, don't return anything.

        - If you want to manually set current step, you can specify the 'step' key in the 'log' dict.

        Examples:

            With a single dataloader:

            .. code-block:: python

                def training_epoch_end(self, outputs):

                    train_acc_mean = 0

                    for output in outputs:

                        train_acc_mean += output['train_acc']

                    train_acc_mean /= len(outputs)

                    # log training accuracy at the end of an epoch

                    results = {

                        'log': {'train_acc': train_acc_mean.item()},

                        'progress_bar': {'train_acc': train_acc_mean},

                    }

                    return results

            With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains

            one entry per dataloader, while the inner list contains the individual outputs of

            each training step for that dataloader.

            .. code-block:: python

                def training_epoch_end(self, outputs):

                    train_acc_mean = 0

                    i = 0

                    for dataloader_outputs in outputs:

                        for output in dataloader_outputs:

                            train_acc_mean += output['train_acc']

                            i += 1

                    train_acc_mean /= i

                    # log training accuracy at the end of an epoch

                    results = {

                        'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch},

                        'progress_bar': {'train_acc': train_acc_mean},

                    }

                    return results

        """

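For this classifier, whose training_step (shown below) returns a "log" dict of per-batch metrics, an epoch-end override following the pattern above could look like this sketch (not part of the module):

    import torch

    def training_epoch_end(self, outputs):
        # each `output` is the OrderedDict returned by training_step below;
        # its "log" entry holds the per-batch "train_acc" tensor
        train_acc_mean = torch.stack([o["log"]["train_acc"] for o in outputs]).mean()
        return {
            "log": {"train_acc_epoch": train_acc_mean},
            "progress_bar": {"train_acc_epoch": train_acc_mean},
        }
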
training_step

def training_step(
    self,
    batch,
    batch_idx
)
View Source
    def training_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("train", output, target)

        tqdm_dict = {"train_loss": loss_val, **metrics_dict}

        output = OrderedDict({"loss": loss_val, "progress_bar": tqdm_dict, "log": tqdm_dict})

        return output

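For quick debugging outside the Trainer, the step can be driven by hand on one batch (reusing the hypothetical `model` constructed in the train_dataloader example above):

    batch = next(iter(model.train_dataloader()))
    out = model.training_step(batch, batch_idx=0)
    print(out["loss"], out["log"]["train_acc"])
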
training_step_end

def training_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]

Use this when training with dp or ddp2 because :meth:training_step will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches]
training_step_end(batch_parts_outputs)

Args: batch_parts_outputs: What you return in training_step for each batch part.

Return: Dict with loss key and optional log or progress bar keys.

- loss -> tensor scalar **REQUIRED**
- progress_bar -> Dict for progress bar display. Must have only tensors
- log -> Dict of metrics to add to logger. Must have only tensors (no images, etc)

Examples: .. code-block:: python

    # WITHOUT training_step_end
    # if used in DP or DDP2, this batch is 1/num_gpus large
    def training_step(self, batch, batch_idx):
        # batch is 1/num_gpus big
        x, y = batch

        out = self(x)
        loss = self.softmax(out)
        loss = nce_loss(loss)
        return {'loss': loss}

    # --------------
    # with training_step_end to do softmax over the full batch
    def training_step(self, batch, batch_idx):
        # batch is 1/num_gpus big
        x, y = batch

        out = self(x)
        return {'out': out}

    def training_step_end(self, outputs):
        # this out is now the full size of the batch
        out = outputs['out']

        # this softmax now uses the full batch size
        loss = self.softmax(out)
        loss = nce_loss(loss)
        return {'loss': loss}

See Also: See the :ref:multi-gpu-training guide for more details.

View Source
    def training_step_end(self, *args, **kwargs) -> Dict[

        str, Union[Tensor, Dict[str, Tensor]]

    ]:

        """

        Use this when training with dp or ddp2 because :meth:`training_step`

        will operate on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches]

            training_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in `training_step` for each batch part.

        Return:

            Dict with loss key and optional log or progress bar keys.

            - loss -> tensor scalar **REQUIRED**

            - progress_bar -> Dict for progress bar display. Must have only tensors

            - log -> Dict of metrics to add to logger. Must have only tensors (no images, etc)

        Examples:

            .. code-block:: python

                # WITHOUT training_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def training_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with training_step_end to do softmax over the full batch

                def training_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def training_step_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

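Applied to this classifier, a dp/ddp2-friendly variant could return raw logits and targets from training_step and defer the cross-entropy to training_step_end, so the loss sees the full batch. A sketch under that assumption (the subclass name is hypothetical, not part of the module):

    from collections import OrderedDict
    import torch.nn.functional as F

    class DDNClassificationDP(DDNClassification):  # hypothetical dp/ddp2 variant
        def training_step(self, batch, batch_idx):
            images, target = batch
            # return per-device logits; they are gathered before training_step_end
            return {"logits": self(images), "target": target}

        def training_step_end(self, outputs):
            # logits/target now cover the full batch
            loss = F.cross_entropy(outputs["logits"], outputs["target"])
            return OrderedDict({"loss": loss})
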
transfer_batch_to_device

def transfer_batch_to_device(
    self,
    batch: Any,
    device: torch.device
) -> Any

Override this hook if your :class:~torch.utils.data.DataLoader returns tensors wrapped in a custom data structure.

The data types listed below (and any arbitrary nesting of them) are supported out of the box:

  • :class:torch.Tensor or anything that implements .to(...)
  • :class:list
  • :class:dict
  • :class:tuple
  • :class:torchtext.data.batch.Batch

For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

Example::

def transfer_batch_to_device(self, batch, device):
    if isinstance(batch, CustomBatch):
        # move all tensors in your custom data structure to the device
        batch.samples = batch.samples.to(device)
        batch.targets = batch.targets.to(device)
    else:
        batch = super().transfer_batch_to_device(batch, device)
    return batch

Parameters:

Name Type Description Default
batch None A batch of data that needs to be transferred to a new device. None
device None The target device as defined in PyTorch. None

Returns:

Type Description
None A reference to the data on the new device.

Note: This hook should only transfer the data and not modify it, nor should it move the data to any other device than the one passed in as argument (unless you know what you are doing). The :class:~pytorch_lightning.trainer.trainer.Trainer already takes care of splitting the batch and determines the target devices.

See Also:

  • :func:~pytorch_lightning.utilities.apply_func.move_data_to_device
  • :func:~pytorch_lightning.utilities.apply_func.apply_to_collection

View Source
    def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any:

        """

        Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors

        wrapped in a custom data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor` or anything that implements `.to(...)`

        - :class:`list`

        - :class:`dict`

        - :class:`tuple`

        - :class:`torchtext.data.batch.Batch`

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Example::

            def transfer_batch_to_device(self, batch, device):

                if isinstance(batch, CustomBatch):

                    # move all tensors in your custom data structure to the device

                    batch.samples = batch.samples.to(device)

                    batch.targets = batch.targets.to(device)

                else:

                    batch = super().transfer_batch_to_device(batch, device)

                return batch

        Args:

            batch: A batch of data that needs to be transferred to a new device.

            device: The target device as defined in PyTorch.

        Returns:

            A reference to the data on the new device.

        Note:

            This hook should only transfer the data and not modify it, nor should it move the data to

            any other device than the one passed in as argument (unless you know what you are doing).

            The :class:`~pytorch_lightning.trainer.trainer.Trainer` already takes care of splitting the

            batch and determines the target devices.

        See Also:

            - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`

            - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`

        """

        return move_data_to_device(batch, device)

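A concrete shape for the CustomBatch used in the example above might be a small dataclass; everything here is hypothetical and only meant to make the hook runnable:

    from dataclasses import dataclass
    import torch
    import pytorch_lightning as pl

    @dataclass
    class CustomBatch:  # hypothetical container produced by a custom collate_fn
        samples: torch.Tensor
        targets: torch.Tensor

    class MyModule(pl.LightningModule):  # hypothetical module overriding the hook
        def transfer_batch_to_device(self, batch, device):
            if isinstance(batch, CustomBatch):
                # move all tensors in the custom structure to the target device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            else:
                batch = super().transfer_batch_to_device(batch, device)
            return batch
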
type

def type(
    self,
    dst_type: Union[str, torch.dtype]
) -> torch.nn.modules.module.Module

Casts all parameters and buffers to :attr:dst_type.

Parameters:

Name Type Description Default
dst_type type or string the desired type None

Returns:

Type Description
Module self
View Source
    def type(self, dst_type: Union[str, torch.dtype]) -> Module:

        """Casts all parameters and buffers to :attr:`dst_type`.

        Arguments:

            dst_type (type or string): the desired type

        Returns:

            Module: self

        """

        self._dtype = dst_type

        return super().type(dst_type=dst_type)

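A tiny illustration of the cast described above (type() converts all parameters and buffers to the requested dtype):

    import torch

    layer = torch.nn.Linear(4, 2)
    layer.type(torch.float64)
    print(layer.weight.dtype)   # torch.float64
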
unfreeze

def unfreeze(
    self
) -> None

Unfreeze all parameters for training.

.. code-block:: python

model = MyLightningModule(...)
model.unfreeze()
View Source
    def unfreeze(self) -> None:

        """

        Unfreeze all parameters for training.

        .. code-block:: python

            model = MyLightningModule(...)

            model.unfreeze()

        """

        for param in self.parameters():

            param.requires_grad = True

        self.train()

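In this module the backbone can be frozen at construction time (freeze_backbone=True); calling unfreeze() later re-enables gradients for every parameter, which is one way to stage fine-tuning. A sketch, reusing the hypothetical `model` from the earlier example:

    # stage 1: train the head with the backbone frozen (freeze_backbone=True at construction)
    # stage 2: unfreeze everything and continue, typically with a smaller learning rate
    model.unfreeze()
    assert all(p.requires_grad for p in model.parameters())
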
val_dataloader

def val_dataloader(
    self
)
View Source
    def val_dataloader(self):

        val_loader = DataLoader(

            self.val_dataset,

            batch_size=self.batch_size,

            shuffle=False,

            num_workers=cpu_count() // 2,

        )

        return val_loader

validation_end

def validation_end(
    self,
    outputs
)

Warnings:

Deprecated in v0.7.0. Use :meth:validation_epoch_end instead. Will be removed in 1.0.0.

View Source
    def validation_end(self, outputs):

        """

        Warnings:

            Deprecated in v0.7.0. Use :meth:`validation_epoch_end` instead.

            Will be removed in 1.0.0.

        """

validation_epoch_end

def validation_epoch_end(
    self,
    outputs
)
View Source
    def validation_epoch_end(self, outputs):

        log_dict = {}

        for metric_name in outputs[0]:

            log_dict[metric_name] = torch.stack([x[metric_name] for x in outputs]).mean()

        return {"log": log_dict, "progress_bar": log_dict, **log_dict}

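The aggregation above simply averages each metric across batches; with two fabricated validation-step outputs (illustrative values and keys only, reusing the hypothetical `model` from earlier):

    import torch

    outputs = [
        {"val_loss": torch.tensor(0.8), "val_acc": torch.tensor(0.70)},
        {"val_loss": torch.tensor(0.6), "val_acc": torch.tensor(0.80)},
    ]
    result = model.validation_epoch_end(outputs)
    print(result["log"])   # {'val_loss': tensor(0.7000), 'val_acc': tensor(0.7500)}
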
validation_step

def validation_step(
    self,
    batch,
    batch_idx
)
View Source
    def validation_step(self, batch, batch_idx):

        images, target = batch

        output = self(images)

        loss_val = F.cross_entropy(output, target)

        metrics_dict = metrics("val", output, target)

        output = OrderedDict({"val_loss": loss_val, **metrics_dict})

        return output

validation_step_end

def validation_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, torch.Tensor]

Use this when validating with dp or ddp2 because :meth:validation_step will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code.

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [validation_step(sub_batch) for sub_batch in sub_batches]
validation_step_end(batch_parts_outputs)

Parameters:

Name Type Description Default
batch_parts_outputs None What you return in :meth:validation_step for each batch part. None

Return: Dict or OrderedDict - passed to the :meth:validation_epoch_end method.

View Source
    def validation_step_end(self, *args, **kwargs) -> Dict[str, Tensor]:

        """

        Use this when validating with dp or ddp2 because :meth:`validation_step`

        will operate on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code.

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [validation_step(sub_batch) for sub_batch in sub_batches]

            validation_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in :meth:`validation_step`

                for each batch part.

        Return:

           Dict or OrderedDict - passed to the :meth:`validation_epoch_end` method.

        Examples:

            .. code-block:: python

                # WITHOUT validation_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def validation_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with validation_step_end to do softmax over the full batch

                def validation_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def validation_step_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

zero_grad

def zero_grad(
    self
) -> None

Sets gradients of all model parameters to zero.

View Source
    def zero_grad(self) -> None:

        r"""Sets gradients of all model parameters to zero."""

        if getattr(self, '_is_replica', False):

            warnings.warn(

                "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. "

                "The parameters are copied (in a differentiable manner) from the original module. "

                "This means they are not leaf nodes in autograd and so don't accumulate gradients. "

                "If you need gradients in your forward method, consider using autograd.grad instead.")

        for p in self.parameters():

            if p.grad is not None:

                p.grad.detach_()

                p.grad.zero_()
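
A minimal manual loop showing where zero_grad() fits (illustrative; the Lightning Trainer normally drives this for the module documented here):

    import torch

    linear = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(linear.parameters(), lr=0.1)

    for _ in range(3):
        linear.zero_grad()                       # clear gradients accumulated by the last backward()
        loss = linear(torch.randn(8, 4)).mean()
        loss.backward()
        opt.step()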